@@ -32,12 +32,12 @@
|
|
|
#include "locking.h"
|
|
|
#include "free-space-cache.h"
|
|
|
|
|
|
-static int update_reserved_extents(struct btrfs_root *root,
|
|
|
- u64 bytenr, u64 num, int reserve);
|
|
|
static int update_block_group(struct btrfs_trans_handle *trans,
|
|
|
struct btrfs_root *root,
|
|
|
u64 bytenr, u64 num_bytes, int alloc,
|
|
|
int mark_free);
|
|
|
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
|
|
|
+ u64 num_bytes, int reserve);
|
|
|
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
|
|
|
struct btrfs_root *root,
|
|
|
u64 bytenr, u64 num_bytes, u64 parent,
|
|
@@ -57,10 +57,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
|
|
|
u64 parent, u64 root_objectid,
|
|
|
u64 flags, struct btrfs_disk_key *key,
|
|
|
int level, struct btrfs_key *ins);
|
|
|
-
|
|
|
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
|
|
|
struct btrfs_root *extent_root, u64 alloc_bytes,
|
|
|
u64 flags, int force);
|
|
|
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
|
|
|
+ struct btrfs_root *root,
|
|
|
+ struct btrfs_path *path,
|
|
|
+ u64 bytenr, u64 num_bytes,
|
|
|
+ int is_data, int reserved,
|
|
|
+ struct extent_buffer **must_clean);
|
|
|
+static int find_next_key(struct btrfs_path *path, int level,
|
|
|
+ struct btrfs_key *key);
|
|
|
|
|
|
static noinline int
|
|
|
block_group_cache_done(struct btrfs_block_group_cache *cache)
|
|
@@ -153,34 +160,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * We always set EXTENT_LOCKED for the super mirror extents so we don't
|
|
|
- * overwrite them, so those bits need to be unset. Also, if we are unmounting
|
|
|
- * with pinned extents still sitting there because we had a block group caching,
|
|
|
- * we need to clear those now, since we are done.
|
|
|
- */
|
|
|
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
|
|
|
+static int add_excluded_extent(struct btrfs_root *root,
|
|
|
+ u64 start, u64 num_bytes)
|
|
|
{
|
|
|
- u64 start, end, last = 0;
|
|
|
- int ret;
|
|
|
+ u64 end = start + num_bytes - 1;
|
|
|
+ set_extent_bits(&root->fs_info->freed_extents[0],
|
|
|
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
|
|
|
+ set_extent_bits(&root->fs_info->freed_extents[1],
|
|
|
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
- while (1) {
|
|
|
- ret = find_first_extent_bit(&info->pinned_extents, last,
|
|
|
- &start, &end,
|
|
|
- EXTENT_LOCKED|EXTENT_DIRTY);
|
|
|
- if (ret)
|
|
|
- break;
|
|
|
+static void free_excluded_extents(struct btrfs_root *root,
|
|
|
+ struct btrfs_block_group_cache *cache)
|
|
|
+{
|
|
|
+ u64 start, end;
|
|
|
|
|
|
- clear_extent_bits(&info->pinned_extents, start, end,
|
|
|
- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
|
|
|
- last = end+1;
|
|
|
- }
|
|
|
+ start = cache->key.objectid;
|
|
|
+ end = start + cache->key.offset - 1;
|
|
|
+
|
|
|
+ clear_extent_bits(&root->fs_info->freed_extents[0],
|
|
|
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
|
|
|
+ clear_extent_bits(&root->fs_info->freed_extents[1],
|
|
|
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
|
|
|
}
|
|
|
|
|
|
-static int remove_sb_from_cache(struct btrfs_root *root,
|
|
|
- struct btrfs_block_group_cache *cache)
|
|
|
+static int exclude_super_stripes(struct btrfs_root *root,
|
|
|
+ struct btrfs_block_group_cache *cache)
|
|
|
{
|
|
|
- struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
u64 bytenr;
|
|
|
u64 *logical;
|
|
|
int stripe_len;
|
|
@@ -192,17 +199,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
|
|
|
cache->key.objectid, bytenr,
|
|
|
0, &logical, &nr, &stripe_len);
|
|
|
BUG_ON(ret);
|
|
|
+
|
|
|
while (nr--) {
|
|
|
- try_lock_extent(&fs_info->pinned_extents,
|
|
|
- logical[nr],
|
|
|
- logical[nr] + stripe_len - 1, GFP_NOFS);
|
|
|
+ cache->bytes_super += stripe_len;
|
|
|
+ ret = add_excluded_extent(root, logical[nr],
|
|
|
+ stripe_len);
|
|
|
+ BUG_ON(ret);
|
|
|
}
|
|
|
+
|
|
|
kfree(logical);
|
|
|
}
|
|
|
-
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+static struct btrfs_caching_control *
|
|
|
+get_caching_control(struct btrfs_block_group_cache *cache)
|
|
|
+{
|
|
|
+ struct btrfs_caching_control *ctl;
|
|
|
+
|
|
|
+ spin_lock(&cache->lock);
|
|
|
+ if (cache->cached != BTRFS_CACHE_STARTED) {
|
|
|
+ spin_unlock(&cache->lock);
|
|
|
+ return NULL;
|
|
|
+ }
|
|
|
+
|
|
|
+ ctl = cache->caching_ctl;
|
|
|
+ atomic_inc(&ctl->count);
|
|
|
+ spin_unlock(&cache->lock);
|
|
|
+ return ctl;
|
|
|
+}
|
|
|
+
|
|
|
+static void put_caching_control(struct btrfs_caching_control *ctl)
|
|
|
+{
|
|
|
+ if (atomic_dec_and_test(&ctl->count))
|
|
|
+ kfree(ctl);
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
* this is only called by cache_block_group, since we could have freed extents
|
|
|
* we need to check the pinned_extents for any extents that can't be used yet
|
|
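The get_caching_control()/put_caching_control() pair added above is a plain reference count around the caching state: a reference can only be taken while the block group is still in BTRFS_CACHE_STARTED, and the last put frees the structure. A minimal userspace sketch of the same pattern, with invented stand-in types (a pthread mutex in place of cache->lock, C11 atomics in place of the kernel atomic_t):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

/* invented stand-ins for the kernel structures */
struct caching_ctl {
	atomic_int count;
	/* progress, mutex, wait queue ... */
};

struct block_group {
	pthread_mutex_t lock;     /* models cache->lock */
	int cached;               /* models cache->cached */
	struct caching_ctl *ctl;  /* models cache->caching_ctl */
};

#define CACHE_STARTED 1

/* a reference may only be taken while caching is in flight */
static struct caching_ctl *get_ctl(struct block_group *bg)
{
	struct caching_ctl *ctl = NULL;

	pthread_mutex_lock(&bg->lock);
	if (bg->cached == CACHE_STARTED) {
		ctl = bg->ctl;
		atomic_fetch_add(&ctl->count, 1);
	}
	pthread_mutex_unlock(&bg->lock);
	return ctl;
}

/* the last holder frees the control */
static void put_ctl(struct caching_ctl *ctl)
{
	if (atomic_fetch_sub(&ctl->count, 1) == 1)
		free(ctl);
}

Taking the reference under the same lock that publishes cache->caching_ctl is what makes it safe for waiters to sleep on the control after the lock is dropped.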
@@ -215,9 +247,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
|
|
|
int ret;
|
|
|
|
|
|
while (start < end) {
|
|
|
- ret = find_first_extent_bit(&info->pinned_extents, start,
|
|
|
+ ret = find_first_extent_bit(info->pinned_extents, start,
|
|
|
&extent_start, &extent_end,
|
|
|
- EXTENT_DIRTY|EXTENT_LOCKED);
|
|
|
+ EXTENT_DIRTY | EXTENT_UPTODATE);
|
|
|
if (ret)
|
|
|
break;
|
|
|
|
|
@@ -249,22 +281,27 @@ static int caching_kthread(void *data)
|
|
|
{
|
|
|
struct btrfs_block_group_cache *block_group = data;
|
|
|
struct btrfs_fs_info *fs_info = block_group->fs_info;
|
|
|
- u64 last = 0;
|
|
|
+ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
|
|
|
+ struct btrfs_root *extent_root = fs_info->extent_root;
|
|
|
struct btrfs_path *path;
|
|
|
- int ret = 0;
|
|
|
- struct btrfs_key key;
|
|
|
struct extent_buffer *leaf;
|
|
|
- int slot;
|
|
|
+ struct btrfs_key key;
|
|
|
u64 total_found = 0;
|
|
|
-
|
|
|
- BUG_ON(!fs_info);
|
|
|
+ u64 last = 0;
|
|
|
+ u32 nritems;
|
|
|
+ int ret = 0;
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
if (!path)
|
|
|
return -ENOMEM;
|
|
|
|
|
|
- atomic_inc(&block_group->space_info->caching_threads);
|
|
|
+ exclude_super_stripes(extent_root, block_group);
|
|
|
+ spin_lock(&block_group->space_info->lock);
|
|
|
+ block_group->space_info->bytes_super += block_group->bytes_super;
|
|
|
+ spin_unlock(&block_group->space_info->lock);
|
|
|
+
|
|
|
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
|
|
|
+
|
|
|
/*
|
|
|
* We don't want to deadlock with somebody trying to allocate a new
|
|
|
* extent for the extent root while also trying to search the extent
|
|
@@ -277,74 +314,64 @@ static int caching_kthread(void *data)
|
|
|
|
|
|
key.objectid = last;
|
|
|
key.offset = 0;
|
|
|
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
|
|
|
+ key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
again:
|
|
|
+ mutex_lock(&caching_ctl->mutex);
|
|
|
/* need to make sure the commit_root doesn't disappear */
|
|
|
down_read(&fs_info->extent_commit_sem);
|
|
|
|
|
|
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
|
|
|
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
|
|
if (ret < 0)
|
|
|
goto err;
|
|
|
|
|
|
+ leaf = path->nodes[0];
|
|
|
+ nritems = btrfs_header_nritems(leaf);
|
|
|
+
|
|
|
while (1) {
|
|
|
smp_mb();
|
|
|
- if (block_group->fs_info->closing > 1) {
|
|
|
+ if (fs_info->closing > 1) {
|
|
|
last = (u64)-1;
|
|
|
break;
|
|
|
}
|
|
|
|
|
|
- leaf = path->nodes[0];
|
|
|
- slot = path->slots[0];
|
|
|
- if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
- ret = btrfs_next_leaf(fs_info->extent_root, path);
|
|
|
- if (ret < 0)
|
|
|
- goto err;
|
|
|
- else if (ret)
|
|
|
+ if (path->slots[0] < nritems) {
|
|
|
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
+ } else {
|
|
|
+ ret = find_next_key(path, 0, &key);
|
|
|
+ if (ret)
|
|
|
break;
|
|
|
|
|
|
- if (need_resched() ||
|
|
|
- btrfs_transaction_in_commit(fs_info)) {
|
|
|
- leaf = path->nodes[0];
|
|
|
-
|
|
|
- /* this shouldn't happen, but if the
|
|
|
- * leaf is empty just move on.
|
|
|
- */
|
|
|
- if (btrfs_header_nritems(leaf) == 0)
|
|
|
- break;
|
|
|
- /*
|
|
|
- * we need to copy the key out so that
|
|
|
- * we are sure the next search advances
|
|
|
- * us forward in the btree.
|
|
|
- */
|
|
|
- btrfs_item_key_to_cpu(leaf, &key, 0);
|
|
|
- btrfs_release_path(fs_info->extent_root, path);
|
|
|
- up_read(&fs_info->extent_commit_sem);
|
|
|
+ caching_ctl->progress = last;
|
|
|
+ btrfs_release_path(extent_root, path);
|
|
|
+ up_read(&fs_info->extent_commit_sem);
|
|
|
+ mutex_unlock(&caching_ctl->mutex);
|
|
|
+ if (btrfs_transaction_in_commit(fs_info))
|
|
|
schedule_timeout(1);
|
|
|
- goto again;
|
|
|
- }
|
|
|
+ else
|
|
|
+ cond_resched();
|
|
|
+ goto again;
|
|
|
+ }
|
|
|
|
|
|
+ if (key.objectid < block_group->key.objectid) {
|
|
|
+ path->slots[0]++;
|
|
|
continue;
|
|
|
}
|
|
|
- btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
- if (key.objectid < block_group->key.objectid)
|
|
|
- goto next;
|
|
|
|
|
|
if (key.objectid >= block_group->key.objectid +
|
|
|
block_group->key.offset)
|
|
|
break;
|
|
|
|
|
|
- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
|
|
|
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
|
|
|
total_found += add_new_free_space(block_group,
|
|
|
fs_info, last,
|
|
|
key.objectid);
|
|
|
last = key.objectid + key.offset;
|
|
|
- }
|
|
|
|
|
|
- if (total_found > (1024 * 1024 * 2)) {
|
|
|
- total_found = 0;
|
|
|
- wake_up(&block_group->caching_q);
|
|
|
+ if (total_found > (1024 * 1024 * 2)) {
|
|
|
+ total_found = 0;
|
|
|
+ wake_up(&caching_ctl->wait);
|
|
|
+ }
|
|
|
}
|
|
|
-next:
|
|
|
path->slots[0]++;
|
|
|
}
|
|
|
ret = 0;
|
|
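The rewritten loop above never holds caching_ctl->mutex or the extent_commit_sem across a reschedule: it stores the next key in caching_ctl->progress, releases everything, yields, and re-searches from that key. A rough sketch of that save-and-resume cursor (the sorted array below is only a stand-in for the extent tree search):

/* resumable scan: remember the next key, drop every lock, and
 * re-search from that key after yielding (illustration only) */
struct cursor { unsigned long next_key; };

static void scan_batch(struct cursor *cur, const unsigned long *keys,
		       unsigned long nr, unsigned long budget)
{
	unsigned long i = 0;

	/* "search": find the first key >= the saved progress */
	while (i < nr && keys[i] < cur->next_key)
		i++;

	while (i < nr && budget--) {
		/* process keys[i] ... */
		cur->next_key = keys[i] + 1;  /* survives a restart */
		i++;
	}
}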
@@ -352,33 +379,65 @@ next:
|
|
|
total_found += add_new_free_space(block_group, fs_info, last,
|
|
|
block_group->key.objectid +
|
|
|
block_group->key.offset);
|
|
|
+ caching_ctl->progress = (u64)-1;
|
|
|
|
|
|
spin_lock(&block_group->lock);
|
|
|
+ block_group->caching_ctl = NULL;
|
|
|
block_group->cached = BTRFS_CACHE_FINISHED;
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
|
|
err:
|
|
|
btrfs_free_path(path);
|
|
|
up_read(&fs_info->extent_commit_sem);
|
|
|
- atomic_dec(&block_group->space_info->caching_threads);
|
|
|
- wake_up(&block_group->caching_q);
|
|
|
|
|
|
+ free_excluded_extents(extent_root, block_group);
|
|
|
+
|
|
|
+ mutex_unlock(&caching_ctl->mutex);
|
|
|
+ wake_up(&caching_ctl->wait);
|
|
|
+
|
|
|
+ put_caching_control(caching_ctl);
|
|
|
+ atomic_dec(&block_group->space_info->caching_threads);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
static int cache_block_group(struct btrfs_block_group_cache *cache)
|
|
|
{
|
|
|
+ struct btrfs_fs_info *fs_info = cache->fs_info;
|
|
|
+ struct btrfs_caching_control *caching_ctl;
|
|
|
struct task_struct *tsk;
|
|
|
int ret = 0;
|
|
|
|
|
|
+ smp_mb();
|
|
|
+ if (cache->cached != BTRFS_CACHE_NO)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
|
|
|
+ BUG_ON(!caching_ctl);
|
|
|
+
|
|
|
+ INIT_LIST_HEAD(&caching_ctl->list);
|
|
|
+ mutex_init(&caching_ctl->mutex);
|
|
|
+ init_waitqueue_head(&caching_ctl->wait);
|
|
|
+ caching_ctl->block_group = cache;
|
|
|
+ caching_ctl->progress = cache->key.objectid;
|
|
|
+ /* one for caching kthread, one for caching block group list */
|
|
|
+ atomic_set(&caching_ctl->count, 2);
|
|
|
+
|
|
|
spin_lock(&cache->lock);
|
|
|
if (cache->cached != BTRFS_CACHE_NO) {
|
|
|
spin_unlock(&cache->lock);
|
|
|
- return ret;
|
|
|
+ kfree(caching_ctl);
|
|
|
+ return 0;
|
|
|
}
|
|
|
+ cache->caching_ctl = caching_ctl;
|
|
|
cache->cached = BTRFS_CACHE_STARTED;
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
+ down_write(&fs_info->extent_commit_sem);
|
|
|
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
|
|
|
+ up_write(&fs_info->extent_commit_sem);
|
|
|
+
|
|
|
+ atomic_inc(&cache->space_info->caching_threads);
|
|
|
+
|
|
|
tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
|
|
|
cache->key.objectid);
|
|
|
if (IS_ERR(tsk)) {
|
|
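cache_block_group() now follows a check / allocate / re-check-under-lock pattern so that two callers racing to start caching do not leak the control or start two threads. A simplified sketch of just that pattern, with made-up types and without the smp_mb(), list, and semaphore handling:

#include <pthread.h>
#include <stdlib.h>

struct group {
	pthread_mutex_t lock;
	int caching_started;
	void *ctl;
};

/* allocate outside the lock, re-check under it, undo on a lost race */
static int start_caching(struct group *g)
{
	void *ctl;

	if (g->caching_started)        /* cheap unlocked check */
		return 0;

	ctl = calloc(1, 128);          /* stands in for the caching control */
	if (!ctl)
		return -1;

	pthread_mutex_lock(&g->lock);
	if (g->caching_started) {      /* somebody else won the race */
		pthread_mutex_unlock(&g->lock);
		free(ctl);
		return 0;
	}
	g->ctl = ctl;
	g->caching_started = 1;
	pthread_mutex_unlock(&g->lock);

	/* ... add to caching_block_groups and kick off the kthread ... */
	return 0;
}

The allocation is done before taking the lock so the lock is never held across the allocation; the loser of the race simply frees its copy.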
@@ -1657,7 +1716,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
|
|
|
parent, ref_root, flags,
|
|
|
ref->objectid, ref->offset,
|
|
|
&ins, node->ref_mod);
|
|
|
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
|
|
|
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
|
|
|
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
|
|
|
node->num_bytes, parent,
|
|
@@ -1783,7 +1841,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
|
|
|
extent_op->flags_to_set,
|
|
|
&extent_op->key,
|
|
|
ref->level, &ins);
|
|
|
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
|
|
|
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
|
|
|
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
|
|
|
node->num_bytes, parent, ref_root,
|
|
@@ -1818,16 +1875,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
|
|
|
BUG_ON(extent_op);
|
|
|
head = btrfs_delayed_node_to_head(node);
|
|
|
if (insert_reserved) {
|
|
|
+ int mark_free = 0;
|
|
|
+ struct extent_buffer *must_clean = NULL;
|
|
|
+
|
|
|
+ ret = pin_down_bytes(trans, root, NULL,
|
|
|
+ node->bytenr, node->num_bytes,
|
|
|
+ head->is_data, 1, &must_clean);
|
|
|
+ if (ret > 0)
|
|
|
+ mark_free = 1;
|
|
|
+
|
|
|
+ if (must_clean) {
|
|
|
+ clean_tree_block(NULL, root, must_clean);
|
|
|
+ btrfs_tree_unlock(must_clean);
|
|
|
+ free_extent_buffer(must_clean);
|
|
|
+ }
|
|
|
if (head->is_data) {
|
|
|
ret = btrfs_del_csums(trans, root,
|
|
|
node->bytenr,
|
|
|
node->num_bytes);
|
|
|
BUG_ON(ret);
|
|
|
}
|
|
|
- btrfs_update_pinned_extents(root, node->bytenr,
|
|
|
- node->num_bytes, 1);
|
|
|
- update_reserved_extents(root, node->bytenr,
|
|
|
- node->num_bytes, 0);
|
|
|
+ if (mark_free) {
|
|
|
+ ret = btrfs_free_reserved_extent(root,
|
|
|
+ node->bytenr,
|
|
|
+ node->num_bytes);
|
|
|
+ BUG_ON(ret);
|
|
|
+ }
|
|
|
}
|
|
|
mutex_unlock(&head->mutex);
|
|
|
return 0;
|
|
@@ -2706,6 +2779,8 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
|
|
|
/* get the space info for where the metadata will live */
|
|
|
alloc_target = btrfs_get_alloc_profile(root, 0);
|
|
|
meta_sinfo = __find_space_info(info, alloc_target);
|
|
|
+ if (!meta_sinfo)
|
|
|
+ goto alloc;
|
|
|
|
|
|
again:
|
|
|
spin_lock(&meta_sinfo->lock);
|
|
@@ -2717,12 +2792,13 @@ again:
|
|
|
do_div(thresh, 100);
|
|
|
|
|
|
if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
|
|
|
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
|
|
|
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
|
|
|
+ meta_sinfo->bytes_super > thresh) {
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
if (!meta_sinfo->full) {
|
|
|
meta_sinfo->force_alloc = 1;
|
|
|
spin_unlock(&meta_sinfo->lock);
|
|
|
-
|
|
|
+alloc:
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
if (!trans)
|
|
|
return -ENOMEM;
|
|
@@ -2730,6 +2806,10 @@ again:
|
|
|
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
|
|
|
2 * 1024 * 1024, alloc_target, 0);
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
+ if (!meta_sinfo) {
|
|
|
+ meta_sinfo = __find_space_info(info,
|
|
|
+ alloc_target);
|
|
|
+ }
|
|
|
goto again;
|
|
|
}
|
|
|
spin_unlock(&meta_sinfo->lock);
|
|
@@ -2765,13 +2845,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
|
|
|
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
|
|
|
|
|
|
data_sinfo = BTRFS_I(inode)->space_info;
|
|
|
+ if (!data_sinfo)
|
|
|
+ goto alloc;
|
|
|
+
|
|
|
again:
|
|
|
/* make sure we have enough space to handle the data first */
|
|
|
spin_lock(&data_sinfo->lock);
|
|
|
if (data_sinfo->total_bytes - data_sinfo->bytes_used -
|
|
|
data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
|
|
|
data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
|
|
|
- data_sinfo->bytes_may_use < bytes) {
|
|
|
+ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
|
/*
|
|
@@ -2783,7 +2866,7 @@ again:
|
|
|
|
|
|
data_sinfo->force_alloc = 1;
|
|
|
spin_unlock(&data_sinfo->lock);
|
|
|
-
|
|
|
+alloc:
|
|
|
alloc_target = btrfs_get_alloc_profile(root, 1);
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
if (!trans)
|
|
@@ -2795,6 +2878,11 @@ again:
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
if (ret)
|
|
|
return ret;
|
|
|
+
|
|
|
+ if (!data_sinfo) {
|
|
|
+ btrfs_set_inode_space_info(root, inode);
|
|
|
+ data_sinfo = BTRFS_I(inode)->space_info;
|
|
|
+ }
|
|
|
goto again;
|
|
|
}
|
|
|
spin_unlock(&data_sinfo->lock);
|
|
@@ -3009,10 +3097,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
|
|
|
num_bytes = min(total, cache->key.offset - byte_in_group);
|
|
|
if (alloc) {
|
|
|
old_val += num_bytes;
|
|
|
+ btrfs_set_block_group_used(&cache->item, old_val);
|
|
|
+ cache->reserved -= num_bytes;
|
|
|
cache->space_info->bytes_used += num_bytes;
|
|
|
+ cache->space_info->bytes_reserved -= num_bytes;
|
|
|
if (cache->ro)
|
|
|
cache->space_info->bytes_readonly -= num_bytes;
|
|
|
- btrfs_set_block_group_used(&cache->item, old_val);
|
|
|
spin_unlock(&cache->lock);
|
|
|
spin_unlock(&cache->space_info->lock);
|
|
|
} else {
|
|
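The change to update_block_group() makes allocation an accounting move from "reserved" to "used", mirrored in both the block group and its space_info. In isolation the bookkeeping looks like this (field names below are illustrative, not the kernel structs):

/* allocation converts previously reserved bytes into used bytes,
 * in both the block group and its space_info */
struct counters { unsigned long long used, reserved; };

static void account_alloc(struct counters *bg, struct counters *sinfo,
			  unsigned long long bytes)
{
	bg->used        += bytes;
	bg->reserved    -= bytes;
	sinfo->used     += bytes;
	sinfo->reserved -= bytes;
}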
@@ -3057,127 +3147,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
|
|
|
return bytenr;
|
|
|
}
|
|
|
|
|
|
-int btrfs_update_pinned_extents(struct btrfs_root *root,
|
|
|
- u64 bytenr, u64 num, int pin)
|
|
|
+/*
|
|
|
+ * this function must be called within transaction
|
|
|
+ */
|
|
|
+int btrfs_pin_extent(struct btrfs_root *root,
|
|
|
+ u64 bytenr, u64 num_bytes, int reserved)
|
|
|
{
|
|
|
- u64 len;
|
|
|
- struct btrfs_block_group_cache *cache;
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
+ struct btrfs_block_group_cache *cache;
|
|
|
|
|
|
- if (pin)
|
|
|
- set_extent_dirty(&fs_info->pinned_extents,
|
|
|
- bytenr, bytenr + num - 1, GFP_NOFS);
|
|
|
-
|
|
|
- while (num > 0) {
|
|
|
- cache = btrfs_lookup_block_group(fs_info, bytenr);
|
|
|
- BUG_ON(!cache);
|
|
|
- len = min(num, cache->key.offset -
|
|
|
- (bytenr - cache->key.objectid));
|
|
|
- if (pin) {
|
|
|
- spin_lock(&cache->space_info->lock);
|
|
|
- spin_lock(&cache->lock);
|
|
|
- cache->pinned += len;
|
|
|
- cache->space_info->bytes_pinned += len;
|
|
|
- spin_unlock(&cache->lock);
|
|
|
- spin_unlock(&cache->space_info->lock);
|
|
|
- fs_info->total_pinned += len;
|
|
|
- } else {
|
|
|
- int unpin = 0;
|
|
|
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
|
|
|
+ BUG_ON(!cache);
|
|
|
|
|
|
- /*
|
|
|
- * in order to not race with the block group caching, we
|
|
|
- * only want to unpin the extent if we are cached. If
|
|
|
- * we aren't cached, we want to start async caching this
|
|
|
- * block group so we can free the extent the next time
|
|
|
- * around.
|
|
|
- */
|
|
|
- spin_lock(&cache->space_info->lock);
|
|
|
- spin_lock(&cache->lock);
|
|
|
- unpin = (cache->cached == BTRFS_CACHE_FINISHED);
|
|
|
- if (likely(unpin)) {
|
|
|
- cache->pinned -= len;
|
|
|
- cache->space_info->bytes_pinned -= len;
|
|
|
- fs_info->total_pinned -= len;
|
|
|
- }
|
|
|
- spin_unlock(&cache->lock);
|
|
|
- spin_unlock(&cache->space_info->lock);
|
|
|
+ spin_lock(&cache->space_info->lock);
|
|
|
+ spin_lock(&cache->lock);
|
|
|
+ cache->pinned += num_bytes;
|
|
|
+ cache->space_info->bytes_pinned += num_bytes;
|
|
|
+ if (reserved) {
|
|
|
+ cache->reserved -= num_bytes;
|
|
|
+ cache->space_info->bytes_reserved -= num_bytes;
|
|
|
+ }
|
|
|
+ spin_unlock(&cache->lock);
|
|
|
+ spin_unlock(&cache->space_info->lock);
|
|
|
|
|
|
- if (likely(unpin))
|
|
|
- clear_extent_dirty(&fs_info->pinned_extents,
|
|
|
- bytenr, bytenr + len -1,
|
|
|
- GFP_NOFS);
|
|
|
- else
|
|
|
- cache_block_group(cache);
|
|
|
+ btrfs_put_block_group(cache);
|
|
|
|
|
|
- if (unpin)
|
|
|
- btrfs_add_free_space(cache, bytenr, len);
|
|
|
- }
|
|
|
- btrfs_put_block_group(cache);
|
|
|
- bytenr += len;
|
|
|
- num -= len;
|
|
|
+ set_extent_dirty(fs_info->pinned_extents,
|
|
|
+ bytenr, bytenr + num_bytes - 1, GFP_NOFS);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
|
|
|
+ u64 num_bytes, int reserve)
|
|
|
+{
|
|
|
+ spin_lock(&cache->space_info->lock);
|
|
|
+ spin_lock(&cache->lock);
|
|
|
+ if (reserve) {
|
|
|
+ cache->reserved += num_bytes;
|
|
|
+ cache->space_info->bytes_reserved += num_bytes;
|
|
|
+ } else {
|
|
|
+ cache->reserved -= num_bytes;
|
|
|
+ cache->space_info->bytes_reserved -= num_bytes;
|
|
|
}
|
|
|
+ spin_unlock(&cache->lock);
|
|
|
+ spin_unlock(&cache->space_info->lock);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static int update_reserved_extents(struct btrfs_root *root,
|
|
|
- u64 bytenr, u64 num, int reserve)
|
|
|
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
|
|
|
+ struct btrfs_root *root)
|
|
|
{
|
|
|
- u64 len;
|
|
|
- struct btrfs_block_group_cache *cache;
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
+ struct btrfs_caching_control *next;
|
|
|
+ struct btrfs_caching_control *caching_ctl;
|
|
|
+ struct btrfs_block_group_cache *cache;
|
|
|
|
|
|
- while (num > 0) {
|
|
|
- cache = btrfs_lookup_block_group(fs_info, bytenr);
|
|
|
- BUG_ON(!cache);
|
|
|
- len = min(num, cache->key.offset -
|
|
|
- (bytenr - cache->key.objectid));
|
|
|
+ down_write(&fs_info->extent_commit_sem);
|
|
|
|
|
|
- spin_lock(&cache->space_info->lock);
|
|
|
- spin_lock(&cache->lock);
|
|
|
- if (reserve) {
|
|
|
- cache->reserved += len;
|
|
|
- cache->space_info->bytes_reserved += len;
|
|
|
+ list_for_each_entry_safe(caching_ctl, next,
|
|
|
+ &fs_info->caching_block_groups, list) {
|
|
|
+ cache = caching_ctl->block_group;
|
|
|
+ if (block_group_cache_done(cache)) {
|
|
|
+ cache->last_byte_to_unpin = (u64)-1;
|
|
|
+ list_del_init(&caching_ctl->list);
|
|
|
+ put_caching_control(caching_ctl);
|
|
|
} else {
|
|
|
- cache->reserved -= len;
|
|
|
- cache->space_info->bytes_reserved -= len;
|
|
|
+ cache->last_byte_to_unpin = caching_ctl->progress;
|
|
|
}
|
|
|
- spin_unlock(&cache->lock);
|
|
|
- spin_unlock(&cache->space_info->lock);
|
|
|
- btrfs_put_block_group(cache);
|
|
|
- bytenr += len;
|
|
|
- num -= len;
|
|
|
}
|
|
|
+
|
|
|
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
|
|
|
+ fs_info->pinned_extents = &fs_info->freed_extents[1];
|
|
|
+ else
|
|
|
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
|
|
|
+
|
|
|
+ up_write(&fs_info->extent_commit_sem);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
|
|
|
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
|
|
|
{
|
|
|
- u64 last = 0;
|
|
|
- u64 start;
|
|
|
- u64 end;
|
|
|
- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
|
|
|
- int ret;
|
|
|
+ struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
+ struct btrfs_block_group_cache *cache = NULL;
|
|
|
+ u64 len;
|
|
|
|
|
|
- while (1) {
|
|
|
- ret = find_first_extent_bit(pinned_extents, last,
|
|
|
- &start, &end, EXTENT_DIRTY);
|
|
|
- if (ret)
|
|
|
- break;
|
|
|
+ while (start <= end) {
|
|
|
+ if (!cache ||
|
|
|
+ start >= cache->key.objectid + cache->key.offset) {
|
|
|
+ if (cache)
|
|
|
+ btrfs_put_block_group(cache);
|
|
|
+ cache = btrfs_lookup_block_group(fs_info, start);
|
|
|
+ BUG_ON(!cache);
|
|
|
+ }
|
|
|
+
|
|
|
+ len = cache->key.objectid + cache->key.offset - start;
|
|
|
+ len = min(len, end + 1 - start);
|
|
|
+
|
|
|
+ if (start < cache->last_byte_to_unpin) {
|
|
|
+ len = min(len, cache->last_byte_to_unpin - start);
|
|
|
+ btrfs_add_free_space(cache, start, len);
|
|
|
+ }
|
|
|
+
|
|
|
+ spin_lock(&cache->space_info->lock);
|
|
|
+ spin_lock(&cache->lock);
|
|
|
+ cache->pinned -= len;
|
|
|
+ cache->space_info->bytes_pinned -= len;
|
|
|
+ spin_unlock(&cache->lock);
|
|
|
+ spin_unlock(&cache->space_info->lock);
|
|
|
|
|
|
- set_extent_dirty(copy, start, end, GFP_NOFS);
|
|
|
- last = end + 1;
|
|
|
+ start += len;
|
|
|
}
|
|
|
+
|
|
|
+ if (cache)
|
|
|
+ btrfs_put_block_group(cache);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- struct extent_io_tree *unpin)
|
|
|
+ struct btrfs_root *root)
|
|
|
{
|
|
|
+ struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
+ struct extent_io_tree *unpin;
|
|
|
u64 start;
|
|
|
u64 end;
|
|
|
int ret;
|
|
|
|
|
|
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
|
|
|
+ unpin = &fs_info->freed_extents[1];
|
|
|
+ else
|
|
|
+ unpin = &fs_info->freed_extents[0];
|
|
|
+
|
|
|
while (1) {
|
|
|
ret = find_first_extent_bit(unpin, 0, &start, &end,
|
|
|
EXTENT_DIRTY);
|
|
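The pinned-extent handling now ping-pongs between the two freed_extents trees: btrfs_pin_extent() always records into whichever tree fs_info->pinned_extents points at, btrfs_prepare_extent_commit() flips that pointer, and btrfs_finish_extent_commit() drains the tree that is no longer receiving pins. A toy model of the flip-and-drain, using fixed arrays instead of extent_io_trees:

#include <stddef.h>

/* two generations of pinned records: pins always land in the active
 * set, commit drains the other one (illustration only) */
struct pin_set { unsigned long long entries[64]; size_t nr; };

struct fs {
	struct pin_set sets[2];
	struct pin_set *active;        /* models fs_info->pinned_extents */
};

static void pin(struct fs *fs, unsigned long long bytenr)
{
	if (fs->active->nr < 64)
		fs->active->entries[fs->active->nr++] = bytenr;
}

/* prepare_extent_commit: new pins go to the other set from now on */
static void prepare_commit(struct fs *fs)
{
	fs->active = (fs->active == &fs->sets[0]) ? &fs->sets[1]
						  : &fs->sets[0];
}

/* finish_extent_commit: drain the set no longer receiving pins */
static void finish_commit(struct fs *fs)
{
	struct pin_set *unpin = (fs->active == &fs->sets[0]) ?
				&fs->sets[1] : &fs->sets[0];

	/* ... discard and hand each range back to the free-space cache ... */
	unpin->nr = 0;
}

Flipping before the drain is what lets new pins from the next transaction proceed while the old set is still being returned to the free-space cache.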
@@ -3186,10 +3285,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
|
|
|
|
|
|
ret = btrfs_discard_extent(root, start, end + 1 - start);
|
|
|
|
|
|
- /* unlocks the pinned mutex */
|
|
|
- btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
|
|
|
clear_extent_dirty(unpin, start, end, GFP_NOFS);
|
|
|
-
|
|
|
+ unpin_extent_range(root, start, end);
|
|
|
cond_resched();
|
|
|
}
|
|
|
|
|
@@ -3199,7 +3296,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
|
|
|
static int pin_down_bytes(struct btrfs_trans_handle *trans,
|
|
|
struct btrfs_root *root,
|
|
|
struct btrfs_path *path,
|
|
|
- u64 bytenr, u64 num_bytes, int is_data,
|
|
|
+ u64 bytenr, u64 num_bytes,
|
|
|
+ int is_data, int reserved,
|
|
|
struct extent_buffer **must_clean)
|
|
|
{
|
|
|
int err = 0;
|
|
@@ -3231,15 +3329,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
|
|
|
}
|
|
|
free_extent_buffer(buf);
|
|
|
pinit:
|
|
|
- btrfs_set_path_blocking(path);
|
|
|
+ if (path)
|
|
|
+ btrfs_set_path_blocking(path);
|
|
|
/* unlocks the pinned mutex */
|
|
|
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
|
|
|
+ btrfs_pin_extent(root, bytenr, num_bytes, reserved);
|
|
|
|
|
|
BUG_ON(err < 0);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-
|
|
|
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
|
|
|
struct btrfs_root *root,
|
|
|
u64 bytenr, u64 num_bytes, u64 parent,
|
|
@@ -3413,7 +3511,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
|
|
|
}
|
|
|
|
|
|
ret = pin_down_bytes(trans, root, path, bytenr,
|
|
|
- num_bytes, is_data, &must_clean);
|
|
|
+ num_bytes, is_data, 0, &must_clean);
|
|
|
if (ret > 0)
|
|
|
mark_free = 1;
|
|
|
BUG_ON(ret < 0);
|
|
@@ -3544,8 +3642,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
|
|
|
if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
|
|
|
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
|
|
|
/* unlocks the pinned mutex */
|
|
|
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
|
|
|
- update_reserved_extents(root, bytenr, num_bytes, 0);
|
|
|
+ btrfs_pin_extent(root, bytenr, num_bytes, 1);
|
|
|
ret = 0;
|
|
|
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
|
|
@@ -3585,19 +3682,33 @@ static noinline int
|
|
|
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
|
|
|
u64 num_bytes)
|
|
|
{
|
|
|
+ struct btrfs_caching_control *caching_ctl;
|
|
|
DEFINE_WAIT(wait);
|
|
|
|
|
|
- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
|
|
|
-
|
|
|
- if (block_group_cache_done(cache)) {
|
|
|
- finish_wait(&cache->caching_q, &wait);
|
|
|
+ caching_ctl = get_caching_control(cache);
|
|
|
+ if (!caching_ctl)
|
|
|
return 0;
|
|
|
- }
|
|
|
- schedule();
|
|
|
- finish_wait(&cache->caching_q, &wait);
|
|
|
|
|
|
- wait_event(cache->caching_q, block_group_cache_done(cache) ||
|
|
|
+ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
|
|
|
(cache->free_space >= num_bytes));
|
|
|
+
|
|
|
+ put_caching_control(caching_ctl);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static noinline int
|
|
|
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
|
|
|
+{
|
|
|
+ struct btrfs_caching_control *caching_ctl;
|
|
|
+ DEFINE_WAIT(wait);
|
|
|
+
|
|
|
+ caching_ctl = get_caching_control(cache);
|
|
|
+ if (!caching_ctl)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ wait_event(caching_ctl->wait, block_group_cache_done(cache));
|
|
|
+
|
|
|
+ put_caching_control(caching_ctl);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
@@ -3635,6 +3746,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
|
|
|
int last_ptr_loop = 0;
|
|
|
int loop = 0;
|
|
|
bool found_uncached_bg = false;
|
|
|
+ bool failed_cluster_refill = false;
|
|
|
|
|
|
WARN_ON(num_bytes < root->sectorsize);
|
|
|
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
|
|
@@ -3732,7 +3844,16 @@ have_block_group:
|
|
|
if (unlikely(block_group->ro))
|
|
|
goto loop;
|
|
|
|
|
|
- if (last_ptr) {
|
|
|
+ /*
|
|
|
+ * Ok we want to try and use the cluster allocator, so lets look
|
|
|
+ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
|
|
|
+ * have tried the cluster allocator plenty of times at this
|
|
|
+ * point and not have found anything, so we are likely way too
|
|
|
+ * fragmented for the clustering stuff to find anything, so let's
|
|
|
+ * just skip it and let the allocator find whatever block it can
|
|
|
+ * find
|
|
|
+ */
|
|
|
+ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
|
|
|
/*
|
|
|
* the refill lock keeps out other
|
|
|
* people trying to start a new cluster
|
|
@@ -3807,9 +3928,11 @@ refill_cluster:
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
goto checks;
|
|
|
}
|
|
|
- } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
|
|
|
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT
|
|
|
+ && !failed_cluster_refill) {
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
|
|
+ failed_cluster_refill = true;
|
|
|
wait_block_group_cache_progress(block_group,
|
|
|
num_bytes + empty_cluster + empty_size);
|
|
|
goto have_block_group;
|
|
@@ -3821,13 +3944,9 @@ refill_cluster:
|
|
|
* cluster. Free the cluster we've been trying
|
|
|
* to use, and go to the next block group
|
|
|
*/
|
|
|
- if (loop < LOOP_NO_EMPTY_SIZE) {
|
|
|
- btrfs_return_cluster_to_free_space(NULL,
|
|
|
- last_ptr);
|
|
|
- spin_unlock(&last_ptr->refill_lock);
|
|
|
- goto loop;
|
|
|
- }
|
|
|
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
+ goto loop;
|
|
|
}
|
|
|
|
|
|
offset = btrfs_find_space_for_alloc(block_group, search_start,
|
|
@@ -3881,9 +4000,12 @@ checks:
|
|
|
search_start - offset);
|
|
|
BUG_ON(offset > search_start);
|
|
|
|
|
|
+ update_reserved_extents(block_group, num_bytes, 1);
|
|
|
+
|
|
|
/* we are all good, lets return */
|
|
|
break;
|
|
|
loop:
|
|
|
+ failed_cluster_refill = false;
|
|
|
btrfs_put_block_group(block_group);
|
|
|
}
|
|
|
up_read(&space_info->groups_sem);
|
|
@@ -3973,12 +4095,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
|
|
|
up_read(&info->groups_sem);
|
|
|
}
|
|
|
|
|
|
-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- u64 num_bytes, u64 min_alloc_size,
|
|
|
- u64 empty_size, u64 hint_byte,
|
|
|
- u64 search_end, struct btrfs_key *ins,
|
|
|
- u64 data)
|
|
|
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
|
|
|
+ struct btrfs_root *root,
|
|
|
+ u64 num_bytes, u64 min_alloc_size,
|
|
|
+ u64 empty_size, u64 hint_byte,
|
|
|
+ u64 search_end, struct btrfs_key *ins,
|
|
|
+ u64 data)
|
|
|
{
|
|
|
int ret;
|
|
|
u64 search_start = 0;
|
|
@@ -4044,25 +4166,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
|
|
|
ret = btrfs_discard_extent(root, start, len);
|
|
|
|
|
|
btrfs_add_free_space(cache, start, len);
|
|
|
+ update_reserved_extents(cache, len, 0);
|
|
|
btrfs_put_block_group(cache);
|
|
|
- update_reserved_extents(root, start, len, 0);
|
|
|
-
|
|
|
- return ret;
|
|
|
-}
|
|
|
-
|
|
|
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- u64 num_bytes, u64 min_alloc_size,
|
|
|
- u64 empty_size, u64 hint_byte,
|
|
|
- u64 search_end, struct btrfs_key *ins,
|
|
|
- u64 data)
|
|
|
-{
|
|
|
- int ret;
|
|
|
- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
|
|
|
- empty_size, hint_byte, search_end, ins,
|
|
|
- data);
|
|
|
- if (!ret)
|
|
|
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
|
|
|
|
|
|
return ret;
|
|
|
}
|
|
@@ -4223,15 +4328,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
|
|
|
{
|
|
|
int ret;
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
+ struct btrfs_caching_control *caching_ctl;
|
|
|
+ u64 start = ins->objectid;
|
|
|
+ u64 num_bytes = ins->offset;
|
|
|
|
|
|
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
|
|
|
cache_block_group(block_group);
|
|
|
- wait_event(block_group->caching_q,
|
|
|
- block_group_cache_done(block_group));
|
|
|
+ caching_ctl = get_caching_control(block_group);
|
|
|
|
|
|
- ret = btrfs_remove_free_space(block_group, ins->objectid,
|
|
|
- ins->offset);
|
|
|
- BUG_ON(ret);
|
|
|
+ if (!caching_ctl) {
|
|
|
+ BUG_ON(!block_group_cache_done(block_group));
|
|
|
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
|
|
|
+ BUG_ON(ret);
|
|
|
+ } else {
|
|
|
+ mutex_lock(&caching_ctl->mutex);
|
|
|
+
|
|
|
+ if (start >= caching_ctl->progress) {
|
|
|
+ ret = add_excluded_extent(root, start, num_bytes);
|
|
|
+ BUG_ON(ret);
|
|
|
+ } else if (start + num_bytes <= caching_ctl->progress) {
|
|
|
+ ret = btrfs_remove_free_space(block_group,
|
|
|
+ start, num_bytes);
|
|
|
+ BUG_ON(ret);
|
|
|
+ } else {
|
|
|
+ num_bytes = caching_ctl->progress - start;
|
|
|
+ ret = btrfs_remove_free_space(block_group,
|
|
|
+ start, num_bytes);
|
|
|
+ BUG_ON(ret);
|
|
|
+
|
|
|
+ start = caching_ctl->progress;
|
|
|
+ num_bytes = ins->objectid + ins->offset -
|
|
|
+ caching_ctl->progress;
|
|
|
+ ret = add_excluded_extent(root, start, num_bytes);
|
|
|
+ BUG_ON(ret);
|
|
|
+ }
|
|
|
+
|
|
|
+ mutex_unlock(&caching_ctl->mutex);
|
|
|
+ put_caching_control(caching_ctl);
|
|
|
+ }
|
|
|
+
|
|
|
+ update_reserved_extents(block_group, ins->offset, 1);
|
|
|
btrfs_put_block_group(block_group);
|
|
|
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
|
|
|
0, owner, offset, ins, 1);
|
|
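btrfs_alloc_logged_file_extent() now has to split the logged range against the caching thread's progress: the part the scanner has already passed must come out of the free-space cache, while the part it has not reached yet is marked excluded so it will be skipped. The arithmetic of that three-way split, pulled out as a stand-alone helper purely for illustration:

/* split [start, start + len) against the caching thread's progress:
 * the already-scanned part must be removed from the free-space cache,
 * the unscanned part must be marked excluded so the scanner skips it */
static void split_against_progress(unsigned long long start,
				   unsigned long long len,
				   unsigned long long progress,
				   unsigned long long *remove_len,
				   unsigned long long *exclude_len)
{
	if (start >= progress) {
		*remove_len = 0;
		*exclude_len = len;
	} else if (start + len <= progress) {
		*remove_len = len;
		*exclude_len = 0;
	} else {
		*remove_len = progress - start;
		*exclude_len = start + len - progress;
	}
}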
@@ -4255,9 +4391,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
|
|
|
int ret;
|
|
|
u64 flags = 0;
|
|
|
|
|
|
- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
|
|
|
- empty_size, hint_byte, search_end,
|
|
|
- ins, 0);
|
|
|
+ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
|
|
|
+ empty_size, hint_byte, search_end,
|
|
|
+ ins, 0);
|
|
|
if (ret)
|
|
|
return ret;
|
|
|
|
|
@@ -4268,7 +4404,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
|
|
|
} else
|
|
|
BUG_ON(parent > 0);
|
|
|
|
|
|
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
|
|
|
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
|
|
|
struct btrfs_delayed_extent_op *extent_op;
|
|
|
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
|
|
@@ -4347,452 +4482,99 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
|
|
|
return buf;
|
|
|
}
|
|
|
|
|
|
-#if 0
|
|
|
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root, struct extent_buffer *leaf)
|
|
|
+struct walk_control {
|
|
|
+ u64 refs[BTRFS_MAX_LEVEL];
|
|
|
+ u64 flags[BTRFS_MAX_LEVEL];
|
|
|
+ struct btrfs_key update_progress;
|
|
|
+ int stage;
|
|
|
+ int level;
|
|
|
+ int shared_level;
|
|
|
+ int update_ref;
|
|
|
+ int keep_locks;
|
|
|
+ int reada_slot;
|
|
|
+ int reada_count;
|
|
|
+};
|
|
|
+
|
|
|
+#define DROP_REFERENCE 1
|
|
|
+#define UPDATE_BACKREF 2
|
|
|
+
|
|
|
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
|
|
|
+ struct btrfs_root *root,
|
|
|
+ struct walk_control *wc,
|
|
|
+ struct btrfs_path *path)
|
|
|
{
|
|
|
- u64 disk_bytenr;
|
|
|
- u64 num_bytes;
|
|
|
- struct btrfs_key key;
|
|
|
- struct btrfs_file_extent_item *fi;
|
|
|
+ u64 bytenr;
|
|
|
+ u64 generation;
|
|
|
+ u64 refs;
|
|
|
+ u64 last = 0;
|
|
|
u32 nritems;
|
|
|
- int i;
|
|
|
+ u32 blocksize;
|
|
|
+ struct btrfs_key key;
|
|
|
+ struct extent_buffer *eb;
|
|
|
int ret;
|
|
|
+ int slot;
|
|
|
+ int nread = 0;
|
|
|
|
|
|
- BUG_ON(!btrfs_is_leaf(leaf));
|
|
|
- nritems = btrfs_header_nritems(leaf);
|
|
|
+ if (path->slots[wc->level] < wc->reada_slot) {
|
|
|
+ wc->reada_count = wc->reada_count * 2 / 3;
|
|
|
+ wc->reada_count = max(wc->reada_count, 2);
|
|
|
+ } else {
|
|
|
+ wc->reada_count = wc->reada_count * 3 / 2;
|
|
|
+ wc->reada_count = min_t(int, wc->reada_count,
|
|
|
+ BTRFS_NODEPTRS_PER_BLOCK(root));
|
|
|
+ }
|
|
|
|
|
|
- for (i = 0; i < nritems; i++) {
|
|
|
- cond_resched();
|
|
|
- btrfs_item_key_to_cpu(leaf, &key, i);
|
|
|
+ eb = path->nodes[wc->level];
|
|
|
+ nritems = btrfs_header_nritems(eb);
|
|
|
+ blocksize = btrfs_level_size(root, wc->level - 1);
|
|
|
|
|
|
- /* only extents have references, skip everything else */
|
|
|
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
|
|
|
- continue;
|
|
|
-
|
|
|
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
|
|
|
-
|
|
|
- /* inline extents live in the btree, they don't have refs */
|
|
|
- if (btrfs_file_extent_type(leaf, fi) ==
|
|
|
- BTRFS_FILE_EXTENT_INLINE)
|
|
|
- continue;
|
|
|
-
|
|
|
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
|
|
-
|
|
|
- /* holes don't have refs */
|
|
|
- if (disk_bytenr == 0)
|
|
|
- continue;
|
|
|
-
|
|
|
- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
|
|
|
- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
|
|
|
- leaf->start, 0, key.objectid, 0);
|
|
|
- BUG_ON(ret);
|
|
|
- }
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- struct btrfs_leaf_ref *ref)
|
|
|
-{
|
|
|
- int i;
|
|
|
- int ret;
|
|
|
- struct btrfs_extent_info *info;
|
|
|
- struct refsort *sorted;
|
|
|
-
|
|
|
- if (ref->nritems == 0)
|
|
|
- return 0;
|
|
|
-
|
|
|
- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
|
|
|
- for (i = 0; i < ref->nritems; i++) {
|
|
|
- sorted[i].bytenr = ref->extents[i].bytenr;
|
|
|
- sorted[i].slot = i;
|
|
|
- }
|
|
|
- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
|
|
|
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
|
|
|
+ if (nread >= wc->reada_count)
|
|
|
+ break;
|
|
|
|
|
|
- /*
|
|
|
- * the items in the ref were sorted when the ref was inserted
|
|
|
- * into the ref cache, so this is already in order
|
|
|
- */
|
|
|
- for (i = 0; i < ref->nritems; i++) {
|
|
|
- info = ref->extents + sorted[i].slot;
|
|
|
- ret = btrfs_free_extent(trans, root, info->bytenr,
|
|
|
- info->num_bytes, ref->bytenr,
|
|
|
- ref->owner, ref->generation,
|
|
|
- info->objectid, 0);
|
|
|
-
|
|
|
- atomic_inc(&root->fs_info->throttle_gen);
|
|
|
- wake_up(&root->fs_info->transaction_throttle);
|
|
|
cond_resched();
|
|
|
+ bytenr = btrfs_node_blockptr(eb, slot);
|
|
|
+ generation = btrfs_node_ptr_generation(eb, slot);
|
|
|
|
|
|
- BUG_ON(ret);
|
|
|
- info++;
|
|
|
- }
|
|
|
-
|
|
|
- kfree(sorted);
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root, u64 start,
|
|
|
- u64 len, u32 *refs)
|
|
|
-{
|
|
|
- int ret;
|
|
|
-
|
|
|
- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
|
|
|
- BUG_ON(ret);
|
|
|
-
|
|
|
-#if 0 /* some debugging code in case we see problems here */
|
|
|
- /* if the refs count is one, it won't get increased again. But
|
|
|
- * if the ref count is > 1, someone may be decreasing it at
|
|
|
- * the same time we are.
|
|
|
- */
|
|
|
- if (*refs != 1) {
|
|
|
- struct extent_buffer *eb = NULL;
|
|
|
- eb = btrfs_find_create_tree_block(root, start, len);
|
|
|
- if (eb)
|
|
|
- btrfs_tree_lock(eb);
|
|
|
-
|
|
|
- mutex_lock(&root->fs_info->alloc_mutex);
|
|
|
- ret = lookup_extent_ref(NULL, root, start, len, refs);
|
|
|
- BUG_ON(ret);
|
|
|
- mutex_unlock(&root->fs_info->alloc_mutex);
|
|
|
-
|
|
|
- if (eb) {
|
|
|
- btrfs_tree_unlock(eb);
|
|
|
- free_extent_buffer(eb);
|
|
|
- }
|
|
|
- if (*refs == 1) {
|
|
|
- printk(KERN_ERR "btrfs block %llu went down to one "
|
|
|
- "during drop_snap\n", (unsigned long long)start);
|
|
|
- }
|
|
|
-
|
|
|
- }
|
|
|
-#endif
|
|
|
-
|
|
|
- cond_resched();
|
|
|
- return ret;
|
|
|
-}
|
|
|
-
|
|
|
+ if (slot == path->slots[wc->level])
|
|
|
+ goto reada;
|
|
|
|
|
|
-/*
|
|
|
- * this is used while deleting old snapshots, and it drops the refs
|
|
|
- * on a whole subtree starting from a level 1 node.
|
|
|
- *
|
|
|
- * The idea is to sort all the leaf pointers, and then drop the
|
|
|
- * ref on all the leaves in order. Most of the time the leaves
|
|
|
- * will have ref cache entries, so no leaf IOs will be required to
|
|
|
- * find the extents they have references on.
|
|
|
- *
|
|
|
- * For each leaf, any references it has are also dropped in order
|
|
|
- *
|
|
|
- * This ends up dropping the references in something close to optimal
|
|
|
- * order for reading and modifying the extent allocation tree.
|
|
|
- */
|
|
|
-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- struct btrfs_path *path)
|
|
|
-{
|
|
|
- u64 bytenr;
|
|
|
- u64 root_owner;
|
|
|
- u64 root_gen;
|
|
|
- struct extent_buffer *eb = path->nodes[1];
|
|
|
- struct extent_buffer *leaf;
|
|
|
- struct btrfs_leaf_ref *ref;
|
|
|
- struct refsort *sorted = NULL;
|
|
|
- int nritems = btrfs_header_nritems(eb);
|
|
|
- int ret;
|
|
|
- int i;
|
|
|
- int refi = 0;
|
|
|
- int slot = path->slots[1];
|
|
|
- u32 blocksize = btrfs_level_size(root, 0);
|
|
|
- u32 refs;
|
|
|
-
|
|
|
- if (nritems == 0)
|
|
|
- goto out;
|
|
|
-
|
|
|
- root_owner = btrfs_header_owner(eb);
|
|
|
- root_gen = btrfs_header_generation(eb);
|
|
|
- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
|
|
|
-
|
|
|
- /*
|
|
|
- * step one, sort all the leaf pointers so we don't scribble
|
|
|
- * randomly into the extent allocation tree
|
|
|
- */
|
|
|
- for (i = slot; i < nritems; i++) {
|
|
|
- sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
|
|
|
- sorted[refi].slot = i;
|
|
|
- refi++;
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * nritems won't be zero, but if we're picking up drop_snapshot
|
|
|
- * after a crash, slot might be > 0, so double check things
|
|
|
- * just in case.
|
|
|
- */
|
|
|
- if (refi == 0)
|
|
|
- goto out;
|
|
|
-
|
|
|
- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
|
|
|
-
|
|
|
- /*
|
|
|
- * the first loop frees everything the leaves point to
|
|
|
- */
|
|
|
- for (i = 0; i < refi; i++) {
|
|
|
- u64 ptr_gen;
|
|
|
-
|
|
|
- bytenr = sorted[i].bytenr;
|
|
|
-
|
|
|
- /*
|
|
|
- * check the reference count on this leaf. If it is > 1
|
|
|
- * we just decrement it below and don't update any
|
|
|
- * of the refs the leaf points to.
|
|
|
- */
|
|
|
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
|
|
|
- blocksize, &refs);
|
|
|
- BUG_ON(ret);
|
|
|
- if (refs != 1)
|
|
|
+ if (wc->stage == UPDATE_BACKREF &&
|
|
|
+ generation <= root->root_key.offset)
|
|
|
continue;
|
|
|
|
|
|
- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
|
|
|
-
|
|
|
- /*
|
|
|
- * the leaf only had one reference, which means the
|
|
|
- * only thing pointing to this leaf is the snapshot
|
|
|
- * we're deleting. It isn't possible for the reference
|
|
|
- * count to increase again later
|
|
|
- *
|
|
|
- * The reference cache is checked for the leaf,
|
|
|
- * and if found we'll be able to drop any refs held by
|
|
|
- * the leaf without needing to read it in.
|
|
|
- */
|
|
|
- ref = btrfs_lookup_leaf_ref(root, bytenr);
|
|
|
- if (ref && ref->generation != ptr_gen) {
|
|
|
- btrfs_free_leaf_ref(root, ref);
|
|
|
- ref = NULL;
|
|
|
- }
|
|
|
- if (ref) {
|
|
|
- ret = cache_drop_leaf_ref(trans, root, ref);
|
|
|
- BUG_ON(ret);
|
|
|
- btrfs_remove_leaf_ref(root, ref);
|
|
|
- btrfs_free_leaf_ref(root, ref);
|
|
|
- } else {
|
|
|
- /*
|
|
|
- * the leaf wasn't in the reference cache, so
|
|
|
- * we have to read it.
|
|
|
- */
|
|
|
- leaf = read_tree_block(root, bytenr, blocksize,
|
|
|
- ptr_gen);
|
|
|
- ret = btrfs_drop_leaf_ref(trans, root, leaf);
|
|
|
+ if (wc->stage == DROP_REFERENCE) {
|
|
|
+ ret = btrfs_lookup_extent_info(trans, root,
|
|
|
+ bytenr, blocksize,
|
|
|
+ &refs, NULL);
|
|
|
BUG_ON(ret);
|
|
|
- free_extent_buffer(leaf);
|
|
|
- }
|
|
|
- atomic_inc(&root->fs_info->throttle_gen);
|
|
|
- wake_up(&root->fs_info->transaction_throttle);
|
|
|
- cond_resched();
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * run through the loop again to free the refs on the leaves.
|
|
|
- * This is faster than doing it in the loop above because
|
|
|
- * the leaves are likely to be clustered together. We end up
|
|
|
- * working in nice chunks on the extent allocation tree.
|
|
|
- */
|
|
|
- for (i = 0; i < refi; i++) {
|
|
|
- bytenr = sorted[i].bytenr;
|
|
|
- ret = btrfs_free_extent(trans, root, bytenr,
|
|
|
- blocksize, eb->start,
|
|
|
- root_owner, root_gen, 0, 1);
|
|
|
- BUG_ON(ret);
|
|
|
+ BUG_ON(refs == 0);
|
|
|
+ if (refs == 1)
|
|
|
+ goto reada;
|
|
|
|
|
|
- atomic_inc(&root->fs_info->throttle_gen);
|
|
|
- wake_up(&root->fs_info->transaction_throttle);
|
|
|
- cond_resched();
|
|
|
- }
|
|
|
-out:
|
|
|
- kfree(sorted);
|
|
|
-
|
|
|
- /*
|
|
|
- * update the path to show we've processed the entire level 1
|
|
|
- * node. This will get saved into the root's drop_snapshot_progress
|
|
|
- * field so these drops are not repeated again if this transaction
|
|
|
- * commits.
|
|
|
- */
|
|
|
- path->slots[1] = nritems;
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- * helper function for drop_snapshot, this walks down the tree dropping ref
|
|
|
- * counts as it goes.
|
|
|
- */
|
|
|
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- struct btrfs_path *path, int *level)
|
|
|
-{
|
|
|
- u64 root_owner;
|
|
|
- u64 root_gen;
|
|
|
- u64 bytenr;
|
|
|
- u64 ptr_gen;
|
|
|
- struct extent_buffer *next;
|
|
|
- struct extent_buffer *cur;
|
|
|
- struct extent_buffer *parent;
|
|
|
- u32 blocksize;
|
|
|
- int ret;
|
|
|
- u32 refs;
|
|
|
-
|
|
|
- WARN_ON(*level < 0);
|
|
|
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
|
|
|
- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
|
|
|
- path->nodes[*level]->len, &refs);
|
|
|
- BUG_ON(ret);
|
|
|
- if (refs > 1)
|
|
|
- goto out;
|
|
|
-
|
|
|
- /*
|
|
|
- * walk down to the last node level and free all the leaves
|
|
|
- */
|
|
|
- while (*level >= 0) {
|
|
|
- WARN_ON(*level < 0);
|
|
|
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
|
|
|
- cur = path->nodes[*level];
|
|
|
-
|
|
|
- if (btrfs_header_level(cur) != *level)
|
|
|
- WARN_ON(1);
|
|
|
-
|
|
|
- if (path->slots[*level] >=
|
|
|
- btrfs_header_nritems(cur))
|
|
|
- break;
|
|
|
-
|
|
|
- /* the new code goes down to level 1 and does all the
|
|
|
- * leaves pointed to that node in bulk. So, this check
|
|
|
- * for level 0 will always be false.
|
|
|
- *
|
|
|
- * But, the disk format allows the drop_snapshot_progress
|
|
|
- * field in the root to leave things in a state where
|
|
|
- * a leaf will need cleaning up here. If someone crashes
|
|
|
- * with the old code and then boots with the new code,
|
|
|
- * we might find a leaf here.
|
|
|
- */
|
|
|
- if (*level == 0) {
|
|
|
- ret = btrfs_drop_leaf_ref(trans, root, cur);
|
|
|
- BUG_ON(ret);
|
|
|
- break;
|
|
|
+ if (!wc->update_ref ||
|
|
|
+ generation <= root->root_key.offset)
|
|
|
+ continue;
|
|
|
+ btrfs_node_key_to_cpu(eb, &key, slot);
|
|
|
+ ret = btrfs_comp_cpu_keys(&key,
|
|
|
+ &wc->update_progress);
|
|
|
+ if (ret < 0)
|
|
|
+ continue;
|
|
|
}
|
|
|
-
|
|
|
- /*
|
|
|
- * once we get to level one, process the whole node
|
|
|
- * at once, including everything below it.
|
|
|
- */
|
|
|
- if (*level == 1) {
|
|
|
- ret = drop_level_one_refs(trans, root, path);
|
|
|
- BUG_ON(ret);
|
|
|
+reada:
|
|
|
+ ret = readahead_tree_block(root, bytenr, blocksize,
|
|
|
+ generation);
|
|
|
+ if (ret)
|
|
|
break;
|
|
|
- }
|
|
|
-
|
|
|
- bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
|
|
|
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
|
|
|
- blocksize = btrfs_level_size(root, *level - 1);
|
|
|
-
|
|
|
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
|
|
|
- blocksize, &refs);
|
|
|
- BUG_ON(ret);
|
|
|
-
|
|
|
- /*
|
|
|
- * if there is more than one reference, we don't need
|
|
|
- * to read that node to drop any references it has. We
|
|
|
- * just drop the ref we hold on that node and move on to the
|
|
|
- * next slot in this level.
|
|
|
- */
|
|
|
- if (refs != 1) {
|
|
|
- parent = path->nodes[*level];
|
|
|
- root_owner = btrfs_header_owner(parent);
|
|
|
- root_gen = btrfs_header_generation(parent);
|
|
|
- path->slots[*level]++;
|
|
|
-
|
|
|
- ret = btrfs_free_extent(trans, root, bytenr,
|
|
|
- blocksize, parent->start,
|
|
|
- root_owner, root_gen,
|
|
|
- *level - 1, 1);
|
|
|
- BUG_ON(ret);
|
|
|
-
|
|
|
- atomic_inc(&root->fs_info->throttle_gen);
|
|
|
- wake_up(&root->fs_info->transaction_throttle);
|
|
|
- cond_resched();
|
|
|
-
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * we need to keep freeing things in the next level down.
|
|
|
- * read the block and loop around to process it
|
|
|
- */
|
|
|
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
|
|
|
- WARN_ON(*level <= 0);
|
|
|
- if (path->nodes[*level-1])
|
|
|
- free_extent_buffer(path->nodes[*level-1]);
|
|
|
- path->nodes[*level-1] = next;
|
|
|
- *level = btrfs_header_level(next);
|
|
|
- path->slots[*level] = 0;
|
|
|
- cond_resched();
|
|
|
+ last = bytenr + blocksize;
|
|
|
+ nread++;
|
|
|
}
|
|
|
-out:
|
|
|
- WARN_ON(*level < 0);
|
|
|
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
|
|
|
-
|
|
|
- if (path->nodes[*level] == root->node) {
|
|
|
- parent = path->nodes[*level];
|
|
|
- bytenr = path->nodes[*level]->start;
|
|
|
- } else {
|
|
|
- parent = path->nodes[*level + 1];
|
|
|
- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
|
|
|
- }
|
|
|
-
|
|
|
- blocksize = btrfs_level_size(root, *level);
|
|
|
- root_owner = btrfs_header_owner(parent);
|
|
|
- root_gen = btrfs_header_generation(parent);
|
|
|
-
|
|
|
- /*
|
|
|
- * cleanup and free the reference on the last node
|
|
|
- * we processed
|
|
|
- */
|
|
|
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
|
|
|
- parent->start, root_owner, root_gen,
|
|
|
- *level, 1);
|
|
|
- free_extent_buffer(path->nodes[*level]);
|
|
|
- path->nodes[*level] = NULL;
|
|
|
-
|
|
|
- *level += 1;
|
|
|
- BUG_ON(ret);
|
|
|
-
|
|
|
- cond_resched();
|
|
|
- return 0;
|
|
|
+ wc->reada_slot = slot;
|
|
|
}
|
|
|
-#endif
|
|
|
-
|
|
|
-struct walk_control {
|
|
|
- u64 refs[BTRFS_MAX_LEVEL];
|
|
|
- u64 flags[BTRFS_MAX_LEVEL];
|
|
|
- struct btrfs_key update_progress;
|
|
|
- int stage;
|
|
|
- int level;
|
|
|
- int shared_level;
|
|
|
- int update_ref;
|
|
|
- int keep_locks;
|
|
|
-};
|
|
|
-
|
|
|
-#define DROP_REFERENCE 1
|
|
|
-#define UPDATE_BACKREF 2
|
|
|
|
|
|
/*
|
|
|
* hepler to process tree block while walking down the tree.
|
|
|
*
|
|
|
- * when wc->stage == DROP_REFERENCE, this function checks
|
|
|
- * reference count of the block. if the block is shared and
|
|
|
- * we need update back refs for the subtree rooted at the
|
|
|
- * block, this function changes wc->stage to UPDATE_BACKREF
|
|
|
- *
|
|
|
* when wc->stage == UPDATE_BACKREF, this function updates
|
|
|
* back refs for pointers in the block.
|
|
|
*
|
|
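reada_walk_down() sizes its readahead batch adaptively: if the walker is still inside the previous window the batch shrinks by a third (never below 2), otherwise it grows by half, capped at the number of pointers per node. The window-sizing rule on its own (max here plays the role of BTRFS_NODEPTRS_PER_BLOCK(root)):

/* shrink the batch while the walker is still inside the previous
 * readahead window, grow it once the walker has moved past it */
static int next_reada_count(int count, int slot, int reada_slot, int max)
{
	if (slot < reada_slot) {
		count = count * 2 / 3;
		if (count < 2)
			count = 2;
	} else {
		count = count * 3 / 2;
		if (count > max)
			count = max;
	}
	return count;
}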
@@ -4805,7 +4587,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
|
|
|
{
|
|
|
int level = wc->level;
|
|
|
struct extent_buffer *eb = path->nodes[level];
|
|
|
- struct btrfs_key key;
|
|
|
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
|
|
|
int ret;
|
|
|
|
|
@@ -4828,21 +4609,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
|
|
|
BUG_ON(wc->refs[level] == 0);
|
|
|
}
|
|
|
|
|
|
- if (wc->stage == DROP_REFERENCE &&
|
|
|
- wc->update_ref && wc->refs[level] > 1) {
|
|
|
- BUG_ON(eb == root->node);
|
|
|
- BUG_ON(path->slots[level] > 0);
|
|
|
- if (level == 0)
|
|
|
- btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
|
|
|
- else
|
|
|
- btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
|
|
|
- if (btrfs_header_owner(eb) == root->root_key.objectid &&
|
|
|
- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
|
|
|
- wc->stage = UPDATE_BACKREF;
|
|
|
- wc->shared_level = level;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
if (wc->stage == DROP_REFERENCE) {
|
|
|
if (wc->refs[level] > 1)
|
|
|
return 1;
|
|
@@ -4878,6 +4644,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * helper to process tree block pointer.
|
|
|
+ *
|
|
|
+ * when wc->stage == DROP_REFERENCE, this function checks
|
|
|
+ * the reference count of the block pointed to. if the block
+ * is shared and we need to update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back refs, this function drops the reference
+ * to the block.
|
|
|
+ *
|
|
|
+ * NOTE: return value 1 means we should stop walking down.
|
|
|
+ */
|
|
|
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
|
|
|
+ struct btrfs_root *root,
|
|
|
+ struct btrfs_path *path,
|
|
|
+ struct walk_control *wc)
|
|
|
+{
|
|
|
+ u64 bytenr;
|
|
|
+ u64 generation;
|
|
|
+ u64 parent;
|
|
|
+ u32 blocksize;
|
|
|
+ struct btrfs_key key;
|
|
|
+ struct extent_buffer *next;
|
|
|
+ int level = wc->level;
|
|
|
+ int reada = 0;
|
|
|
+ int ret = 0;
|
|
|
+
|
|
|
+ generation = btrfs_node_ptr_generation(path->nodes[level],
|
|
|
+ path->slots[level]);
|
|
|
+ /*
|
|
|
+ * if the lower level block was created before the snapshot
|
|
|
+ * was created, we know there is no need to update back refs
|
|
|
+ * for the subtree
|
|
|
+ */
|
|
|
+ if (wc->stage == UPDATE_BACKREF &&
|
|
|
+ generation <= root->root_key.offset)
|
|
|
+ return 1;
|
|
|
+
|
|
|
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
|
|
|
+ blocksize = btrfs_level_size(root, level - 1);
|
|
|
+
|
|
|
+ next = btrfs_find_tree_block(root, bytenr, blocksize);
|
|
|
+ if (!next) {
|
|
|
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
|
|
|
+ reada = 1;
|
|
|
+ }
|
|
|
+ btrfs_tree_lock(next);
|
|
|
+ btrfs_set_lock_blocking(next);
|
|
|
+
|
|
|
+ if (wc->stage == DROP_REFERENCE) {
|
|
|
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
|
|
|
+ &wc->refs[level - 1],
|
|
|
+ &wc->flags[level - 1]);
|
|
|
+ BUG_ON(ret);
|
|
|
+ BUG_ON(wc->refs[level - 1] == 0);
|
|
|
+
|
|
|
+ if (wc->refs[level - 1] > 1) {
|
|
|
+ if (!wc->update_ref ||
|
|
|
+ generation <= root->root_key.offset)
|
|
|
+ goto skip;
|
|
|
+
|
|
|
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
|
|
|
+ path->slots[level]);
|
|
|
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
|
|
|
+ if (ret < 0)
|
|
|
+ goto skip;
|
|
|
+
|
|
|
+ wc->stage = UPDATE_BACKREF;
|
|
|
+ wc->shared_level = level - 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!btrfs_buffer_uptodate(next, generation)) {
|
|
|
+ btrfs_tree_unlock(next);
|
|
|
+ free_extent_buffer(next);
|
|
|
+ next = NULL;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!next) {
|
|
|
+ if (reada && level == 1)
|
|
|
+ reada_walk_down(trans, root, wc, path);
|
|
|
+ next = read_tree_block(root, bytenr, blocksize, generation);
|
|
|
+ btrfs_tree_lock(next);
|
|
|
+ btrfs_set_lock_blocking(next);
|
|
|
+ }
|
|
|
+
|
|
|
+ level--;
|
|
|
+ BUG_ON(level != btrfs_header_level(next));
|
|
|
+ path->nodes[level] = next;
|
|
|
+ path->slots[level] = 0;
|
|
|
+ path->locks[level] = 1;
|
|
|
+ wc->level = level;
|
|
|
+ if (wc->level == 1)
|
|
|
+ wc->reada_slot = 0;
|
|
|
+ return 0;
|
|
|
+skip:
|
|
|
+ wc->refs[level - 1] = 0;
|
|
|
+ wc->flags[level - 1] = 0;
|
|
|
+
|
|
|
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
|
|
|
+ parent = path->nodes[level]->start;
|
|
|
+ } else {
|
|
|
+ BUG_ON(root->root_key.objectid !=
|
|
|
+ btrfs_header_owner(path->nodes[level]));
|
|
|
+ parent = 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
|
|
|
+ root->root_key.objectid, level - 1, 0);
|
|
|
+ BUG_ON(ret);
|
|
|
+
|
|
|
+ btrfs_tree_unlock(next);
|
|
|
+ free_extent_buffer(next);
|
|
|
+ return 1;
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
* hepler to process tree block while walking up the tree.
|
|
|
*
|
|
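do_walk_down() decides per child whether to descend or to simply drop the walker's reference and skip the whole subtree. One reading of that decision during the DROP_REFERENCE stage, reduced to a predicate with illustrative parameters (snapshot_gen stands for root->root_key.offset, key_at_or_past_progress for the btrfs_comp_cpu_keys() check against wc->update_progress):

/* during DROP_REFERENCE: skip (and drop our ref on) a shared child
 * unless its back references still need rewriting; descending in the
 * shared case goes hand in hand with switching to UPDATE_BACKREF */
static int should_skip_child(unsigned long long refs, int update_ref,
			     unsigned long long child_gen,
			     unsigned long long snapshot_gen,
			     int key_at_or_past_progress)
{
	if (refs == 1)
		return 0;                  /* sole owner: walk into it */
	if (!update_ref || child_gen <= snapshot_gen)
		return 1;                  /* shared, nothing left to fix */
	return key_at_or_past_progress ? 0 : 1;
}

Returning 0 in the shared-but-needs-backref case corresponds to the code switching wc->stage to UPDATE_BACKREF and wc->shared_level before descending.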
@@ -4905,7 +4788,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
|
|
|
if (level < wc->shared_level)
|
|
|
goto out;
|
|
|
|
|
|
- BUG_ON(wc->refs[level] <= 1);
|
|
|
ret = find_next_key(path, level + 1, &wc->update_progress);
|
|
|
if (ret > 0)
|
|
|
wc->update_ref = 0;
|
|
@@ -4936,8 +4818,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
|
|
|
path->locks[level] = 0;
|
|
|
return 1;
|
|
|
}
|
|
|
- } else {
|
|
|
- BUG_ON(level != 0);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -4990,17 +4870,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
|
|
|
struct btrfs_path *path,
|
|
|
struct walk_control *wc)
|
|
|
{
|
|
|
- struct extent_buffer *next;
|
|
|
- struct extent_buffer *cur;
|
|
|
- u64 bytenr;
|
|
|
- u64 ptr_gen;
|
|
|
- u32 blocksize;
|
|
|
int level = wc->level;
|
|
|
int ret;
|
|
|
|
|
|
while (level >= 0) {
|
|
|
- cur = path->nodes[level];
|
|
|
- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
|
|
|
+ if (path->slots[level] >=
|
|
|
+ btrfs_header_nritems(path->nodes[level]))
|
|
|
+ break;
|
|
|
|
|
|
ret = walk_down_proc(trans, root, path, wc);
|
|
|
if (ret > 0)
|
|
@@ -5009,20 +4885,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
|
|
|
if (level == 0)
|
|
|
break;
|
|
|
|
|
|
- bytenr = btrfs_node_blockptr(cur, path->slots[level]);
|
|
|
- blocksize = btrfs_level_size(root, level - 1);
|
|
|
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
|
|
|
-
|
|
|
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
|
|
|
- btrfs_tree_lock(next);
|
|
|
- btrfs_set_lock_blocking(next);
|
|
|
-
|
|
|
- level--;
|
|
|
- BUG_ON(level != btrfs_header_level(next));
|
|
|
- path->nodes[level] = next;
|
|
|
- path->slots[level] = 0;
|
|
|
- path->locks[level] = 1;
|
|
|
- wc->level = level;
|
|
|
+ ret = do_walk_down(trans, root, path, wc);
|
|
|
+ if (ret > 0) {
|
|
|
+ path->slots[level]++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ level = wc->level;
|
|
|
}
|
|
|
return 0;
|
|
|
}
|
|
@@ -5112,9 +4980,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
|
|
|
err = ret;
|
|
|
goto out;
|
|
|
}
|
|
|
- btrfs_node_key_to_cpu(path->nodes[level], &key,
|
|
|
- path->slots[level]);
|
|
|
- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
|
|
|
+ WARN_ON(ret > 0);
|
|
|
|
|
|
/*
|
|
|
* unlock our path, this is safe because only this
|
|
@@ -5149,6 +5015,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
|
|
|
wc->stage = DROP_REFERENCE;
|
|
|
wc->update_ref = update_ref;
|
|
|
wc->keep_locks = 0;
|
|
|
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
|
|
|
|
|
|
while (1) {
|
|
|
ret = walk_down_tree(trans, root, path, wc);
|
|
@@ -5201,9 +5068,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
|
|
|
ret = btrfs_del_root(trans, tree_root, &root->root_key);
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
- free_extent_buffer(root->node);
|
|
|
- free_extent_buffer(root->commit_root);
|
|
|
- kfree(root);
|
|
|
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
|
|
|
+ ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
|
|
|
+ NULL, NULL);
|
|
|
+ BUG_ON(ret < 0);
|
|
|
+ if (ret > 0) {
|
|
|
+ ret = btrfs_del_orphan_item(trans, tree_root,
|
|
|
+ root->root_key.objectid);
|
|
|
+ BUG_ON(ret);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (root->in_radix) {
|
|
|
+ btrfs_free_fs_root(tree_root->fs_info, root);
|
|
|
+ } else {
|
|
|
+ free_extent_buffer(root->node);
|
|
|
+ free_extent_buffer(root->commit_root);
|
|
|
+ kfree(root);
|
|
|
+ }
|
|
|
out:
|
|
|
btrfs_end_transaction(trans, tree_root);
|
|
|
kfree(wc);
|
|
@@ -5255,6 +5137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
|
|
|
wc->stage = DROP_REFERENCE;
|
|
|
wc->update_ref = 0;
|
|
|
wc->keep_locks = 1;
|
|
|
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
|
|
|
|
|
|
while (1) {
|
|
|
wret = walk_down_tree(trans, root, path, wc);
|
|
@@ -5397,9 +5280,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
|
|
|
lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
|
|
|
while (1) {
|
|
|
int ret;
|
|
|
- spin_lock(&em_tree->lock);
|
|
|
+ write_lock(&em_tree->lock);
|
|
|
ret = add_extent_mapping(em_tree, em);
|
|
|
- spin_unlock(&em_tree->lock);
|
|
|
+ write_unlock(&em_tree->lock);
|
|
|
if (ret != -EEXIST) {
|
|
|
free_extent_map(em);
|
|
|
break;
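
The write_lock()/write_unlock() switch above assumes the extent_map tree's
lock is converted to an rwlock elsewhere in this series. Under that
assumption, lookups can share the read side while insertions such as
add_extent_mapping() take the exclusive side; a hedged sketch of the
read-side pattern (wrapper name hypothetical):

/*
 * Sketch only: lookups do not modify the tree, so they may run
 * concurrently under read_lock() while writers hold write_lock().
 */
static struct extent_map *lookup_mapping_shared(struct extent_map_tree *em_tree,
                                                u64 start, u64 len)
{
        struct extent_map *em;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        read_unlock(&em_tree->lock);

        return em;
}
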
|
|
@@ -6842,287 +6725,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-#if 0
|
|
|
-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
|
|
|
- struct btrfs_root *root,
|
|
|
- u64 objectid, u64 size)
|
|
|
-{
|
|
|
- struct btrfs_path *path;
|
|
|
- struct btrfs_inode_item *item;
|
|
|
- struct extent_buffer *leaf;
|
|
|
- int ret;
|
|
|
-
|
|
|
- path = btrfs_alloc_path();
|
|
|
- if (!path)
|
|
|
- return -ENOMEM;
|
|
|
-
|
|
|
- path->leave_spinning = 1;
|
|
|
- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
|
|
|
- if (ret)
|
|
|
- goto out;
|
|
|
-
|
|
|
- leaf = path->nodes[0];
|
|
|
- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
|
|
|
- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
|
|
|
- btrfs_set_inode_generation(leaf, item, 1);
|
|
|
- btrfs_set_inode_size(leaf, item, size);
|
|
|
- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
|
|
|
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
|
|
|
- btrfs_mark_buffer_dirty(leaf);
|
|
|
- btrfs_release_path(root, path);
|
|
|
-out:
|
|
|
- btrfs_free_path(path);
|
|
|
- return ret;
|
|
|
-}
|
|
|
-
|
|
|
-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
|
|
|
- struct btrfs_block_group_cache *group)
|
|
|
+/*
|
|
|
+ * checks to see if it's even possible to relocate this block group.
|
|
|
+ *
|
|
|
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
|
|
|
+ * ok to go ahead and try.
|
|
|
+ */
|
|
|
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
|
|
|
{
|
|
|
- struct inode *inode = NULL;
|
|
|
- struct btrfs_trans_handle *trans;
|
|
|
- struct btrfs_root *root;
|
|
|
- struct btrfs_key root_key;
|
|
|
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
|
|
|
- int err = 0;
|
|
|
+ struct btrfs_block_group_cache *block_group;
|
|
|
+ struct btrfs_space_info *space_info;
|
|
|
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
|
|
|
+ struct btrfs_device *device;
|
|
|
+ int full = 0;
|
|
|
+ int ret = 0;
|
|
|
|
|
|
- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
|
|
|
- root_key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
- root_key.offset = (u64)-1;
|
|
|
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
|
|
|
- if (IS_ERR(root))
|
|
|
- return ERR_CAST(root);
|
|
|
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
|
|
|
|
|
|
- trans = btrfs_start_transaction(root, 1);
|
|
|
- BUG_ON(!trans);
|
|
|
+ /* odd, couldn't find the block group, leave it alone */
|
|
|
+ if (!block_group)
|
|
|
+ return -1;
|
|
|
|
|
|
- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
|
|
|
- if (err)
|
|
|
+ /* no bytes used, we're good */
|
|
|
+ if (!btrfs_block_group_used(&block_group->item))
|
|
|
goto out;
|
|
|
|
|
|
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
|
|
|
- BUG_ON(err);
|
|
|
-
|
|
|
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
|
|
|
- group->key.offset, 0, group->key.offset,
|
|
|
- 0, 0, 0);
|
|
|
- BUG_ON(err);
|
|
|
-
|
|
|
- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
|
|
|
- if (inode->i_state & I_NEW) {
|
|
|
- BTRFS_I(inode)->root = root;
|
|
|
- BTRFS_I(inode)->location.objectid = objectid;
|
|
|
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
|
|
|
- BTRFS_I(inode)->location.offset = 0;
|
|
|
- btrfs_read_locked_inode(inode);
|
|
|
- unlock_new_inode(inode);
|
|
|
- BUG_ON(is_bad_inode(inode));
|
|
|
- } else {
|
|
|
- BUG_ON(1);
|
|
|
- }
|
|
|
- BTRFS_I(inode)->index_cnt = group->key.objectid;
|
|
|
-
|
|
|
- err = btrfs_orphan_add(trans, inode);
|
|
|
-out:
|
|
|
- btrfs_end_transaction(trans, root);
|
|
|
- if (err) {
|
|
|
- if (inode)
|
|
|
- iput(inode);
|
|
|
- inode = ERR_PTR(err);
|
|
|
- }
|
|
|
- return inode;
|
|
|
-}
|
|
|
-
|
|
|
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
|
|
|
-{
|
|
|
-
|
|
|
- struct btrfs_ordered_sum *sums;
|
|
|
- struct btrfs_sector_sum *sector_sum;
|
|
|
- struct btrfs_ordered_extent *ordered;
|
|
|
- struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
- struct list_head list;
|
|
|
- size_t offset;
|
|
|
- int ret;
|
|
|
- u64 disk_bytenr;
|
|
|
-
|
|
|
- INIT_LIST_HEAD(&list);
|
|
|
-
|
|
|
- ordered = btrfs_lookup_ordered_extent(inode, file_pos);
|
|
|
- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
|
|
|
-
|
|
|
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
|
|
|
- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
|
|
|
- disk_bytenr + len - 1, &list);
|
|
|
-
|
|
|
- while (!list_empty(&list)) {
|
|
|
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
|
|
|
- list_del_init(&sums->list);
|
|
|
-
|
|
|
- sector_sum = sums->sums;
|
|
|
- sums->bytenr = ordered->start;
|
|
|
+ space_info = block_group->space_info;
|
|
|
+ spin_lock(&space_info->lock);
|
|
|
|
|
|
- offset = 0;
|
|
|
- while (offset < sums->len) {
|
|
|
- sector_sum->bytenr += ordered->start - disk_bytenr;
|
|
|
- sector_sum++;
|
|
|
- offset += root->sectorsize;
|
|
|
- }
|
|
|
+ full = space_info->full;
|
|
|
|
|
|
- btrfs_add_ordered_sum(inode, ordered, sums);
|
|
|
+ /*
|
|
|
+ * if this is the last block group we have in this space, we can't
|
|
|
+ * relocate it unless we're able to allocate a new chunk below.
|
|
|
+ *
|
|
|
+ * Otherwise, we need to make sure we have room in the space to handle
|
|
|
+ * all of the extents from this block group. If we can, we're good.
|
|
|
+ */
|
|
|
+ if ((space_info->total_bytes != block_group->key.offset) &&
|
|
|
+ (space_info->bytes_used + space_info->bytes_reserved +
|
|
|
+ space_info->bytes_pinned + space_info->bytes_readonly +
|
|
|
+ btrfs_block_group_used(&block_group->item) <
|
|
|
+ space_info->total_bytes)) {
|
|
|
+ spin_unlock(&space_info->lock);
|
|
|
+ goto out;
|
|
|
}
|
|
|
- btrfs_put_ordered_extent(ordered);
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
|
|
|
-{
|
|
|
- struct btrfs_trans_handle *trans;
|
|
|
- struct btrfs_path *path;
|
|
|
- struct btrfs_fs_info *info = root->fs_info;
|
|
|
- struct extent_buffer *leaf;
|
|
|
- struct inode *reloc_inode;
|
|
|
- struct btrfs_block_group_cache *block_group;
|
|
|
- struct btrfs_key key;
|
|
|
- u64 skipped;
|
|
|
- u64 cur_byte;
|
|
|
- u64 total_found;
|
|
|
- u32 nritems;
|
|
|
- int ret;
|
|
|
- int progress;
|
|
|
- int pass = 0;
|
|
|
-
|
|
|
- root = root->fs_info->extent_root;
|
|
|
-
|
|
|
- block_group = btrfs_lookup_block_group(info, group_start);
|
|
|
- BUG_ON(!block_group);
|
|
|
-
|
|
|
- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
|
|
|
- (unsigned long long)block_group->key.objectid,
|
|
|
- (unsigned long long)block_group->flags);
|
|
|
-
|
|
|
- path = btrfs_alloc_path();
|
|
|
- BUG_ON(!path);
|
|
|
-
|
|
|
- reloc_inode = create_reloc_inode(info, block_group);
|
|
|
- BUG_ON(IS_ERR(reloc_inode));
|
|
|
-
|
|
|
- __alloc_chunk_for_shrink(root, block_group, 1);
|
|
|
- set_block_group_readonly(block_group);
|
|
|
-
|
|
|
- btrfs_start_delalloc_inodes(info->tree_root);
|
|
|
- btrfs_wait_ordered_extents(info->tree_root, 0);
|
|
|
-again:
|
|
|
- skipped = 0;
|
|
|
- total_found = 0;
|
|
|
- progress = 0;
|
|
|
- key.objectid = block_group->key.objectid;
|
|
|
- key.offset = 0;
|
|
|
- key.type = 0;
|
|
|
- cur_byte = key.objectid;
|
|
|
-
|
|
|
- trans = btrfs_start_transaction(info->tree_root, 1);
|
|
|
- btrfs_commit_transaction(trans, info->tree_root);
|
|
|
+ spin_unlock(&space_info->lock);
|
|
|
|
|
|
- mutex_lock(&root->fs_info->cleaner_mutex);
|
|
|
- btrfs_clean_old_snapshots(info->tree_root);
|
|
|
- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
|
|
|
- mutex_unlock(&root->fs_info->cleaner_mutex);
|
|
|
+ /*
|
|
|
+ * ok we don't have enough space, but maybe we have free space on our
|
|
|
+ * devices to allocate new chunks for relocation, so loop through our
|
|
|
+ * alloc devices and guess if we have enough space. However, if we
|
|
|
+ * were marked as full, then we know there aren't enough chunks, and we
|
|
|
+ * can just return.
|
|
|
+ */
|
|
|
+ ret = -1;
|
|
|
+ if (full)
|
|
|
+ goto out;
|
|
|
|
|
|
- trans = btrfs_start_transaction(info->tree_root, 1);
|
|
|
- btrfs_commit_transaction(trans, info->tree_root);
|
|
|
+ mutex_lock(&root->fs_info->chunk_mutex);
|
|
|
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
|
|
|
+ u64 min_free = btrfs_block_group_used(&block_group->item);
|
|
|
+ u64 dev_offset, max_avail;
|
|
|
|
|
|
- while (1) {
|
|
|
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
- if (ret < 0)
|
|
|
- goto out;
|
|
|
-next:
|
|
|
- leaf = path->nodes[0];
|
|
|
- nritems = btrfs_header_nritems(leaf);
|
|
|
- if (path->slots[0] >= nritems) {
|
|
|
- ret = btrfs_next_leaf(root, path);
|
|
|
- if (ret < 0)
|
|
|
- goto out;
|
|
|
- if (ret == 1) {
|
|
|
- ret = 0;
|
|
|
+ /*
|
|
|
+ * check to make sure we can actually find a chunk with enough
|
|
|
+ * space to fit our block group in.
|
|
|
+ */
|
|
|
+ if (device->total_bytes > device->bytes_used + min_free) {
|
|
|
+ ret = find_free_dev_extent(NULL, device, min_free,
|
|
|
+ &dev_offset, &max_avail);
|
|
|
+ if (!ret)
|
|
|
break;
|
|
|
- }
|
|
|
- leaf = path->nodes[0];
|
|
|
- nritems = btrfs_header_nritems(leaf);
|
|
|
- }
|
|
|
-
|
|
|
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
-
|
|
|
- if (key.objectid >= block_group->key.objectid +
|
|
|
- block_group->key.offset)
|
|
|
- break;
|
|
|
-
|
|
|
- if (progress && need_resched()) {
|
|
|
- btrfs_release_path(root, path);
|
|
|
- cond_resched();
|
|
|
- progress = 0;
|
|
|
- continue;
|
|
|
+ ret = -1;
|
|
|
}
|
|
|
- progress = 1;
|
|
|
-
|
|
|
- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
|
|
|
- key.objectid + key.offset <= cur_byte) {
|
|
|
- path->slots[0]++;
|
|
|
- goto next;
|
|
|
- }
|
|
|
-
|
|
|
- total_found++;
|
|
|
- cur_byte = key.objectid + key.offset;
|
|
|
- btrfs_release_path(root, path);
|
|
|
-
|
|
|
- __alloc_chunk_for_shrink(root, block_group, 0);
|
|
|
- ret = relocate_one_extent(root, path, &key, block_group,
|
|
|
- reloc_inode, pass);
|
|
|
- BUG_ON(ret < 0);
|
|
|
- if (ret > 0)
|
|
|
- skipped++;
|
|
|
-
|
|
|
- key.objectid = cur_byte;
|
|
|
- key.type = 0;
|
|
|
- key.offset = 0;
|
|
|
}
|
|
|
-
|
|
|
- btrfs_release_path(root, path);
|
|
|
-
|
|
|
- if (pass == 0) {
|
|
|
- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
|
|
|
- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
|
|
|
- }
|
|
|
-
|
|
|
- if (total_found > 0) {
|
|
|
- printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
|
|
|
- (unsigned long long)total_found, pass);
|
|
|
- pass++;
|
|
|
- if (total_found == skipped && pass > 2) {
|
|
|
- iput(reloc_inode);
|
|
|
- reloc_inode = create_reloc_inode(info, block_group);
|
|
|
- pass = 0;
|
|
|
- }
|
|
|
- goto again;
|
|
|
- }
|
|
|
-
|
|
|
- /* delete reloc_inode */
|
|
|
- iput(reloc_inode);
|
|
|
-
|
|
|
- /* unpin extents in this range */
|
|
|
- trans = btrfs_start_transaction(info->tree_root, 1);
|
|
|
- btrfs_commit_transaction(trans, info->tree_root);
|
|
|
-
|
|
|
- spin_lock(&block_group->lock);
|
|
|
- WARN_ON(block_group->pinned > 0);
|
|
|
- WARN_ON(block_group->reserved > 0);
|
|
|
- WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
|
|
|
- spin_unlock(&block_group->lock);
|
|
|
- btrfs_put_block_group(block_group);
|
|
|
- ret = 0;
|
|
|
+ mutex_unlock(&root->fs_info->chunk_mutex);
|
|
|
out:
|
|
|
- btrfs_free_path(path);
|
|
|
+ btrfs_put_block_group(block_group);
|
|
|
return ret;
|
|
|
}
|
|
|
-#endif
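
A sketch of how a caller might use btrfs_can_relocate() as a cheap
pre-check before committing to relocation. The wrapper name is
hypothetical, and it assumes the live btrfs_relocate_block_group()
implementation exists elsewhere (the copy removed above is the old,
disabled one):

/*
 * Hypothetical caller: a nonzero return from btrfs_can_relocate() means
 * the extents in this group have nowhere to go, so skip the expensive
 * relocation pass entirely.
 */
static int try_relocate_block_group(struct btrfs_root *root, u64 group_start)
{
        int ret;

        ret = btrfs_can_relocate(root, group_start);
        if (ret)
                return -ENOSPC;

        return btrfs_relocate_block_group(root, group_start);
}
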
|
|
|
|
|
|
static int find_first_block_group(struct btrfs_root *root,
|
|
|
struct btrfs_path *path, struct btrfs_key *key)
|
|
@@ -7165,8 +6847,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
|
|
|
{
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
struct btrfs_space_info *space_info;
|
|
|
+ struct btrfs_caching_control *caching_ctl;
|
|
|
struct rb_node *n;
|
|
|
|
|
|
+ down_write(&info->extent_commit_sem);
|
|
|
+ while (!list_empty(&info->caching_block_groups)) {
|
|
|
+ caching_ctl = list_entry(info->caching_block_groups.next,
|
|
|
+ struct btrfs_caching_control, list);
|
|
|
+ list_del(&caching_ctl->list);
|
|
|
+ put_caching_control(caching_ctl);
|
|
|
+ }
|
|
|
+ up_write(&info->extent_commit_sem);
|
|
|
+
|
|
|
spin_lock(&info->block_group_cache_lock);
|
|
|
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
|
|
|
block_group = rb_entry(n, struct btrfs_block_group_cache,
|
|
@@ -7180,8 +6872,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
|
|
|
up_write(&block_group->space_info->groups_sem);
|
|
|
|
|
|
if (block_group->cached == BTRFS_CACHE_STARTED)
|
|
|
- wait_event(block_group->caching_q,
|
|
|
- block_group_cache_done(block_group));
|
|
|
+ wait_block_group_cache_done(block_group);
|
|
|
|
|
|
btrfs_remove_free_space_cache(block_group);
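
put_caching_control() is introduced earlier in this patch and is not shown
here; the teardown above only makes sense if it follows the usual kernel
refcount pattern, roughly as sketched below (assuming the control structure
carries an atomic_t count):

/*
 * Assumed shape of the helper: the caching worker and every waiter hold a
 * reference, and the last put frees the structure.
 */
static void put_caching_control_sketch(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}
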
|
|
|
|
|
@@ -7251,7 +6942,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
|
|
|
spin_lock_init(&cache->lock);
|
|
|
spin_lock_init(&cache->tree_lock);
|
|
|
cache->fs_info = info;
|
|
|
- init_waitqueue_head(&cache->caching_q);
|
|
|
INIT_LIST_HEAD(&cache->list);
|
|
|
INIT_LIST_HEAD(&cache->cluster_list);
|
|
|
|
|
@@ -7273,8 +6963,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
|
|
|
cache->flags = btrfs_block_group_flags(&cache->item);
|
|
|
cache->sectorsize = root->sectorsize;
|
|
|
|
|
|
- remove_sb_from_cache(root, cache);
|
|
|
-
|
|
|
/*
|
|
|
* check for two cases, either we are full, and therefore
|
|
|
* don't need to bother with the caching work since we won't
|
|
@@ -7283,13 +6971,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
|
|
|
* time, particularly in the full case.
|
|
|
*/
|
|
|
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
|
|
|
+ exclude_super_stripes(root, cache);
|
|
|
+ cache->last_byte_to_unpin = (u64)-1;
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
|
|
+ free_excluded_extents(root, cache);
|
|
|
} else if (btrfs_block_group_used(&cache->item) == 0) {
|
|
|
+ exclude_super_stripes(root, cache);
|
|
|
+ cache->last_byte_to_unpin = (u64)-1;
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
|
|
add_new_free_space(cache, root->fs_info,
|
|
|
found_key.objectid,
|
|
|
found_key.objectid +
|
|
|
found_key.offset);
|
|
|
+ free_excluded_extents(root, cache);
|
|
|
}
|
|
|
|
|
|
ret = update_space_info(info, cache->flags, found_key.offset,
|
|
@@ -7297,6 +6991,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
|
|
|
&space_info);
|
|
|
BUG_ON(ret);
|
|
|
cache->space_info = space_info;
|
|
|
+ spin_lock(&cache->space_info->lock);
|
|
|
+ cache->space_info->bytes_super += cache->bytes_super;
|
|
|
+ spin_unlock(&cache->space_info->lock);
|
|
|
+
|
|
|
down_write(&space_info->groups_sem);
|
|
|
list_add_tail(&cache->list, &space_info->block_groups);
|
|
|
up_write(&space_info->groups_sem);
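
Both branches above rest on the same observation: a block group that is
completely used or completely empty has nothing for the background caching
thread to discover, so it can be marked finished on the spot. Restated as a
small predicate (name hypothetical, not part of the patch):

/*
 * Illustrative predicate for the fast-path check above.
 */
static inline int caching_is_trivial(struct btrfs_block_group_cache *cache,
                                     u64 group_size)
{
        u64 used = btrfs_block_group_used(&cache->item);

        return used == 0 || used == group_size;
}
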
|
|
@@ -7346,7 +7044,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
|
|
|
atomic_set(&cache->count, 1);
|
|
|
spin_lock_init(&cache->lock);
|
|
|
spin_lock_init(&cache->tree_lock);
|
|
|
- init_waitqueue_head(&cache->caching_q);
|
|
|
INIT_LIST_HEAD(&cache->list);
|
|
|
INIT_LIST_HEAD(&cache->cluster_list);
|
|
|
|
|
@@ -7355,15 +7052,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
|
|
|
cache->flags = type;
|
|
|
btrfs_set_block_group_flags(&cache->item, type);
|
|
|
|
|
|
+ cache->last_byte_to_unpin = (u64)-1;
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
|
|
- remove_sb_from_cache(root, cache);
|
|
|
+ exclude_super_stripes(root, cache);
|
|
|
|
|
|
add_new_free_space(cache, root->fs_info, chunk_offset,
|
|
|
chunk_offset + size);
|
|
|
|
|
|
+ free_excluded_extents(root, cache);
|
|
|
+
|
|
|
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
|
|
|
&cache->space_info);
|
|
|
BUG_ON(ret);
|
|
|
+
|
|
|
+ spin_lock(&cache->space_info->lock);
|
|
|
+ cache->space_info->bytes_super += cache->bytes_super;
|
|
|
+ spin_unlock(&cache->space_info->lock);
|
|
|
+
|
|
|
down_write(&cache->space_info->groups_sem);
|
|
|
list_add_tail(&cache->list, &cache->space_info->block_groups);
|
|
|
up_write(&cache->space_info->groups_sem);
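
The bytes_super roll-up here (and the matching one in
btrfs_read_block_groups() above) lets space accounting treat the excluded
super mirror ranges as spoken for. One way such accounting might be
consumed is sketched below; this is an assumption about downstream users,
not something this hunk adds, and callers would normally hold
space_info->lock:

/*
 * Rough estimate of allocatable space in a space_info, subtracting
 * everything already in use, pinned, reserved, read-only, or excluded
 * for super block mirrors.
 */
static u64 space_info_estimated_free(struct btrfs_space_info *info)
{
        return info->total_bytes - info->bytes_used - info->bytes_pinned -
               info->bytes_reserved - info->bytes_readonly -
               info->bytes_super;
}
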
|
|
@@ -7429,8 +7134,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|
|
up_write(&block_group->space_info->groups_sem);
|
|
|
|
|
|
if (block_group->cached == BTRFS_CACHE_STARTED)
|
|
|
- wait_event(block_group->caching_q,
|
|
|
- block_group_cache_done(block_group));
|
|
|
+ wait_block_group_cache_done(block_group);
|
|
|
|
|
|
btrfs_remove_free_space_cache(block_group);
|
|
|
|