
Merge http://oss.oracle.com/git/ocfs2

Linus Torvalds 19 years ago
parent
commit 29552b1462
100 changed files with 42277 additions and 19 deletions
  1. Documentation/filesystems/00-INDEX (+6 -0)
  2. Documentation/filesystems/configfs/configfs.txt (+434 -0)
  3. Documentation/filesystems/configfs/configfs_example.c (+474 -0)
  4. Documentation/filesystems/dlmfs.txt (+130 -0)
  5. Documentation/filesystems/ocfs2.txt (+55 -0)
  6. MAINTAINERS (+14 -0)
  7. drivers/block/loop.c (+18 -5)
  8. drivers/block/rd.c (+2 -2)
  9. fs/Kconfig (+55 -11)
  10. fs/Makefile (+2 -0)
  11. fs/configfs/Makefile (+7 -0)
  12. fs/configfs/configfs_internal.h (+142 -0)
  13. fs/configfs/dir.c (+1102 -0)
  14. fs/configfs/file.c (+360 -0)
  15. fs/configfs/inode.c (+162 -0)
  16. fs/configfs/item.c (+227 -0)
  17. fs/configfs/mount.c (+159 -0)
  18. fs/configfs/symlink.c (+281 -0)
  19. fs/mpage.c (+1 -1)
  20. fs/ocfs2/Makefile (+33 -0)
  21. fs/ocfs2/alloc.c (+2040 -0)
  22. fs/ocfs2/alloc.h (+82 -0)
  23. fs/ocfs2/aops.c (+643 -0)
  24. fs/ocfs2/aops.h (+41 -0)
  25. fs/ocfs2/buffer_head_io.c (+232 -0)
  26. fs/ocfs2/buffer_head_io.h (+73 -0)
  27. fs/ocfs2/cluster/Makefile (+4 -0)
  28. fs/ocfs2/cluster/endian.h (+30 -0)
  29. fs/ocfs2/cluster/heartbeat.c (+1797 -0)
  30. fs/ocfs2/cluster/heartbeat.h (+82 -0)
  31. fs/ocfs2/cluster/masklog.c (+166 -0)
  32. fs/ocfs2/cluster/masklog.h (+275 -0)
  33. fs/ocfs2/cluster/nodemanager.c (+791 -0)
  34. fs/ocfs2/cluster/nodemanager.h (+64 -0)
  35. fs/ocfs2/cluster/ocfs2_heartbeat.h (+37 -0)
  36. fs/ocfs2/cluster/ocfs2_nodemanager.h (+39 -0)
  37. fs/ocfs2/cluster/quorum.c (+315 -0)
  38. fs/ocfs2/cluster/quorum.h (+36 -0)
  39. fs/ocfs2/cluster/sys.c (+124 -0)
  40. fs/ocfs2/cluster/sys.h (+33 -0)
  41. fs/ocfs2/cluster/tcp.c (+1829 -0)
  42. fs/ocfs2/cluster/tcp.h (+113 -0)
  43. fs/ocfs2/cluster/tcp_internal.h (+174 -0)
  44. fs/ocfs2/cluster/ver.c (+42 -0)
  45. fs/ocfs2/cluster/ver.h (+31 -0)
  46. fs/ocfs2/dcache.c (+91 -0)
  47. fs/ocfs2/dcache.h (+31 -0)
  48. fs/ocfs2/dir.c (+618 -0)
  49. fs/ocfs2/dir.h (+54 -0)
  50. fs/ocfs2/dlm/Makefile (+8 -0)
  51. fs/ocfs2/dlm/dlmapi.h (+214 -0)
  52. fs/ocfs2/dlm/dlmast.c (+466 -0)
  53. fs/ocfs2/dlm/dlmcommon.h (+884 -0)
  54. fs/ocfs2/dlm/dlmconvert.c (+530 -0)
  55. fs/ocfs2/dlm/dlmconvert.h (+35 -0)
  56. fs/ocfs2/dlm/dlmdebug.c (+246 -0)
  57. fs/ocfs2/dlm/dlmdebug.h (+30 -0)
  58. fs/ocfs2/dlm/dlmdomain.c (+1469 -0)
  59. fs/ocfs2/dlm/dlmdomain.h (+36 -0)
  60. fs/ocfs2/dlm/dlmfs.c (+640 -0)
  61. fs/ocfs2/dlm/dlmfsver.c (+42 -0)
  62. fs/ocfs2/dlm/dlmfsver.h (+31 -0)
  63. fs/ocfs2/dlm/dlmlock.c (+676 -0)
  64. fs/ocfs2/dlm/dlmmaster.c (+2664 -0)
  65. fs/ocfs2/dlm/dlmrecovery.c (+2132 -0)
  66. fs/ocfs2/dlm/dlmthread.c (+692 -0)
  67. fs/ocfs2/dlm/dlmunlock.c (+672 -0)
  68. fs/ocfs2/dlm/dlmver.c (+42 -0)
  69. fs/ocfs2/dlm/dlmver.h (+31 -0)
  70. fs/ocfs2/dlm/userdlm.c (+658 -0)
  71. fs/ocfs2/dlm/userdlm.h (+111 -0)
  72. fs/ocfs2/dlmglue.c (+2904 -0)
  73. fs/ocfs2/dlmglue.h (+111 -0)
  74. fs/ocfs2/endian.h (+45 -0)
  75. fs/ocfs2/export.c (+248 -0)
  76. fs/ocfs2/export.h (+31 -0)
  77. fs/ocfs2/extent_map.c (+994 -0)
  78. fs/ocfs2/extent_map.h (+46 -0)
  79. fs/ocfs2/file.c (+1237 -0)
  80. fs/ocfs2/file.h (+57 -0)
  81. fs/ocfs2/heartbeat.c (+378 -0)
  82. fs/ocfs2/heartbeat.h (+67 -0)
  83. fs/ocfs2/inode.c (+1140 -0)
  84. fs/ocfs2/inode.h (+145 -0)
  85. fs/ocfs2/journal.c (+1652 -0)
  86. fs/ocfs2/journal.h (+457 -0)
  87. fs/ocfs2/localalloc.c (+983 -0)
  88. fs/ocfs2/localalloc.h (+56 -0)
  89. fs/ocfs2/mmap.c (+102 -0)
  90. fs/ocfs2/mmap.h (+6 -0)
  91. fs/ocfs2/namei.c (+2264 -0)
  92. fs/ocfs2/namei.h (+58 -0)
  93. fs/ocfs2/ocfs1_fs_compat.h (+109 -0)
  94. fs/ocfs2/ocfs2.h (+464 -0)
  95. fs/ocfs2/ocfs2_fs.h (+638 -0)
  96. fs/ocfs2/ocfs2_lockid.h (+73 -0)
  97. fs/ocfs2/slot_map.c (+303 -0)
  98. fs/ocfs2/slot_map.h (+66 -0)
  99. fs/ocfs2/suballoc.c (+1651 -0)
  100. fs/ocfs2/suballoc.h (+132 -0)

+ 6 - 0
Documentation/filesystems/00-INDEX

@@ -12,10 +12,14 @@ cifs.txt
 	- description of the CIFS filesystem
 coda.txt
 	- description of the CODA filesystem.
+configfs/
+	- directory containing configfs documentation and example code.
 cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
 	- directory containing devfs documentation.
+dlmfs.txt
+	- info on the userspace interface to the OCFS2 DLM.
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 hpfs.txt
@@ -30,6 +34,8 @@ ntfs.txt
 	- info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
 	- info on Linux's /proc filesystem.
+ocfs2.txt
+	- info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
 	- Description of the ROMFS filesystem.
 smbfs.txt

+ 434 - 0
Documentation/filesystems/configfs/configfs.txt

@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+	Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these tasks.
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, and the subsystem
+appears as a directory at the top of the configfs filesystem.  A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+	struct config_item {
+		char                    *ci_name;
+		char                    ci_namebuf[UOBJ_NAME_LEN];
+		struct kref             ci_kref;
+		struct list_head        ci_entry;
+		struct config_item      *ci_parent;
+		struct config_group     *ci_group;
+		struct config_item_type *ci_type;
+		struct dentry           *ci_dentry;
+	};
+
+	void config_item_init(struct config_item *);
+	void config_item_init_type_name(struct config_item *,
+					const char *name,
+					struct config_item_type *type);
+	struct config_item *config_item_get(struct config_item *);
+	void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
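+
+As a rough sketch (the some_item pointer and the do_something() helper
+here are hypothetical), a typical use looks like:
+
+	struct config_item *item;
+
+	item = config_item_get(some_item);	/* take a reference */
+	if (item) {
+		do_something(item);		/* use the item */
+		config_item_put(item);		/* drop the reference */
+	}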
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+	struct configfs_item_operations {
+		void (*release)(struct config_item *);
+		ssize_t (*show_attribute)(struct config_item *,
+					  struct configfs_attribute *,
+					  char *);
+		ssize_t (*store_attribute)(struct config_item *,
+					   struct configfs_attribute *,
+					   const char *, size_t);
+		int (*allow_link)(struct config_item *src,
+				  struct config_item *target);
+		int (*drop_link)(struct config_item *src,
+				 struct config_item *target);
+	};
+
+	struct config_item_type {
+		struct module                           *ct_owner;
+		struct configfs_item_operations         *ct_item_ops;
+		struct configfs_group_operations        *ct_group_ops;
+		struct configfs_attribute               **ct_attrs;
+	};
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
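+
+For instance, a dynamically allocated item embedded in a hypothetical
+container struct mything might release itself along these lines:
+
+	static void mything_release(struct config_item *item)
+	{
+		/* frees the container, and with it the config_item */
+		kfree(container_of(item, struct mything, item));
+	}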
+
+[struct configfs_attribute]
+
+	struct configfs_attribute {
+		char                    *ca_name;
+		struct module           *ca_owner;
+		mode_t                  ca_mode;
+	};
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
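+
+A sketch of a read-only attribute and the array that exposes it (all
+names here are hypothetical):
+
+	static struct configfs_attribute mything_attr_status = {
+		.ca_owner = THIS_MODULE,
+		.ca_name  = "status",
+		.ca_mode  = S_IRUGO,
+	};
+
+	static struct configfs_attribute *mything_attrs[] = {
+		&mything_attr_status,
+		NULL,
+	};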
+
+[struct config_group]
+
+A config_item cannot live in a vacuum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+	struct config_group {
+		struct config_item		cg_item;
+		struct list_head		cg_children;
+		struct configfs_subsystem 	*cg_subsys;
+		struct config_group		**default_groups;
+	};
+
+	void config_group_init(struct config_group *group);
+	void config_group_init_type_name(struct config_group *group,
+					 const char *name,
+					 struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+	struct configfs_group_operations {
+		struct config_item *(*make_item)(struct config_group *group,
+						 const char *name);
+		struct config_group *(*make_group)(struct config_group *group,
+						   const char *name);
+		int (*commit_item)(struct config_item *item);
+		void (*drop_item)(struct config_group *group,
+				  struct config_item *item);
+	};
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called
+from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
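+
+A minimal make_item() sketch, assuming a hypothetical container
+struct mything and its config_item_type mything_type:
+
+	static struct config_item *mything_make_item(struct config_group *group,
+						     const char *name)
+	{
+		struct mything *thing;
+
+		thing = kmalloc(sizeof(*thing), GFP_KERNEL);
+		if (!thing)
+			return NULL;
+		memset(thing, 0, sizeof(*thing));
+
+		/* the item now holds the initial reference */
+		config_item_init_type_name(&thing->item, name, &mything_type);
+		return &thing->item;
+	}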
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, there is no need for a separate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
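+
+The matching drop_item() for that hypothetical subsystem is just a
+reference drop; release() will free the container when the count
+reaches zero:
+
+	static void mything_drop_item(struct config_group *group,
+				      struct config_item *item)
+	{
+		config_item_put(item);	/* drop the allocation reference */
+	}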
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+	struct configfs_subsystem {
+		struct config_group	su_group;
+		struct semaphore	su_sem;
+	};
+
+	int configfs_register_subsystem(struct configfs_subsystem *subsys);
+	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+	A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+	When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
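+
+As a sketch, a statically defined subsystem (here the hypothetical
+mything_subsys, with ci_namebuf and ci_type already filled in) would
+be registered like this:
+
+	static int __init mything_init(void)
+	{
+		config_group_init(&mything_subsys.su_group);
+		init_MUTEX(&mything_subsys.su_sem);
+		return configfs_register_subsystem(&mything_subsys);
+	}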
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.   Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
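+
+A navigation sketch, where subsys and group are assumed to be the
+subsystem and one of its groups:
+
+	struct config_item *item;
+
+	down(&subsys->su_sem);
+	list_for_each_entry(item, &group->cg_children, ci_entry) {
+		/* examine item; never touch the filesystem parts */
+	}
+	up(&subsys->su_sem);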
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to target item, it returns 0.  A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
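+
+An allow_link() sketch that only accepts targets of its own type
+(mything_type is hypothetical):
+
+	static int mything_allow_link(struct config_item *src,
+				      struct config_item *target)
+	{
+		/* refuse to link to items of a foreign type */
+		if (target->ci_type != &mything_type)
+			return -EPERM;
+		return 0;
+	}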
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent) results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem that the parent group is going away,
+it also means that every default group child of that parent group is
+going away as well.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
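+
+A sketch of wiring one default group into a parent before the parent
+is registered or returned from make_group() (both groups here are
+hypothetical and initialized with the group _init() functions):
+
+	static struct config_group *mything_default_groups[] = {
+		&mything_state_group,
+		NULL,
+	};
+
+	mything_group.default_groups = mything_default_groups;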
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem receives the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shutdown, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+

+ 474 - 0
Documentation/filesystems/configfs/configfs_example.c

@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+	struct configfs_subsystem subsys;
+	int showme;
+	int storeme;
+};
+
+struct childless_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct childless *, char *);
+	ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+	return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	ssize_t pos;
+
+	pos = sprintf(page, "%d\n", childless->showme);
+	childless->showme++;
+
+	return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+	.show	= childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= childless_storeme_read,
+	.store	= childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+	.attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+	.show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+	&childless_attr_showme.attr,
+	&childless_attr_storeme.attr,
+	&childless_attr_description.attr,
+	NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+				   struct configfs_attribute *attr,
+				   char *page)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = 0;
+
+	if (childless_attr->show)
+		ret = childless_attr->show(childless, page);
+	return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    const char *page, size_t count)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (childless_attr->store)
+		ret = childless_attr->store(childless, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+	.show_attribute		= childless_attr_show,
+	.store_attribute	= childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+	.ct_item_ops	= &childless_item_ops,
+	.ct_attrs	= childless_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "01-childless",
+				.ci_type = &childless_type,
+			},
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+	struct config_item item;
+	int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "storeme",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+	&simple_child_attr_storeme,
+	NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	ssize_t count;
+	struct simple_child *simple_child = to_simple_child(item);
+
+	count = sprintf(page, "%d\n", simple_child->storeme);
+
+	return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+	kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+	.release		= simple_child_release,
+	.show_attribute		= simple_child_attr_show,
+	.store_attribute	= simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+	.ct_item_ops	= &simple_child_item_ops,
+	.ct_attrs	= simple_child_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+	struct simple_child *simple_child;
+
+	simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+	if (!simple_child)
+		return NULL;
+
+	memset(simple_child, 0, sizeof(struct simple_child));
+
+	config_item_init_type_name(&simple_child->item, name,
+				   &simple_child_type);
+
+	simple_child->storeme = 0;
+
+	return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+	&simple_children_attr_description,
+	NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+			   		 struct configfs_attribute *attr,
+			   		 char *page)
+{
+	return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+	.show_attribute	= simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+	.make_item	= simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+	.ct_item_ops	= &simple_children_item_ops,
+	.ct_group_ops	= &simple_children_group_ops,
+	.ct_attrs	= simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "02-simple-children",
+			.ci_type = &simple_children_type,
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+	struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+	struct simple_children *simple_children;
+
+	simple_children = kmalloc(sizeof(struct simple_children),
+				  GFP_KERNEL);
+	if (!simple_children)
+		return NULL;
+
+	memset(simple_children, 0, sizeof(struct simple_children));
+
+	config_group_init_type_name(&simple_children->group, name,
+				    &simple_children_type);
+
+	return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+	&group_children_attr_description,
+	NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+			   		struct configfs_attribute *attr,
+			   		char *page)
+{
+	return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+	.show_attribute	= group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+	.make_group	= group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+	.ct_item_ops	= &group_children_item_ops,
+	.ct_group_ops	= &group_children_group_ops,
+	.ct_attrs	= group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "03-group-children",
+			.ci_type = &group_children_type,
+		},
+	},
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+	&childless_subsys.subsys,
+	&simple_children_subsys,
+	&group_children_subsys,
+	NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+	int ret;
+	int i;
+	struct configfs_subsystem *subsys;
+
+	for (i = 0; example_subsys[i]; i++) {
+		subsys = example_subsys[i];
+
+		config_group_init(&subsys->su_group);
+		init_MUTEX(&subsys->su_sem);
+		ret = configfs_register_subsystem(subsys);
+		if (ret) {
+			printk(KERN_ERR "Error %d while registering subsystem %s\n",
+			       ret,
+			       subsys->su_group.cg_item.ci_namebuf);
+			goto out_unregister;
+		}
+	}
+
+	return 0;
+
+out_unregister:
+	/* unwind only the subsystems that registered successfully */
+	for (i--; i >= 0; i--) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+
+	return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+	int i;
+
+	for (i = 0; example_subsys[i]; i++) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");

+ 130 - 0
Documentation/filesystems/dlmfs.txt

@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy-to-set-up, easy-to-use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory).
+
+rmdir(2) signals dlmfs to leave the domain.
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory.  Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+Use open(2) with O_CREAT to ensure the resource inode is created -
+dlmfs does not automatically create inodes for existing lock resources.
+
+Open Flag     Lock Request Type
+---------     -----------------
+O_RDONLY      Shared Read
+O_RDWR        Exclusive
+
+Open Flag     Resulting Locking Behavior
+---------     --------------------------
+O_NONBLOCK    Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource, then open(2) will fail with errno set to
+ETXTBSY.
+
+close(2) drops the lock associated with your fd.
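+
+A trylock sketch in C (the /dlm mount point, domain, and lock names
+here are all assumed):
+
+	int fd = open("/dlm/mydomain/mylock",
+		      O_RDWR | O_CREAT | O_NONBLOCK, 0600);
+	if (fd < 0 && errno == ETXTBSY) {
+		/* valid trylock, but the resource was busy */
+	} else if (fd >= 0) {
+		/* Exclusive lock granted; the LVB is available via fd */
+		close(fd);	/* drops the lock */
+	}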
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes that obtain Shared
+Read or higher level locks on the resource.
+
+See Also
+========
+For more information on the VMS distributed locking API, see:
+
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf

+ 55 - 0
Documentation/filesystems/ocfs2.txt

@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+	- sparse files
+	- extended attributes
+	- shared writeable mmap
+	- loopback is supported, but data written will not
+	  be cluster coherent.
+	- quotas
+	- cluster aware flock
+	- Directory change notification (F_NOTIFY)
+	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	- POSIX ACLs
+	- readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1		This enables/disables barriers: barrier=0 disables
+			them, barrier=1 enables them.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
+errors=panic		Panic and halt the machine if an error occurs.
+intr		(*)	Allow signals to interrupt cluster operations.
+nointr			Do not allow signals to interrupt cluster
+			operations.

+ 14 - 0
MAINTAINERS

@@ -554,6 +554,11 @@ W:	http://us1.samba.org/samba/Linux_CIFS_client.html
 T:	git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:	Supported
 
+CONFIGFS
+P:	Joel Becker
+M:	joel.becker@oracle.com
+S:	Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:	Jeff Garzik
 M:	jgarzik@pobox.com
@@ -1898,6 +1903,15 @@ M:	ajoshi@shell.unixbox.com
 L:	linux-nvidia@lists.surfsouth.com
 S:	Maintained
 
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:	Mark Fasheh
+M:	mark.fasheh@oracle.com
+P:	Kurt Hackel
+M:	kurt.hackel@oracle.com
+L:	ocfs2-devel@oss.oracle.com
+W:	http://oss.oracle.com/projects/ocfs2/
+S:	Supported
+
 OLYMPIC NETWORK DRIVER
 P:	Peter De Shrijver
 M:	p2@ace.ulyssis.student.kuleuven.ac.be

+ 18 - 5
drivers/block/loop.c

@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
-	int len, ret = 0;
+	int len, ret;
 
 	down(&mapping->host->i_sem);
 	index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		page = grab_cache_page(mapping, index);
 		if (unlikely(!page))
 			goto fail;
-		if (unlikely(aops->prepare_write(file, page, offset,
-				offset + size)))
+		ret = aops->prepare_write(file, page, offset,
+					  offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		flush_dcache_page(page);
-		if (unlikely(aops->commit_write(file, page, offset,
-				offset + size)))
+		ret = aops->commit_write(file, page, offset,
+					 offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		if (unlikely(transfer_result))
 			goto unlock;
 		bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	ret = 0;
 out:
 	up(&mapping->host->i_sem);
 	return ret;

+ 2 - 2
drivers/block/rd.c

@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 
 /*
  * ->writepage to the the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it.  We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
  * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
 		make_page_uptodate(page);
 	SetPageDirty(page);
 	if (wbc->for_reclaim)
-		return WRITEPAGE_ACTIVATE;
+		return AOP_WRITEPAGE_ACTIVATE;
 	unlock_page(page);
 	return 0;
 }

+ 55 - 11
fs/Kconfig

@@ -70,6 +70,7 @@ config FS_XIP
 
 config EXT3_FS
 	tristate "Ext3 journalling file system support"
+	select JBD
 	help
 	  This is the journaling version of the Second extended file system
 	  (often called ext3), the de facto standard Linux file system
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
 	  extended attributes for file security labels, say N.
 
 config JBD
-# CONFIG_JBD could be its own option (even modular), but until there are
-# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
-# dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
 	tristate
-	default EXT3_FS
 	help
 	  This is a generic journaling layer for block devices.  It is
-	  currently used by the ext3 file system, but it could also be used to
-	  add journal support to other file systems or block devices such as
-	  RAID or LVM.
+	  currently used by the ext3 and OCFS2 file systems, but it could
+	  also be used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
 
-	  If you are using the ext3 file system, you need to say Y here. If
-	  you are not using ext3 then you will probably want to say N.
+	  If you are using the ext3 or OCFS2 file systems, you need to
+	  say Y here. If you are not using ext3 or OCFS2 then you will
+	  probably want to say N.
 
 	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 into the kernel, you cannot
-	  compile this code as a module.
+	  called jbd.  If you are compiling ext3 or OCFS2 into the kernel,
+	  you cannot compile this code as a module.
 
 config JBD_DEBUG
 	bool "JBD (ext3) debugging support"
@@ -326,6 +324,38 @@ config FS_POSIX_ACL
 
 source "fs/xfs/Kconfig"
 
+config OCFS2_FS
+	tristate "OCFS2 file system support (EXPERIMENTAL)"
+	depends on NET && EXPERIMENTAL
+	select CONFIGFS_FS
+	select JBD
+	select CRC32
+	select INET
+	help
+	  OCFS2 is a general purpose extent based shared disk cluster file
+	  system with many similarities to ext3. It supports 64 bit inode
+	  numbers, and has automatically extending metadata groups which may
+	  also make it attractive for non-clustered use.
+
+	  You'll want to install the ocfs2-tools package in order to at least
+	  get "mount.ocfs2".
+
+	  Project web page:    http://oss.oracle.com/projects/ocfs2
+	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+	  Note: Features which OCFS2 does not support yet:
+	          - extended attributes
+	          - shared writeable mmap
+	          - loopback is supported, but data written will not
+	            be cluster coherent.
+	          - quotas
+	          - cluster aware flock
+	          - Directory change notification (F_NOTIFY)
+	          - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	          - POSIX ACLs
+	          - readpages / writepages (not user visible)
+
 config MINIX_FS
 	tristate "Minix fs support"
 	help
@@ -841,6 +871,20 @@ config RELAYFS_FS
 
 	  If unsure, say N.
 
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"

+ 2 - 0
fs/Makefile

@@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
+obj-$(CONFIG_OCFS2_FS)		+= ocfs2/

+ 7 - 0
fs/configfs/Makefile

@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs.o
+
+configfs-objs	:= inode.o file.o dir.o symlink.o mount.o item.o

+ 142 - 0
fs/configfs/configfs_internal.h

@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+
+struct configfs_dirent {
+	atomic_t		s_count;
+	struct list_head	s_sibling;
+	struct list_head	s_children;
+	struct list_head	s_links;
+	void 			* s_element;
+	int			s_type;
+	umode_t			s_mode;
+	struct dentry		* s_dentry;
+};
+
+#define CONFIGFS_ROOT		0x0001
+#define CONFIGFS_DIR		0x0002
+#define CONFIGFS_ITEM_ATTR 	0x0004
+#define CONFIGFS_ITEM_LINK 	0x0020
+#define CONFIGFS_USET_DIR	0x0040
+#define CONFIGFS_USET_DEFAULT	0x0080
+#define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+
+extern struct vfsmount * configfs_mount;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(mode_t mode);
+extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+				struct dentry *, void *, umode_t, int);
+
+extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+
+extern int configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern struct super_block * configfs_sb;
+extern struct file_operations configfs_dir_operations;
+extern struct file_operations configfs_file_operations;
+extern struct file_operations bin_fops;
+extern struct inode_operations configfs_dir_inode_operations;
+extern struct inode_operations configfs_symlink_inode_operations;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+	struct list_head sl_list;
+	struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+				struct dentry *parent,
+				struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+	struct config_item * item = NULL;
+
+	spin_lock(&dcache_lock);
+	if (!d_unhashed(dentry)) {
+		struct configfs_dirent * sd = dentry->d_fsdata;
+		if (sd->s_type & CONFIGFS_ITEM_LINK) {
+			struct configfs_symlink * sl = sd->s_element;
+			item = config_item_get(sl->sl_target);
+		} else
+			item = config_item_get(sd->s_element);
+	}
+	spin_unlock(&dcache_lock);
+
+	return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+	if (!(sd->s_type & CONFIGFS_ROOT))
+		kfree(sd);
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+	if (sd) {
+		WARN_ON(!atomic_read(&sd->s_count));
+		atomic_inc(&sd->s_count);
+	}
+	return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+	WARN_ON(!atomic_read(&sd->s_count));
+	if (atomic_dec_and_test(&sd->s_count))
+		release_configfs_dirent(sd);
+}
+

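A note on the lifetime rules this header sets up: a configfs_dirent is born with s_count == 1, each dentry pointing at it via d_fsdata takes another reference, and the statically allocated CONFIGFS_ROOT entry is deliberately never freed. A minimal sketch of the intended pairing, assuming the caller already holds a live reference (configfs_get() WARNs if s_count has already hit zero); the helper name is hypothetical:

/* Illustration only: borrow and return a configfs_dirent reference. */
static void example_borrow_dirent(struct configfs_dirent *sd)
{
	struct configfs_dirent *ref;

	ref = configfs_get(sd);		/* s_count: n -> n + 1 */

	/* ... safely inspect ref->s_element and ref->s_type ... */

	configfs_put(ref);		/* s_count: n + 1 -> n; kfree()d
					 * at zero, except CONFIGFS_ROOT */
}
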
+ 1102 - 0
fs/configfs/dir.c

@@ -0,0 +1,1102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+
+static void configfs_d_iput(struct dentry * dentry,
+			    struct inode * inode)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+
+	if (sd) {
+		BUG_ON(sd->s_dentry != dentry);
+		sd->s_dentry = NULL;
+		configfs_put(sd);
+	}
+	iput(inode);
+}
+
+/*
+ * We _must_ delete our dentries on last dput, as the chain-to-parent
+ * behavior is required to clear the parents of default_groups.
+ */
+static int configfs_d_delete(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations configfs_dentry_ops = {
+	.d_iput		= configfs_d_iput,
+	/* simple_delete_dentry() isn't exported */
+	.d_delete	= configfs_d_delete,
+};
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+						void * element)
+{
+	struct configfs_dirent * sd;
+
+	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return NULL;
+
+	memset(sd, 0, sizeof(*sd));
+	atomic_set(&sd->s_count, 1);
+	INIT_LIST_HEAD(&sd->s_links);
+	INIT_LIST_HEAD(&sd->s_children);
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	sd->s_element = element;
+
+	return sd;
+}
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+			 struct dentry * dentry, void * element,
+			 umode_t mode, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = configfs_new_dirent(parent_sd, element);
+	if (!sd)
+		return -ENOMEM;
+
+	sd->s_mode = mode;
+	sd->s_type = type;
+	sd->s_dentry = dentry;
+	if (dentry) {
+		dentry->d_fsdata = configfs_get(sd);
+		dentry->d_op = &configfs_dentry_ops;
+	}
+
+	return 0;
+}
+
+static int init_dir(struct inode * inode)
+{
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inode->i_nlink++;
+	return 0;
+}
+
+static int init_file(struct inode * inode)
+{
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &configfs_file_operations;
+	return 0;
+}
+
+static int init_symlink(struct inode * inode)
+{
+	inode->i_op = &configfs_symlink_inode_operations;
+	return 0;
+}
+
+static int create_dir(struct config_item * k, struct dentry * p,
+		      struct dentry * d)
+{
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+
+	error = configfs_create(d, mode, init_dir);
+	if (!error) {
+		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+					   CONFIGFS_DIR);
+		if (!error) {
+			p->d_inode->i_nlink++;
+			(d)->d_op = &configfs_dentry_ops;
+		}
+	}
+	return error;
+}
+
+
+/**
+ *	configfs_create_dir - create a directory for a config_item.
+ *	@item:		config_item we're creating the directory for.
+ *	@dentry:	config_item's dentry.
+ */
+
+static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+{
+	struct dentry * parent;
+	int error = 0;
+
+	BUG_ON(!item);
+
+	if (item->ci_parent)
+		parent = item->ci_parent->ci_dentry;
+	else if (configfs_mount && configfs_mount->mnt_sb)
+		parent = configfs_mount->mnt_sb->s_root;
+	else
+		return -EFAULT;
+
+	error = create_dir(item,parent,dentry);
+	if (!error)
+		item->ci_dentry = dentry;
+	return error;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+			 struct dentry *parent,
+			 struct dentry *dentry)
+{
+	int err = 0;
+	umode_t mode = S_IFLNK | S_IRWXUGO;
+
+	err = configfs_create(dentry, mode, init_symlink);
+	if (!err) {
+		err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
+					 mode, CONFIGFS_ITEM_LINK);
+		if (!err)
+			dentry->d_op = &configfs_dentry_ops;
+	}
+	return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+	struct dentry * parent = dget(d->d_parent);
+	struct configfs_dirent * sd;
+
+	sd = d->d_fsdata;
+ 	list_del_init(&sd->s_sibling);
+	configfs_put(sd);
+	if (d->d_inode)
+		simple_rmdir(parent->d_inode,d);
+
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+		 atomic_read(&d->d_count));
+
+	dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove a config_item's directory.
+ * @item:	config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling it separately.
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+
+	if (!dentry)
+		return;
+
+	remove_dir(dentry);
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+
+/* Attach the attribute's configfs_dirent to the dentry corresponding
+ * to the attribute file.
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+	struct configfs_attribute * attr = sd->s_element;
+	int error;
+
+	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+	if (error)
+		return error;
+
+	dentry->d_op = &configfs_dentry_ops;
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
+	d_rehash(dentry);
+
+	return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       struct nameidata *nd)
+{
+	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+	struct configfs_dirent * sd;
+	int found = 0;
+	int err = 0;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED) {
+			const unsigned char * name = configfs_get_name(sd);
+
+			if (strcmp(name, dentry->d_name.name))
+				continue;
+
+			found = 1;
+			err = configfs_attach_attr(sd, dentry);
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 * If it doesn't exist and it isn't a NOT_PINNED item,
+		 * it must be negative.
+		 */
+		return simple_lookup(dir, dentry, nd);
+	}
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * on all children that are candidates for default detach.  If the
+ * result is clean, then configfs_detach_group() will handle dropping
+ * i_sem.  If there is an error, the caller will clean up the i_sem
+ * holders via configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+	int ret;
+
+	ret = -EBUSY;
+	if (!list_empty(&parent_sd->s_links))
+		goto out;
+
+	ret = 0;
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED)
+			continue;
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			down(&sd->s_dentry->d_inode->i_sem);
+			/* Mark that we've taken i_sem */
+			sd->s_type |= CONFIGFS_USET_DROPPING;
+
+			ret = configfs_detach_prep(sd->s_dentry);
+			if (!ret)
+			       	continue;
+		} else
+			ret = -ENOTEMPTY;
+
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			configfs_detach_rollback(sd->s_dentry);
+
+			if (sd->s_type & CONFIGFS_USET_DROPPING) {
+				sd->s_type &= ~CONFIGFS_USET_DROPPING;
+				up(&sd->s_dentry->d_inode->i_sem);
+			}
+		}
+	}
+}
+
+static void detach_attrs(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+	struct configfs_dirent * parent_sd;
+	struct configfs_dirent * sd, * tmp;
+
+	if (!dentry)
+		return;
+
+	pr_debug("configfs %s: dropping attrs for dir\n",
+		 dentry->d_name.name);
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		list_del_init(&sd->s_sibling);
+		configfs_drop_dentry(sd, dentry);
+		configfs_put(sd);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct configfs_attribute *attr;
+	int error = 0;
+	int i;
+
+	if (!t)
+		return -EINVAL;
+	if (t->ct_attrs) {
+		for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+			if ((error = configfs_create_file(item, attr)))
+				break;
+		}
+	}
+
+	if (error)
+		detach_attrs(item);
+
+	return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+	struct dentry * dentry = dget(group->cg_item.ci_dentry);
+	struct dentry *child;
+	struct configfs_dirent *parent_sd;
+	struct configfs_dirent *sd, *tmp;
+
+	if (!dentry)
+		return;
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    !(sd->s_type & CONFIGFS_USET_DEFAULT))
+			continue;
+
+		child = sd->s_dentry;
+
+		configfs_detach_group(sd->s_element);
+		child->d_inode->i_flags |= S_DEAD;
+
+		/*
+		 * From rmdir/unregister, a configfs_detach_prep() pass
+		 * has taken our i_sem for us.  Drop it.
+		 * From mkdir/register cleanup, there is no sem held.
+		 */
+		if (sd->s_type & CONFIGFS_USET_DROPPING)
+			up(&child->d_inode->i_sem);
+
+		d_delete(child);
+		dput(child);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attaches it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+				struct config_group *group)
+{
+	int ret;
+	struct qstr name;
+	struct configfs_dirent *sd;
+	/* We trust the caller holds a reference to parent */
+	struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	ret = -ENOMEM;
+	child = d_alloc(parent, &name);
+	if (child) {
+		d_add(child, NULL);
+
+		ret = configfs_attach_group(&parent_group->cg_item,
+					    &group->cg_item, child);
+		if (!ret) {
+			sd = child->d_fsdata;
+			sd->s_type |= CONFIGFS_USET_DEFAULT;
+		} else {
+			d_delete(child);
+			dput(child);
+		}
+	}
+
+	return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+	struct config_group *new_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	int ret = 0;
+	int i;
+
+	if (group && group->default_groups) {
+		/* FYI, we're faking mkdir here
+		 * I'm not sure we need this semaphore, as we're called
+		 * from our parent's mkdir.  That holds our parent's
+		 * i_sem, so afaik lookup cannot continue through our
+		 * parent to find us, let alone mess with our tree.
+		 * That said, taking our i_sem is closer to mkdir
+		 * emulation, and shouldn't hurt. */
+		down(&dentry->d_inode->i_sem);
+
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+
+			ret = create_default_group(group, new_group);
+			if (ret)
+				break;
+		}
+
+		up(&dentry->d_inode->i_sem);
+	}
+
+	if (ret)
+		detach_groups(group);
+
+	return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_sem is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+	struct config_group *group;
+
+	group = item->ci_group;
+	if (group) {
+		list_del_init(&item->ci_entry);
+
+		item->ci_group = NULL;
+		item->ci_parent = NULL;
+		config_item_put(item);
+
+		config_group_put(group);
+	}
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+	/* Parent seems redundant with group, but it makes certain
+	 * traversals much nicer. */
+	item->ci_parent = parent_item;
+	item->ci_group = config_group_get(to_config_group(parent_item));
+	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+
+	config_item_get(item);
+}
+
+static void unlink_group(struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			unlink_group(new_group);
+		}
+	}
+
+	group->cg_subsys = NULL;
+	unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+	struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+
+	link_obj(&parent_group->cg_item, &group->cg_item);
+
+	if (parent_group->cg_subsys)
+		subsys = parent_group->cg_subsys;
+	else if (configfs_is_root(&parent_group->cg_item))
+		subsys = to_configfs_subsystem(group);
+	else
+		BUG();
+	group->cg_subsys = subsys;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			link_group(group, new_group);
+		}
+	}
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+				struct config_item *item,
+				struct dentry *dentry)
+{
+	int ret;
+
+	ret = configfs_create_dir(item, dentry);
+	if (!ret) {
+		ret = populate_attrs(item);
+		if (ret) {
+			configfs_remove_dir(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_item(struct config_item *item)
+{
+	detach_attrs(item);
+	configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry)
+{
+	int ret;
+	struct configfs_dirent *sd;
+
+	ret = configfs_attach_item(parent_item, item, dentry);
+	if (!ret) {
+		sd = dentry->d_fsdata;
+		sd->s_type |= CONFIGFS_USET_DIR;
+
+		ret = populate_groups(to_config_group(item));
+		if (ret) {
+			configfs_detach_item(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_group(struct config_item *item)
+{
+	detach_groups(to_config_group(item));
+	configfs_detach_item(item);
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group()
+ * This function assumes that reference is held on item
+ * and that item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+			     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->drop_item)
+		type->ct_group_ops->drop_item(to_config_group(parent_item),
+						item);
+	else
+		config_item_put(item);
+}
+
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int ret;
+	struct config_group *group;
+	struct config_item *item;
+	struct config_item *parent_item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct config_item_type *type;
+	struct module *owner;
+	char *name;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_parent->d_fsdata;
+	if (!(sd->s_type & CONFIGFS_USET_DIR))
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!type || !type->ct_group_ops ||
+	    (!type->ct_group_ops->make_group &&
+	     !type->ct_group_ops->make_item)) {
+		config_item_put(parent_item);
+		return -EPERM;  /* What lack-of-mkdir returns */
+	}
+
+	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+	down(&subsys->su_sem);
+	group = NULL;
+	item = NULL;
+	if (type->ct_group_ops->make_group) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (group) {
+			link_group(to_config_group(parent_item), group);
+			item = &group->cg_item;
+		}
+	} else {
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (item)
+			link_obj(parent_item, item);
+	}
+	up(&subsys->su_sem);
+
+	kfree(name);
+	if (!item) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+
+	ret = -EINVAL;
+	type = item->ci_type;
+	if (type) {
+		owner = type->ct_owner;
+		if (try_module_get(owner)) {
+			if (group) {
+				ret = configfs_attach_group(parent_item,
+							    item,
+							    dentry);
+			} else {
+				ret = configfs_attach_item(parent_item,
+							   item,
+							   dentry);
+			}
+
+			if (ret) {
+				down(&subsys->su_sem);
+				if (group)
+					unlink_group(group);
+				else
+					unlink_obj(item);
+				client_drop_item(parent_item, item);
+				up(&subsys->su_sem);
+
+				config_item_put(parent_item);
+				module_put(owner);
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct config_item *parent_item;
+	struct config_item *item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct module *owner = NULL;
+	int ret;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_fsdata;
+	if (sd->s_type & CONFIGFS_USET_DEFAULT)
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!parent_item->ci_type) {
+		config_item_put(parent_item);
+		return -EINVAL;
+	}
+
+	ret = configfs_detach_prep(dentry);
+	if (ret) {
+		configfs_detach_rollback(dentry);
+		config_item_put(parent_item);
+		return ret;
+	}
+
+	item = configfs_get_config_item(dentry);
+
+	/* Drop reference from above, item already holds one. */
+	config_item_put(parent_item);
+
+	if (item->ci_type)
+		owner = item->ci_type->ct_owner;
+
+	if (sd->s_type & CONFIGFS_USET_DIR) {
+		configfs_detach_group(item);
+
+		down(&subsys->su_sem);
+		unlink_group(to_config_group(item));
+	} else {
+		configfs_detach_item(item);
+
+		down(&subsys->su_sem);
+		unlink_obj(item);
+	}
+
+	client_drop_item(parent_item, item);
+	up(&subsys->su_sem);
+
+	/* Drop our reference from above */
+	config_item_put(item);
+
+	module_put(owner);
+
+	return 0;
+}
+
+struct inode_operations configfs_dir_inode_operations = {
+	.mkdir		= configfs_mkdir,
+	.rmdir		= configfs_rmdir,
+	.symlink	= configfs_symlink,
+	.unlink		= configfs_unlink,
+	.lookup		= configfs_lookup,
+};
+
+#if 0
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+	int error = 0;
+	struct dentry * new_dentry, * parent;
+
+	if (!strcmp(config_item_name(item), new_name))
+		return -EINVAL;
+
+	if (!item->parent)
+		return -EINVAL;
+
+	down_write(&configfs_rename_sem);
+	parent = item->parent->dentry;
+
+	down(&parent->d_inode->i_sem);
+
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+	if (!IS_ERR(new_dentry)) {
+  		if (!new_dentry->d_inode) {
+			error = config_item_set_name(item, "%s", new_name);
+			if (!error) {
+				d_add(new_dentry, NULL);
+				d_move(item->dentry, new_dentry);
+			}
+			else
+				d_delete(new_dentry);
+		} else
+			error = -EEXIST;
+		dput(new_dentry);
+	}
+	up(&parent->d_inode->i_sem);
+	up_write(&configfs_rename_sem);
+
+	return error;
+}
+#endif
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+
+	down(&dentry->d_inode->i_sem);
+	file->private_data = configfs_new_dirent(parent_sd, NULL);
+	up(&dentry->d_inode->i_sem);
+
+	return file->private_data ? 0 : -ENOMEM;
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * cursor = file->private_data;
+
+	down(&dentry->d_inode->i_sem);
+	list_del_init(&cursor->s_sibling);
+	up(&dentry->d_inode->i_sem);
+
+	release_configfs_dirent(cursor);
+
+	return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+	return (sd->s_mode >> 12) & 15;
+}
+
+static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *cursor = filp->private_data;
+	struct list_head *p, *q = &cursor->s_sibling;
+	ino_t ino;
+	int i = filp->f_pos;
+
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			if (filp->f_pos == 2) {
+				list_del(q);
+				list_add(q, &parent_sd->s_children);
+			}
+			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
+				struct configfs_dirent *next;
+				const char * name;
+				int len;
+
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (!next->s_element)
+					continue;
+
+				name = configfs_get_name(next);
+				len = strlen(name);
+				if (next->s_dentry)
+					ino = next->s_dentry->d_inode->i_ino;
+				else
+					ino = iunique(configfs_sb, 2);
+
+				if (filldir(dirent, name, len, filp->f_pos, ino,
+						 dt_type(next)) < 0)
+					return 0;
+
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+	}
+	return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+{
+	struct dentry * dentry = file->f_dentry;
+
+	down(&dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			struct configfs_dirent *cursor = file->private_data;
+			struct list_head *p;
+			loff_t n = file->f_pos - 2;
+
+			list_del(&cursor->s_sibling);
+			p = sd->s_children.next;
+			while (n && p != &sd->s_children) {
+				struct configfs_dirent *next;
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (next->s_element)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->s_sibling, p);
+		}
+	}
+	up(&dentry->d_inode->i_sem);
+	return offset;
+}
+
+struct file_operations configfs_dir_operations = {
+	.open		= configfs_dir_open,
+	.release	= configfs_dir_close,
+	.llseek		= configfs_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+	int err;
+	struct config_group *group = &subsys->su_group;
+	struct qstr name;
+	struct dentry *dentry;
+	struct configfs_dirent *sd;
+
+	err = configfs_pin_fs();
+	if (err)
+		return err;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	sd = configfs_sb->s_root->d_fsdata;
+	link_group(to_config_group(sd->s_element), group);
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	err = -ENOMEM;
+	dentry = d_alloc(configfs_sb->s_root, &name);
+	if (!dentry)
+		goto out_release;
+
+	d_add(dentry, NULL);
+
+	err = configfs_attach_group(sd->s_element, &group->cg_item,
+				    dentry);
+	if (!err)
+		dentry = NULL;
+	else
+		d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	if (dentry) {
+	    dput(dentry);
+out_release:
+	    unlink_group(group);
+	    configfs_release_fs();
+	}
+
+	return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+
+	if (dentry->d_parent != configfs_sb->s_root) {
+		printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+		return;
+	}
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+	down(&dentry->d_inode->i_sem);
+	if (configfs_detach_prep(dentry)) {
+		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+	}
+	configfs_detach_group(&group->cg_item);
+	dentry->d_inode->i_flags |= S_DEAD;
+	up(&dentry->d_inode->i_sem);
+
+	d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	dput(dentry);
+
+	unlink_group(group);
+	configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);

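For orientation, the client side of the mkdir path above is worth sketching: configfs_mkdir() looks up ct_group_ops->make_item on the parent's type, links the result, and attaches it to the dcache, while configfs_register_subsystem() plants the top-level group. A hedged, minimal sketch under the API in this commit — all example_* identifiers are hypothetical, and the configfs_example.c shipped with this change is the authoritative version:

#include <linux/configfs.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <asm/semaphore.h>

static void example_item_release(struct config_item *item)
{
	kfree(item);			/* matches kmalloc() in make_item */
}

static struct configfs_item_operations example_item_ops = {
	.release	= example_item_release,
};

static struct config_item_type example_item_type = {
	.ct_item_ops	= &example_item_ops,
	.ct_owner	= THIS_MODULE,
};

/* Called from configfs_mkdir() with subsys->su_sem held. */
static struct config_item *example_make_item(struct config_group *group,
					     const char *name)
{
	struct config_item *item;

	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item)
		return NULL;
	memset(item, 0, sizeof(*item));

	config_item_init_type_name(item, name, &example_item_type);
	return item;
}

static struct configfs_group_operations example_group_ops = {
	.make_item	= example_make_item,
};

static struct config_item_type example_group_type = {
	.ct_group_ops	= &example_group_ops,
	.ct_owner	= THIS_MODULE,
};

static struct configfs_subsystem example_subsys = {
	.su_group = {
		.cg_item = {
			.ci_namebuf	= "example",
			.ci_type	= &example_group_type,
		},
	},
};

static int __init example_init(void)
{
	config_group_init(&example_subsys.su_group);
	init_MUTEX(&example_subsys.su_sem);
	return configfs_register_subsystem(&example_subsys);
}

After registration, `mkdir /config/example/foo` reaches example_make_item(), and rmdir(2) unwinds through configfs_rmdir() to the release() above.
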
+ 360 - 0
fs/configfs/file.c

@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+
+struct configfs_buffer {
+	size_t			count;
+	loff_t			pos;
+	char			* page;
+	struct configfs_item_operations	* ops;
+	struct semaphore	sem;
+	int			needs_read_fill;
+};
+
+
+/**
+ *	fill_read_buffer - allocate and fill buffer from item.
+ *	@dentry:	dentry pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Allocate @buffer->page, if it hasn't been already, then call the
+ *	config_item's show() method to fill the buffer with this attribute's
+ *	data.
+ *	This is called only once, on the file's first read.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+	int ret = 0;
+	ssize_t count;
+
+	if (!buffer->page)
+		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	count = ops->show_attribute(item,attr,buffer->page);
+	buffer->needs_read_fill = 0;
+	BUG_ON(count > (ssize_t)PAGE_SIZE);
+	if (count >= 0)
+		buffer->count = count;
+	else
+		ret = count;
+	return ret;
+}
+
+
+/**
+ *	flush_read_buffer - push buffer to userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	user-passed buffer.
+ *	@count:		number of bytes requested.
+ *	@ppos:		file position.
+ *
+ *	Copy the buffer we filled in fill_read_buffer() to userspace.
+ *	This is done at the reader's leisure, copying and advancing
+ *	the amount they specify each time.
+ *	This may be called continuously until the buffer is empty.
+ */
+static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
+			     size_t count, loff_t * ppos)
+{
+	int error;
+
+	if (*ppos > buffer->count)
+		return 0;
+
+	if (count > (buffer->count - *ppos))
+		count = buffer->count - *ppos;
+
+	error = copy_to_user(buf,buffer->page + *ppos,count);
+	if (!error)
+		*ppos += count;
+	return error ? -EFAULT : count;
+}
+
+/**
+ *	configfs_read_file - read an attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read an attribute file. The attribute descriptor
+ *	is in the file's ->d_fsdata. The target item is in the directory's
+ *	->d_fsdata.
+ *
+ *	We call fill_read_buffer() to allocate and fill the buffer from the
+ *	item's show() method exactly once (if the read is happening from
+ *	the beginning of the file). That should fill the entire buffer with
+ *	all the data the item has to offer for that attribute.
+ *	We then call flush_read_buffer() to copy the buffer to userspace
+ *	in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t retval = 0;
+
+	down(&buffer->sem);
+	if (buffer->needs_read_fill) {
+		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
+			goto out;
+	}
+	pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
+		 __FUNCTION__,count,*ppos,buffer->page);
+	retval = flush_read_buffer(buffer,buf,count,ppos);
+out:
+	up(&buffer->sem);
+	return retval;
+}
+
+
+/**
+ *	fill_write_buffer - copy buffer from userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	data from user.
+ *	@count:		number of bytes in @userbuf.
+ *
+ *	Allocate @buffer->page if it hasn't been already, then
+ *	copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+	int error;
+
+	if (!buffer->page)
+		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+	error = copy_from_user(buffer->page,buf,count);
+	buffer->needs_read_fill = 1;
+	return error ? -EFAULT : count;
+}
+
+
+/**
+ *	flush_write_buffer - push buffer to config_item.
+ *	@file:		file pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Get the correct pointers for the config_item and the attribute we're
+ *	dealing with, then call the store() method for the attribute,
+ *	passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+
+	return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *	configfs_write_file - write an attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Similar to configfs_read_file(), though working in the opposite direction.
+ *	We allocate and fill the data from the user in fill_write_buffer(),
+ *	then push it to the config_item in flush_write_buffer().
+ *	There is no easy way for us to know if userspace is only doing a partial
+ *	write, so we don't support them. We expect the entire buffer to come
+ *	on the first write.
+ *	Hint: if you're writing a value, first read the file, modify only the
+ *	value you're changing, then write the entire buffer back.
+ */
+
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+
+	down(&buffer->sem);
+	count = fill_write_buffer(buffer,buf,count);
+	if (count > 0)
+		count = flush_write_buffer(file->f_dentry,buffer,count);
+	if (count > 0)
+		*ppos += count;
+	up(&buffer->sem);
+	return count;
+}
+
+static int check_perm(struct inode * inode, struct file * file)
+{
+	struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(file->f_dentry);
+	struct configfs_buffer * buffer;
+	struct configfs_item_operations * ops = NULL;
+	int error = 0;
+
+	if (!item || !attr)
+		goto Einval;
+
+	/* Grab the module reference for this attribute if we have one */
+	if (!try_module_get(attr->ca_owner)) {
+		error = -ENODEV;
+		goto Done;
+	}
+
+	if (item->ci_type)
+		ops = item->ci_type->ct_item_ops;
+	else
+		goto Eaccess;
+
+	/* File needs write support.
+	 * The inode's perms must say it's ok,
+	 * and we must have a store method.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+
+		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+			goto Eaccess;
+
+	}
+
+	/* File needs read support.
+	 * The inode's perms must say it's ok, and there
+	 * must be a show method for it.
+	 */
+	if (file->f_mode & FMODE_READ) {
+		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+			goto Eaccess;
+	}
+
+	/* No error? Great, allocate a buffer for the file, and store it
+	 * in file->private_data for easy access.
+	 */
+	buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (buffer) {
+		memset(buffer,0,sizeof(struct configfs_buffer));
+		init_MUTEX(&buffer->sem);
+		buffer->needs_read_fill = 1;
+		buffer->ops = ops;
+		file->private_data = buffer;
+	} else
+		error = -ENOMEM;
+	goto Done;
+
+ Einval:
+	error = -EINVAL;
+	goto Done;
+ Eaccess:
+	error = -EACCES;
+	module_put(attr->ca_owner);
+ Done:
+	if (error && item)
+		config_item_put(item);
+	return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+	return check_perm(inode,filp);
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+	struct config_item * item = to_item(filp->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(filp->f_dentry);
+	struct module * owner = attr->ca_owner;
+	struct configfs_buffer * buffer = filp->private_data;
+
+	if (item)
+		config_item_put(item);
+	/* After this point, attr should not be accessed. */
+	module_put(owner);
+
+	if (buffer) {
+		if (buffer->page)
+			free_page((unsigned long)buffer->page);
+		kfree(buffer);
+	}
+	return 0;
+}
+
+struct file_operations configfs_file_operations = {
+	.read		= configfs_read_file,
+	.write		= configfs_write_file,
+	.llseek		= generic_file_llseek,
+	.open		= configfs_open_file,
+	.release	= configfs_release,
+};
+
+
+int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
+{
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
+
+	down(&dir->d_inode->i_sem);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
+	up(&dir->d_inode->i_sem);
+
+	return error;
+}
+
+
+/**
+ *	configfs_create_file - create an attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@attr:	attribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+	BUG_ON(!item || !item->ci_dentry || !attr);
+
+	return configfs_add_file(item->ci_dentry, attr,
+				 CONFIGFS_ITEM_ATTR);
+}
+

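A hedged sketch of what the show/store dispatch above expects from a client: the owning item's type provides show_attribute()/store_attribute(), and each configfs_attribute contributes one file whose ca_mode must agree with those methods (check_perm() enforces both). All example_* names are illustrative:

#include <linux/configfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stat.h>

static ssize_t example_show_attribute(struct config_item *item,
				      struct configfs_attribute *attr,
				      char *page)
{
	/* At most one page; fill_read_buffer() BUGs on more. */
	return sprintf(page, "%d\n", 42);
}

static ssize_t example_store_attribute(struct config_item *item,
				       struct configfs_attribute *attr,
				       const char *page, size_t count)
{
	/* The whole value arrives in one write(2); parse 'page' here. */
	return count;
}

static struct configfs_attribute example_attr_value = {
	.ca_owner	= THIS_MODULE,
	.ca_name	= "value",
	.ca_mode	= S_IRUGO | S_IWUSR,
};

static struct configfs_attribute *example_attrs[] = {
	&example_attr_value,
	NULL,			/* populate_attrs() stops at NULL */
};

static struct configfs_item_operations example_attr_item_ops = {
	.show_attribute		= example_show_attribute,
	.store_attribute	= example_store_attribute,
};

static struct config_item_type example_attr_type = {
	.ct_item_ops	= &example_attr_item_ops,
	.ct_attrs	= example_attrs,
	.ct_owner	= THIS_MODULE,
};

From userspace, the read-modify-write hint in configfs_write_file() then means: read the whole attribute, edit the value, lseek back to offset zero, and write the entire buffer in a single write(2).
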
+ 162 - 0
fs/configfs/inode.c

@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs/configfs.txt for more information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+extern struct super_block * configfs_sb;
+
+static struct address_space_operations configfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+static struct backing_dev_info configfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+struct inode * configfs_new_inode(mode_t mode)
+{
+	struct inode * inode = new_inode(configfs_sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = 0;
+		inode->i_gid = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->a_ops = &configfs_aops;
+		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+	}
+	return inode;
+}
+
+int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+{
+	int error = 0;
+	struct inode * inode = NULL;
+	if (dentry) {
+		if (!dentry->d_inode) {
+			if ((inode = configfs_new_inode(mode))) {
+				if (dentry->d_parent && dentry->d_parent->d_inode) {
+					struct inode *p_inode = dentry->d_parent->d_inode;
+					p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+				}
+				goto Proceed;
+			}
+			else
+				error = -ENOMEM;
+		} else
+			error = -EEXIST;
+	} else
+		error = -ENOENT;
+	goto Done;
+
+ Proceed:
+	if (init)
+		error = init(inode);
+	if (!error) {
+		d_instantiate(dentry, inode);
+		if (S_ISDIR(mode) || S_ISLNK(mode))
+			dget(dentry);  /* pin link and directory dentries in core */
+	} else
+		iput(inode);
+ Done:
+	return error;
+}
+
+/*
+ * Get the name of the element represented by the given configfs_dirent.
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+	struct attribute * attr;
+
+	if (!sd || !sd->s_element)
+		BUG();
+
+	/* These always have a dentry, so use that */
+	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+		return sd->s_dentry->d_name.name;
+
+	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+		attr = sd->s_element;
+		return attr->name;
+	}
+	return NULL;
+}
+
+
+/*
+ * Unhashes the dentry corresponding to the given configfs_dirent.
+ * Called with the parent inode's i_sem held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+	struct dentry * dentry = sd->s_dentry;
+
+	if (dentry) {
+		spin_lock(&dcache_lock);
+		if (!(d_unhashed(dentry) && dentry->d_inode)) {
+			dget_locked(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dcache_lock);
+			simple_unlink(parent->d_inode, dentry);
+		} else
+			spin_unlock(&dcache_lock);
+	}
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+	struct configfs_dirent * sd;
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+
+	down(&dir->d_inode->i_sem);
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element)
+			continue;
+		if (!strcmp(configfs_get_name(sd), name)) {
+			list_del_init(&sd->s_sibling);
+			configfs_drop_dentry(sd, dir);
+			configfs_put(sd);
+			break;
+		}
+	}
+	up(&dir->d_inode->i_sem);
+}
+
+

+ 227 - 0
fs/configfs/item.c

@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on kobject:
+ * 	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item * to_item(struct list_head * entry)
+{
+	return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Forward declaration for the kref release callback defined below. */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ */
+void config_item_init(struct config_item * item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@name:	name.
+ *
+ *	If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *	dynamically allocated string that @item->ci_name points to.
+ *	Otherwise, use the static @item->ci_namebuf array.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char * name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args,fmt);
+	need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit,GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args,fmt);
+		need = vsnprintf(name,limit,fmt,args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
+
+void config_item_init_type_name(struct config_item *item,
+				const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, name);
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, name);
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item * config_item_get(struct config_item * item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+
+/**
+ *	config_item_cleanup - free config_item resources.
+ *	@item:	item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+	struct config_item_type * t = item->ci_type;
+	struct config_group * s = item->ci_group;
+	struct config_item * parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@k:	group
+ */
+
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *	config_group_find_obj - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Lock group via @group->cg_subsys, and iterate over @group->cg_children,
+ *	looking for a matching config_item. If a matching item is found,
+ *	take a reference and return the item.
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+	struct list_head * entry;
+	struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+	list_for_each(entry,&group->cg_children) {
+		struct config_item * item = to_item(entry);
+		if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+

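One concrete consequence of config_item_set_name() above, as a hypothetical helper: short names stay in the embedded ci_namebuf, anything of CONFIGFS_ITEM_NAME_LEN or longer is kmalloc'd, and config_item_cleanup() kfree()s the dynamic copy on release:

/* Illustration only; 'item' must already be initialized. */
static int example_rename_item(struct config_item *item, int id)
{
	/* "item-7" fits ci_namebuf; a long enough formatted result
	 * takes the kmalloc() path inside config_item_set_name(). */
	return config_item_set_name(item, "item-%d", id);
}
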
+ 159 - 0
fs/configfs/mount.c

@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	configfs_sb = sb;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	if (inode) {
+		inode->i_op = &configfs_dir_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		pr_debug("configfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+		iput(inode);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.get_sb		= configfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+int configfs_pin_fs(void)
+{
+	return simple_pin_fs("configfs", &configfs_mount,
+			     &configfs_mnt_count);
+}
+
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static decl_subsys(config, NULL, NULL);
+
+static int __init configfs_init(void)
+{
+	int err;
+
+	kset_set_kset_s(&config_subsys, kernel_subsys);
+	err = subsystem_register(&config_subsys);
+	if (err)
+		return err;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+		subsystem_unregister(&config_subsys);
+	}
+
+	return err;
+}
+
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user-driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);

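The pin/release pair above is what keeps configfs_sb valid: the first configfs_pin_fs() mounts the single superblock internally, and the last configfs_release_fs() drops it. A hedged sketch of the pairing (hypothetical caller; configfs_register_subsystem() does exactly this around its dcache work):

static int example_pinned_use(void)
{
	int err = configfs_pin_fs();	/* first pin mounts internally */

	if (err)
		return err;

	/* configfs_sb and configfs_mount are safe to dereference here. */

	configfs_release_fs();		/* last release drops the mount */
	return 0;
}
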
+ 281 - 0
fs/configfs/symlink.c

@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this item name with '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+static int create_link(struct config_item *parent_item,
+ 		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		/* FIXME: needs a lock, I'd bet */
+		list_add(&sl->sl_list, &target_sd->s_links);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			list_del_init(&sl->sl_list);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+	return ret;
+}
+
+
+static int get_target(const char *symname, struct nameidata *nd,
+		      struct config_item **target)
+{
+	int ret;
+
+	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	if (!ret) {
+		if (nd->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(nd->dentry);
+			if (!*target) {
+				ret = -ENOENT;
+				path_release(nd);
+			}
+		} else
+			ret = -EPERM;
+	}
+
+	return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct nameidata nd;
+	struct config_item *parent_item;
+	struct config_item *target_item;
+	struct config_item_type *type;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (dentry->d_parent == configfs_sb->s_root)
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &nd, &target_item);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret)
+		ret = create_link(parent_item, target_item, dentry);
+
+	config_item_put(target_item);
+	path_release(&nd);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		BUG();
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	list_del_init(&sd->s_sibling);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	/* FIXME: Needs lock */
+	list_del_init(&sl->sl_list);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
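+/*
+ * Build the relative path from 'item' (the directory holding the
+ * symlink) to 'target': one "../" per level of item's depth, followed
+ * by the target's path.  For example, a link in a/b pointing at x/y
+ * comes out as "../../x/y".
+ */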
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+	return 0;
+}
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
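+/*
+ * follow_link() renders the target path into a freshly allocated
+ * zeroed page and returns that page as the cookie; put_link() frees
+ * it again.  On failure the page is freed here and the link is set
+ * to an ERR_PTR.
+ */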
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	if (page) {
+		error = configfs_getlink(dentry, (char *)page);
+		if (!error) {
+			nd_set_link(nd, (char *)page);
+			return (void *)page;
+		}
+		free_page(page);
+	}
+
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+};
+

+ 1 - 1
fs/mpage.c

@@ -721,7 +721,7 @@ retry:
 						&last_block_in_bio, &ret, wbc,
 						page->mapping->a_ops->writepage);
 			}
-			if (unlikely(ret == WRITEPAGE_ACTIVATE))
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
 				unlock_page(page);
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;

+ 33 - 0
fs/ocfs2/Makefile

@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	mmap.o 			\
+	namei.o 		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o		\
+	ver.o 			\
+	vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/

+ 2040 - 0
fs/ocfs2/alloc.c

@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       unsigned int new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno)
+{
+	return blkno == (le64_to_cpu(ext->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb,
+						  le32_to_cpu(ext->e_clusters)));
+}
+
+/*
+ * How many free extents have we got before we need more meta data?
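+ * That is, the number of unused records in the rightmost extent list
+ * (the dinode's own list while the tree has no extent blocks).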
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe)
+{
+	int retval;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		retval = -EIO;
+		goto bail;
+	}
+
+	if (fe->i_last_eb_blk) {
+		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &eb_bh, OCFS2_BH_CACHED, inode);
+		if (retval < 0) {
+			mlog_errno(retval);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	BUG_ON(el->l_tree_depth != 0);
+
+	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	mlog_exit(retval);
+	return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_extent_block *eb;
+
+	mlog_entry_void();
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs2_claim_metadata(osb,
+					      handle,
+					      meta_ac,
+					      wanted - count,
+					      &suballoc_bit_start,
+					      &num_got,
+					      &first_blkno);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = count;  i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+			status = ocfs2_journal_access(handle, inode, bhs[i],
+						      OCFS2_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = cpu_to_le64(first_blkno);
+			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+			/* we always use slot zero's suballocator */
+			eb->h_suballoc_slot = 0;
+#else
+			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+			eb->h_list.l_count =
+				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			status = ocfs2_journal_dirty(handle, bhs[i]);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			if (bhs[i])
+				brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+	}
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update its next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
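+/*
+ * A sketch for a depth-2 tree: two new extent blocks are created --
+ * new_eb_bhs[0], the new bottommost leaf, and new_eb_bhs[1], the
+ * interior block above it.  The interior block's first record points
+ * at the new leaf, and the interior block itself is linked into the
+ * next free record of 'el'.
+ */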
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int status, new_blocks, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	BUG_ON(!last_eb_bh);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (eb_bh) {
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	/* we never add a branch to a leaf. */
+	BUG_ON(!el->l_tree_depth);
+
+	new_blocks = le16_to_cpu(el->l_tree_depth);
+
+	/* allocate the number of new eb blocks we need */
+	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+			     GFP_KERNEL);
+	if (!new_eb_bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+					   meta_ac, new_eb_bhs);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+ * conversely, new_eb_bhs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		eb_el = &eb->h_list;
+
+		status = ocfs2_journal_access(handle, inode, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = cpu_to_le16(i);
+		eb_el->l_next_free_rec = cpu_to_le16(1);
+		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		next_blkno = le64_to_cpu(eb->h_blkno);
+	}
+
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in). */
+	i = le16_to_cpu(el->l_next_free_rec);
+	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+	el->l_recs[i].e_cpos = fe->i_clusters;
+	el->l_recs[i].e_clusters = 0;
+	le16_add_cpu(&el->l_next_free_rec, 1);
+
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	if (status < 0)
+		mlog_errno(status);
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (eb_bh) {
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bhs[i])
+				brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
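+/*
+ * For example, shifting a depth-0 dinode: its extent list is copied
+ * into one newly allocated extent block, the dinode's list is reset
+ * to a single record covering all i_clusters and pointing at that
+ * block, and l_tree_depth becomes 1 (the new block is then also the
+ * last_eb).
+ */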
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	struct buffer_head *new_eb_bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *eb_el;
+
+	mlog_entry_void();
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+					   &new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		status = -EIO;
+		goto bail;
+	}
+
+	eb_el = &eb->h_list;
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	fe_el = &fe->id2.i_list;
+
+	status = ocfs2_journal_access(handle, inode, new_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* copy the fe data into the new extent block */
+	eb_el->l_tree_depth = fe_el->l_tree_depth;
+	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+	}
+
+	status = ocfs2_journal_dirty(handle, new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* update fe now */
+	le16_add_cpu(&fe_el->l_tree_depth, 1);
+	fe_el->l_recs[0].e_cpos = 0;
+	fe_el->l_recs[0].e_blkno = eb->h_blkno;
+	fe_el->l_recs[0].e_clusters = fe->i_clusters;
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		fe_el->l_recs[i].e_cpos = 0;
+		fe_el->l_recs[i].e_clusters = 0;
+		fe_el->l_recs[i].e_blkno = 0;
+	}
+	fe_el->l_next_free_rec = cpu_to_le16(1);
+
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (fe_el->l_tree_depth == cpu_to_le16(1))
+		fe->i_last_eb_blk = eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	if (new_eb_bh)
+		brelse(new_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent.  Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters)
+{
+	int status, i, num_bhs = 0;
+	u64 next_blkno;
+	u16 next_free;
+	struct buffer_head **eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+	if (el->l_tree_depth) {
+		/* This is another operation where we want to be
+		 * careful about our tree updates. An error here means
+		 * none of the previous changes we made should roll
+		 * forward. As a result, we have to record the buffers
+		 * for this part of the tree in an array and reserve a
+		 * journal write to them before making any changes. */
+		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+				 GFP_KERNEL);
+		if (!eb_bhs) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		i = 0;
+		while(el->l_tree_depth) {
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			if (next_free == 0) {
+				ocfs2_error(inode->i_sb,
+					    "Dinode %"MLFu64" has a bad "
+					    "extent list",
+					    OCFS2_I(inode)->ip_blkno);
+				status = -EIO;
+				goto bail;
+			}
+			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+			BUG_ON(i >= num_bhs);
+			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+						  OCFS2_BH_CACHED, inode);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				status = -EIO;
+				goto bail;
+			}
+
+			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			el = &eb->h_list;
+			i++;
+			/* When we leave this loop, eb_bhs[num_bhs - 1] will
+			 * hold the bottom-most leaf extent block. */
+		}
+		BUG_ON(el->l_tree_depth);
+
+		el = &fe->id2.i_list;
+		/* If we have tree depth, then the fe update is
+		 * trivial, and we want to switch el out for the
+		 * bottom-most leaf in order to update it with the
+		 * actual extent data below. */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" has a bad "
+				    "extent list",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+			     new_clusters);
+		/* (num_bhs - 1) to avoid the leaf */
+		for(i = 0; i < (num_bhs - 1); i++) {
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			el = &eb->h_list;
+
+			/* finally, make our actual change to the
+			 * intermediate extent blocks. */
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+				     new_clusters);
+
+			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+			if (status < 0)
+				mlog_errno(status);
+		}
+		BUG_ON(i != (num_bhs - 1));
+		/* note that the leaf block wasn't touched in
+		 * the loop above */
+		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+		el = &eb->h_list;
+		BUG_ON(el->l_tree_depth);
+	}
+
+	/* yay, we can finally add the actual extent now! */
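+	/*
+	 * Three cases: (1) the new range is contiguous with the last
+	 * record, so just extend that record's cluster count; (2) the
+	 * last record is an empty one at EOF, so take it over; (3)
+	 * otherwise, append a fresh record.
+	 */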
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) &&
+	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+	} else if (le16_to_cpu(el->l_next_free_rec) &&
+		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+		/* having an empty extent at eof is legal. */
+		if (el->l_recs[i].e_cpos != fe->i_clusters) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" trailing extent is bad: "
+				    "cpos (%u) != number of clusters (%u)",
+				    OCFS2_I(inode)->ip_blkno,
+				    le32_to_cpu(el->l_recs[i].e_cpos),
+				    le32_to_cpu(fe->i_clusters));
+			status = -EIO;
+			goto bail;
+		}
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+	} else {
+		/* No contiguous record, or no empty record at eof, so
+		 * we add a new one. */
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+		       le16_to_cpu(el->l_count));
+		i = le16_to_cpu(el->l_next_free_rec);
+
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+		el->l_recs[i].e_cpos = fe->i_clusters;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+	}
+
+	/*
+	 * extent_map errors are not fatal, so they are ignored outside
+	 * of flushing the thing.
+	 */
+	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+					 new_clusters);
+	if (status) {
+		mlog_errno(status);
+		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+	}
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (fe->id2.i_list.l_tree_depth) {
+		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (eb_bhs) {
+		for (i = 0; i < num_bhs; i++)
+			if (eb_bhs[i])
+				brelse(eb_bhs[i]);
+		kfree(eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *target_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *target_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh)
+{
+	int status = 0, i;
+	u64 blkno;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
+
+	mlog_entry_void();
+
+	*target_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	while(le16_to_cpu(el->l_tree_depth) > 1) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+				    "extent list (next_free_rec == 0)",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (!blkno) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+				    "list where extent # %d has no physical "
+				    "block start",
+				    OCFS2_I(inode)->ip_blkno, i);
+			status = -EIO;
+			goto bail;
+		}
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &eb->h_list;
+
+		if (le16_to_cpu(el->l_next_free_rec) <
+		    le16_to_cpu(el->l_count)) {
+			if (lowest_bh)
+				brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
+		}
+	}
+
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_bh
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		status = 1;
+
+	*target_bh = lowest_bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 start_blk,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status, i, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	mlog(0, "add %u clusters starting at block %"MLFu64" to "
+		"inode %"MLFu64"\n",
+	     new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	if (el->l_tree_depth) {
+		/* jump to end of tree */
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/* Can we allocate without adding/shifting tree bits? */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) == 0
+	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+		goto out_add;
+
+	mlog(0, "ocfs2_insert_extent: couldn't do a simple add, traversing "
+	     "tree now.\n");
+
+	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+	if (shift < 0) {
+		status = shift;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		/* if we hit a leaf, we'd better be empty :) */
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+		       le16_to_cpu(el->l_count));
+		BUG_ON(bh);
+		mlog(0, "ocfs2_insert_extent: need to shift tree depth "
+		     "(current = %u)\n",
+		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+						meta_ac, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		/* Special case: we have room now if we shifted from
+		 * tree_depth 0 */
+		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+			goto out_add;
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	mlog(0, "ocfs2_insert_extent: add branch. bh = %p\n", bh);
+	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+				  meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+out_add:
+	/* Finally, we can add clusters. */
+	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+					start_blk, new_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+
+	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
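+/*
+ * A new range can be merged into the tail record when it starts
+ * exactly where that record ends.  For example, if the last record
+ * covers clusters [100, 150) (t_start 100, t_clusters 50), a range
+ * starting at cluster 150 will coalesce with it.
+ */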
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int tail_index;
+	unsigned int current_tail;
+
+	/* No records, nothing to coalesce */
+	if (!le16_to_cpu(tl->tl_used))
+		return 0;
+
+	tail_index = le16_to_cpu(tl->tl_used) - 1;
+	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+	return current_tail == new_start;
+}
+
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     u64 start_blk,
+				     unsigned int num_clusters)
+{
+	int status, index;
+	unsigned int start_cluster, tl_count;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+		   num_clusters);
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	tl_count = le16_to_cpu(tl->tl_count);
+	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+			tl_count == 0,
+			"Truncate record count on #%"MLFu64" invalid ("
+			"wanted %u, actual %u)\n", OCFS2_I(tl_inode)->ip_blkno,
+			ocfs2_truncate_recs_per_inode(osb->sb),
+			le16_to_cpu(tl->tl_count));
+
+	/* Caller should have known to flush before calling us. */
+	index = le16_to_cpu(tl->tl_used);
+	if (index >= tl_count) {
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+	     "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
+	     OCFS2_I(tl_inode)->ip_blkno, index);
+
+	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+		/*
+		 * Move index back to the record we are coalescing with.
+		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+		 */
+		index--;
+
+		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+		     index, le32_to_cpu(tl->tl_recs[index].t_start),
+		     num_clusters);
+	} else {
+		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+		tl->tl_used = cpu_to_le16(index + 1);
+	}
+	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+	status = ocfs2_journal_dirty(handle, tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+					 struct ocfs2_journal_handle *handle,
+					 struct inode *data_alloc_inode,
+					 struct buffer_head *data_alloc_bh)
+{
+	int status = 0;
+	int i;
+	unsigned int num_clusters;
+	u64 start_blk;
+	struct ocfs2_truncate_rec rec;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+	mlog_entry_void();
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	i = le16_to_cpu(tl->tl_used) - 1;
+	while (i >= 0) {
+		/* Caller has given us at least enough credits to
+		 * update the truncate log dinode */
+		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		tl->tl_used = cpu_to_le16(i);
+
+		status = ocfs2_journal_dirty(handle, tl_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* TODO: Perhaps we can calculate the bulk of the
+		 * credits up front rather than extending like
+		 * this. */
+		status = ocfs2_extend_trans(handle,
+					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		rec = tl->tl_recs[i];
+		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+						    le32_to_cpu(rec.t_start));
+		num_clusters = le32_to_cpu(rec.t_clusters);
+
+		/* if start_blk is not set, we ignore the record as
+		 * invalid. */
+		if (start_blk) {
+			mlog(0, "free record %d, start = %u, clusters = %u\n",
+			     i, le32_to_cpu(rec.t_start), num_clusters);
+
+			status = ocfs2_free_clusters(handle, data_alloc_inode,
+						     data_alloc_bh, start_blk,
+						     num_clusters);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		i--;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Expects you to already be holding tl_inode->i_sem */
+static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	unsigned int num_to_flush;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *data_alloc_inode = NULL;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct buffer_head *data_alloc_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	num_to_flush = le16_to_cpu(tl->tl_used);
+	mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
+	     num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
+	if (!num_to_flush) {
+		status = 0;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	data_alloc_inode = ocfs2_get_system_file_inode(osb,
+						       GLOBAL_BITMAP_SYSTEM_INODE,
+						       OCFS2_INVALID_SLOT);
+	if (!data_alloc_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, data_alloc_inode);
+	status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+					       data_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_alloc_inode)
+		iput(data_alloc_inode);
+
+	if (data_alloc_bh)
+		brelse(data_alloc_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	down(&tl_inode->i_sem);
+	status = __ocfs2_flush_truncate_log(osb);
+	up(&tl_inode->i_sem);
+
+	return status;
+}
+
+static void ocfs2_truncate_log_worker(void *data)
+{
+	int status;
+	struct ocfs2_super *osb = data;
+
+	mlog_entry_void();
+
+	status = ocfs2_flush_truncate_log(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel)
+{
+	if (osb->osb_tl_inode) {
+		/* We want to push off log flushes while truncates are
+		 * still running. */
+		if (cancel)
+			cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+	}
+}
+
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+				       int slot_num,
+				       struct inode **tl_inode,
+				       struct buffer_head **tl_bh)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					   TRUNCATE_LOG_SYSTEM_INODE,
+					   slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not load truncate log inode!\n");
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		iput(inode);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*tl_inode = inode;
+	*tl_bh    = bh;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* called during the 1st stage of node recovery. we stamp a clean
+ * truncate log and pass back a copy for processing later. if the
+ * truncate log does not require processing, *tl_copy is set to
+ * NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	*tl_copy = NULL;
+
+	mlog(0, "recover truncate log from slot %d\n", slot_num);
+
+	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	if (le16_to_cpu(tl->tl_used)) {
+		mlog(0, "We'll have %u logs to recover\n",
+		     le16_to_cpu(tl->tl_used));
+
+		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+		if (!(*tl_copy)) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Assuming the write-out below goes well, this copy
+		 * will be passed back to recovery for processing. */
+		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
+
+		/* All we need to do to clear the truncate log is set
+		 * tl_used. */
+		tl->tl_used = 0;
+
+		status = ocfs2_write_block(osb, tl_bh, tl_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+bail:
+	if (tl_inode)
+		iput(tl_inode);
+	if (tl_bh)
+		brelse(tl_bh);
+
+	if (status < 0 && (*tl_copy)) {
+		kfree(*tl_copy);
+		*tl_copy = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy)
+{
+	int status = 0;
+	int i;
+	unsigned int clusters, num_recs, start_cluster;
+	u64 start_blk;
+	struct ocfs2_journal_handle *handle;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
+		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
+		return -EINVAL;
+	}
+
+	tl = &tl_copy->id2.i_dealloc;
+	num_recs = le16_to_cpu(tl->tl_used);
+	mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
+	     tl_copy->i_blkno);
+
+	down(&tl_inode->i_sem);
+	for(i = 0; i < num_recs; i++) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			status = __ocfs2_flush_truncate_log(osb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail_up;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, NULL,
+					   OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_up;
+		}
+
+		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
+		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
+		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
+
+		status = ocfs2_truncate_log_append(osb, handle,
+						   start_blk, clusters);
+		ocfs2_commit_trans(handle);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_up;
+		}
+	}
+
+bail_up:
+	up(&tl_inode->i_sem);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	if (tl_inode) {
+		cancel_delayed_work(&osb->osb_truncate_log_wq);
+		flush_workqueue(ocfs2_wq);
+
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		brelse(osb->osb_tl_bh);
+		iput(osb->osb_tl_inode);
+	}
+
+	mlog_exit_void();
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_get_truncate_log_info(osb,
+					     osb->slot_num,
+					     &tl_inode,
+					     &tl_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* ocfs2_truncate_log_shutdown keys on the existence of
+	 * osb->osb_tl_inode so we don't set any of the osb variables
+	 * until we're sure all is well. */
+	INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
+	osb->osb_tl_bh    = tl_bh;
+	osb->osb_tl_inode = tl_inode;
+
+	mlog_exit(status);
+	return status;
+}
+
+/* This function will figure out whether the currently last extent
+ * block will be deleted, and if it will, what the new last extent
+ * block will be so we can update its h_next_leaf_blk field, as well
+ * as the dinode's i_last_eb_blk. */
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       u32 new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb)
+{
+	int i, status = 0;
+	u64 block = 0;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *bh = NULL;
+
+	*new_last_eb = NULL;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	/* we have no tree, so of course, no last_eb. */
+	if (!fe->id2.i_list.l_tree_depth)
+		goto bail;
+
+	/* trunc to zero special case - this makes tree_depth = 0
+	 * regardless of what it is.  */
+	if (!new_i_clusters)
+		goto bail;
+
+	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
+	el = &(eb->h_list);
+	BUG_ON(!el->l_next_free_rec);
+
+	/* Make sure that this guy will actually be empty after we
+	 * clear away the data. */
+	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
+		goto bail;
+
+	/* Ok, at this point, we know that last_eb will definitely
+	 * change, so lets traverse the tree and find the second to
+	 * last extent block. */
+	el = &(fe->id2.i_list);
+	/* go down the tree, */
+	do {
+		for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
+			if (le32_to_cpu(el->l_recs[i].e_cpos) <
+			    new_i_clusters) {
+				block = le64_to_cpu(el->l_recs[i].e_blkno);
+				break;
+			}
+		}
+		BUG_ON(i < 0);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
+					 inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+	} while (el->l_tree_depth);
+
+	*new_last_eb = bh;
+	get_bh(*new_last_eb);
+	mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
+bail:
+	if (bh)
+		brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_do_truncate(struct ocfs2_super *osb,
+			     unsigned int clusters_to_del,
+			     struct inode *inode,
+			     struct buffer_head *fe_bh,
+			     struct buffer_head *old_last_eb_bh,
+			     struct ocfs2_journal_handle *handle,
+			     struct ocfs2_truncate_context *tc)
+{
+	int status, i, depth;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *last_eb = NULL;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *eb_bh = NULL;
+	struct buffer_head *last_eb_bh = NULL;
+	u64 next_eb = 0;
+	u64 delete_blk = 0;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	status = ocfs2_find_new_last_ext_blk(osb,
+					     inode,
+					     fe,
+					     le32_to_cpu(fe->i_clusters) -
+					     		clusters_to_del,
+					     old_last_eb_bh,
+					     &last_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (last_eb_bh)
+		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	el = &(fe->id2.i_list);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
+				      clusters_to_del;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
+	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+	/* tree depth zero, we can just delete the clusters, otherwise
+	 * we need to record the offset of the next level extent block
+	 * as we may overwrite it. */
+	if (!el->l_tree_depth)
+		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+			+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+	else
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+
+	if (!el->l_recs[i].e_clusters) {
+		/* if we deleted the whole extent record, then clear
+		 * out the other fields and update the extent
+		 * list. For depth > 0 trees, we've already recorded
+		 * the extent block in 'next_eb' */
+		el->l_recs[i].e_cpos = 0;
+		el->l_recs[i].e_blkno = 0;
+		BUG_ON(!el->l_next_free_rec);
+		le16_add_cpu(&el->l_next_free_rec, -1);
+	}
+
+	depth = le16_to_cpu(el->l_tree_depth);
+	if (!fe->i_clusters) {
+		/* trunc to zero is a special case. */
+		el->l_tree_depth = 0;
+		fe->i_last_eb_blk = 0;
+	} else if (last_eb)
+		fe->i_last_eb_blk = last_eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (last_eb) {
+		/* If there will be a new last extent block, then by
+		 * definition, there cannot be any leaves to the right of
+		 * it. */
+		status = ocfs2_journal_access(handle, inode, last_eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		last_eb->h_next_leaf_blk = 0;
+		status = ocfs2_journal_dirty(handle, last_eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* if our tree depth > 0, update all the tree blocks below us. */
+	while (depth) {
+		mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
+		     depth,  next_eb);
+		status = ocfs2_read_block(osb, next_eb, &eb_bh,
+					  OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+		mlog(0, "extent block %"MLFu64", before: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+		/* bottom-most block requires us to delete data. */
+		if (!el->l_tree_depth)
+			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+				+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+		if (!el->l_recs[i].e_clusters) {
+			el->l_recs[i].e_cpos = 0;
+			el->l_recs[i].e_blkno = 0;
+			BUG_ON(!el->l_next_free_rec);
+			le16_add_cpu(&el->l_next_free_rec, -1);
+		}
+		mlog(0, "extent block %"MLFu64", after: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!el->l_next_free_rec) {
+			mlog(0, "deleting this extent block.\n");
+
+			ocfs2_remove_from_cache(inode, eb_bh);
+
+			BUG_ON(eb->h_suballoc_slot);
+			BUG_ON(el->l_recs[0].e_clusters);
+			BUG_ON(el->l_recs[0].e_cpos);
+			BUG_ON(el->l_recs[0].e_blkno);
+			status = ocfs2_free_extent_block(handle,
+							 tc->tc_ext_alloc_inode,
+							 tc->tc_ext_alloc_bh,
+							 eb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		brelse(eb_bh);
+		eb_bh = NULL;
+		depth--;
+	}
+
+	BUG_ON(!delete_blk);
+	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+					   clusters_to_del);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	if (!status)
+		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
+	else
+		ocfs2_extent_map_drop(inode, 0);
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * It is expected that, by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
+ *
+ * WARNING: This will kfree the truncate context
+ */
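+/*
+ * Truncate runs in passes: each pass trims at most one extent
+ * record's worth of clusters from the rightmost leaf, stamps the
+ * freed range into the truncate log, commits the transaction and
+ * loops back to 'start' until i_clusters reaches the target.
+ */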
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc)
+{
+	int status, i, credits, tl_sem = 0;
+	u32 clusters_to_del, target_i_clusters;
+	u64 last_eb = 0;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						     i_size_read(inode));
+
+	last_eb_bh = tc->tc_last_eb_bh;
+	tc->tc_last_eb_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (fe->id2.i_list.l_tree_depth) {
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+	last_eb = le64_to_cpu(fe->i_last_eb_blk);
+start:
+	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
+	     "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
+	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+	     le32_to_cpu(fe->i_clusters), last_eb,
+	     le64_to_cpu(fe->i_last_eb_blk),
+	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
+
+	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
+		mlog(0, "last_eb changed!\n");
+		BUG_ON(!fe->id2.i_list.l_tree_depth);
+		last_eb = le64_to_cpu(fe->i_last_eb_blk);
+		/* i_last_eb_blk may have changed, read it if
+		 * necessary. We don't have to worry about the
+		 * truncate to zero case here (where there becomes no
+		 * last_eb) because we never loop back after our work
+		 * is done. */
+		if (last_eb_bh) {
+			brelse(last_eb_bh);
+			last_eb_bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, last_eb,
+					  &last_eb_bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+	}
+
+	/* by now, el will point to the extent list on the bottom most
+	 * portion of this tree. */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
+		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+	else
+		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+				   le32_to_cpu(el->l_recs[i].e_cpos)) -
+				  target_i_clusters;
+
+	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+
+	down(&tl_inode->i_sem);
+	tl_sem = 1;
+	/* ocfs2_truncate_log_needs_flush guarantees us at least one
+	 * record is free for use. If there isn't any, we flush to get
+	 * an empty truncate log.  */
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		status = __ocfs2_flush_truncate_log(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+						fe, el);
+	handle = ocfs2_start_trans(osb, NULL, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
+				   last_eb_bh, handle, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	up(&tl_inode->i_sem);
+	tl_sem = 0;
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
+	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
+		goto start;
+bail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	if (tl_sem)
+		up(&tl_inode->i_sem);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	/* This will drop the ext_alloc cluster lock for us */
+	ocfs2_free_truncate_context(tc);
+
+	mlog_exit(status);
+	return status;
+}
+
+
+/*
+ * Expects the inode to already be locked. This will figure out which
+ * inodes need to be locked and will put them on the returned truncate
+ * context.
+ */
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc)
+{
+	int status, metadata_delete;
+	unsigned int new_i_clusters;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh = NULL;
+	struct inode *ext_alloc_inode = NULL;
+	struct buffer_head *ext_alloc_bh = NULL;
+
+	mlog_entry_void();
+
+	*tc = NULL;
+
+	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						  i_size_read(inode));
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
+	     "%"MLFu64"\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+	     le64_to_cpu(fe->i_size));
+
+	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
+		ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
+			    "%u and size %"MLFu64" whereas struct inode has "
+			    "cluster count %u and size %llu which caused an "
+			    "invalid truncate to %u clusters.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->i_clusters),
+			    le64_to_cpu(fe->i_size),
+			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
+			    new_i_clusters);
+		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
+		status = -EIO;
+		goto bail;
+	}
+
+	*tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
+	if (!(*tc)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	metadata_delete = 0;
+	if (fe->id2.i_list.l_tree_depth) {
+		/* If we have a tree, then the truncate may result in
+		 * metadata deletes. Figure this out from the
+		 * rightmost leaf block.*/
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+
+			brelse(last_eb_bh);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+			metadata_delete = 1;
+	}
+
+	(*tc)->tc_last_eb_bh = last_eb_bh;
+
+	if (metadata_delete) {
+		mlog(0, "Will have to delete metadata for this trunc. "
+		     "locking allocator.\n");
+		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
+		if (!ext_alloc_inode) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		down(&ext_alloc_inode->i_sem);
+		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+
+		status = ocfs2_meta_lock(ext_alloc_inode,
+					 NULL,
+					 &ext_alloc_bh,
+					 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
+		(*tc)->tc_ext_alloc_locked = 1;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		if (*tc)
+			ocfs2_free_truncate_context(*tc);
+		*tc = NULL;
+	}
+	mlog_exit_void();
+	return status;
+}
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+{
+	if (tc->tc_ext_alloc_inode) {
+		if (tc->tc_ext_alloc_locked)
+			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
+
+		up(&tc->tc_ext_alloc_inode->i_sem);
+		iput(tc->tc_ext_alloc_inode);
+	}
+
+	if (tc->tc_ext_alloc_bh)
+		brelse(tc->tc_ext_alloc_bh);
+
+	if (tc->tc_last_eb_bh)
+		brelse(tc->tc_last_eb_bh);
+
+	kfree(tc);
+}

+ 82 - 0
fs/ocfs2/alloc.h

@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 blkno,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac);
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * level of the tree (current l_tree_depth), one block for the
+	 * new tree_depth==0 extent_block, and one block at the new
+	 * top-of-the tree.
+	 */
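+	/* e.g. at l_tree_depth == 2 this asks for at most 2 + 2 == 4
+	 * metadata blocks */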
+	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy);
+
+struct ocfs2_truncate_context {
+	struct inode *tc_ext_alloc_inode;
+	struct buffer_head *tc_ext_alloc_bh;
+	int tc_ext_alloc_locked; /* is it cluster locked? */
+	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;
+};
+
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc);
+
+#endif /* OCFS2_ALLOC_H */
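
A minimal sketch of how the two truncate entry points above pair up, assuming the caller already holds the inode's cluster lock as the comment on ocfs2_prepare_truncate() requires; the surrounding variables (osb, inode, fe_bh) stand in for the caller's context and are not code from this patch:

	struct ocfs2_truncate_context *tc = NULL;
	int status;

	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
	if (status < 0)
		return status;

	/* loops internally until i_clusters reaches the new size;
	 * frees tc (and drops the ext_alloc cluster lock) on exit */
	return ocfs2_commit_truncate(osb, inode, fe_bh, tc);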

+ 643 - 0
fs/ocfs2/aops.c

@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	void *kaddr;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  &bh, OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		goto bail;
+	}
+
+	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+						    le32_to_cpu(fe->i_clusters))) {
+		mlog(ML_ERROR, "block offset is outside the allocated size: "
+		     "%llu\n", (unsigned long long)iblock);
+		goto bail;
+	}
+
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+			    iblock;
+		buffer_cache_bh = sb_getblk(osb->sb, blkno);
+		if (!buffer_cache_bh) {
+			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+			goto bail;
+		}
+
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh)
+		    && ocfs2_inode_is_new(inode)) {
+			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+			if (!kaddr) {
+				mlog(ML_ERROR, "couldn't kmap!\n");
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock),
+			       buffer_cache_bh->b_data,
+			       bh_result->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+
+	map_bh(bh_result, inode->i_sb,
+	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+
+	err = 0;
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh_result, int create)
+{
+	int err = 0;
+	u64 p_blkno, past_eof;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+		     inode, inode->i_ino);
+
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+		goto bail;
+	}
+
+	/* this can happen if another node truncs after our extend! */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+					       OCFS2_I(inode)->ip_clusters))
+		err = -EIO;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (err)
+		goto bail;
+
+	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  NULL);
+	if (err) {
+		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+		     "%"MLFu64", NULL)\n", err, inode,
+		     (unsigned long long)iblock, p_blkno);
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	if (bh_result->b_blocknr == 0) {
+		err = -EIO;
+		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
+		     "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
+		     p_blkno, OCFS2_I(inode)->ip_blkno);
+	}
+
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
+bail:
+	if (err < 0)
+		err = -EIO;
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
+
+	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * i_size might have just been updated as we grabbed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		char *addr = kmap(page);
+		memset(addr, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		kunmap(page);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out_alloc;
+	}
+
+	ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;
+
+	ocfs2_data_unlock(inode, 0);
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	mlog_entry("(0x%p)\n", page);
+
+	ret = block_write_full_page(page, ocfs2_get_block, wbc);
+
+	mlog_exit(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block().
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_meta_unlock(inode, 0);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0) 
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (handle)
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
+static int ocfs2_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence ensuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/* might update i_size */
+	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (extending) {
+		loff_t size = (u64) i_size_read(inode);
+		struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)di_bh->b_data;
+
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = 
+				cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = 
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	BUG_ON(extending && (i_size_read(inode) != new_i_size));
+
+out_commit:
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t status;
+	u64 p_blkno = 0;
+	int err = 0;
+	struct inode *inode = mapping->host;
+
+	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+
+	/* We don't need to lock journal system files, since they aren't
+	 * accessed concurrently from multiple nodes.
+	 */
+	if (!INODE_JOURNAL(inode)) {
+		err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+		if (err) {
+			if (err != -ENOENT)
+				mlog_errno(err);
+			goto bail;
+		}
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	}
+
+	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+					  NULL);
+
+	if (!INODE_JOURNAL(inode)) {
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		ocfs2_meta_unlock(inode, 0);
+	}
+
+	if (err) {
+		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+		     (unsigned long long)block);
+		mlog_errno(err);
+		goto bail;
+	}
+
+
+bail:
+	status = err ? 0 : p_blkno;
+
+	mlog_exit((int)status);
+
+	return status;
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     unsigned long max_blocks,
+				     struct buffer_head *bh_result, int create)
+{
+	int ret;
+	u64 vbo_max; /* file offset, max_blocks from iblock */
+	u64 p_blkno;
+	int contig_blocks;
+	unsigned char blocksize_bits;
+
+	if (!inode || !bh_result) {
+		mlog(ML_ERROR, "inode or bh_result is null\n");
+		return -EIO;
+	}
+
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+
+	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if ((iblock + max_blocks) >
+	    ocfs2_clusters_to_blocks(inode->i_sb,
+				     OCFS2_I(inode)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		ret = -EIO;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  &contig_blocks);
+	if (ret) {
+		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+		     (unsigned long long)iblock);
+		ret = -EIO;
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+	bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+	return ret;
+}
+
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Like the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+
+static ssize_t ocfs2_direct_IO(int rw,
+			       struct kiocb *iocb,
+			       const struct iovec *iov,
+			       loff_t offset,
+			       unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+
+	mlog_entry_void();
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev, iov, offset,
+					    nr_segs, 
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
+	mlog_exit(ret);
+	return ret;
+}
+
+struct address_space_operations ocfs2_aops = {
+	.readpage	= ocfs2_readpage,
+	.writepage	= ocfs2_writepage,
+	.prepare_write	= ocfs2_prepare_write,
+	.commit_write	= ocfs2_commit_write,
+	.bmap		= ocfs2_bmap,
+	.sync_page	= block_sync_page,
+	.direct_IO	= ocfs2_direct_IO
+};

+ 41 - 0
fs/ocfs2/aops.h

@@ -0,0 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+
+/* all ocfs2_dio_end_io()'s fault */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+	set_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(0, (unsigned long *)&iocb->private)
+
+#endif /* OCFS2_AOPS_H */
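
A minimal sketch of the submit side these bit helpers serve, mirroring ocfs2_dio_end_io() in aops.c above; ocfs2_rw_lock() is inferred from the ocfs2_rw_unlock() call there, and the order and level of the lock calls are assumptions for illustration:

	ocfs2_rw_lock(inode, 0);	/* the rw_lock DLM lock */
	down_read(&inode->i_alloc_sem);
	ocfs2_iocb_set_rw_locked(iocb);

	/* ... issue the direct I/O; on completion ocfs2_dio_end_io()
	 * checks the bit, clears it, and drops both locks ... */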

+ 232 - 0
fs/ocfs2/buffer_head_io.c

@@ -0,0 +1,232 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * buffer_head_io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct inode *inode)
+{
+	int ret = 0;
+
+	mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+		   (unsigned long long)bh->b_blocknr, inode);
+
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */                   
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(inode, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate. */
+		ret = -EIO;
+		brelse(bh);
+	}
+
+	up(&OCFS2_I(inode)->ip_io_sem);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+
+	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+		   block, nr, flags, inode);
+
+	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr == 0) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		status = 0;
+		goto bail;
+	}
+
+	sb = osb->sb;
+
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+
+	if (inode)
+		down(&OCFS2_I(inode)->ip_io_sem);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				if (inode)
+					up(&OCFS2_I(inode)->ip_io_sem);
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = 0;
+
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
+			     (unsigned long long)bh->b_blocknr,
+			     OCFS2_I(inode)->ip_blkno);
+			ignore_cache = 1;
+		}
+
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
+		if (buffer_jbd(bh)) {
+			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+				mlog(ML_BH_IO, "trying to sync read a jbd "
+					       "managed bh (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				mlog(ML_BH_IO, "asking me to sync read a dirty "
+					       "buffer! (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+				continue;
+			}
+
+			lock_buffer(bh);
+			if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+				mlog(ML_ERROR, "block %llu had the JBD bit set "
+					       "while I was in lock_buffer!",
+				     (unsigned long long)bh->b_blocknr);
+				BUG();
+#else
+				unlock_buffer(bh);
+				continue;
+#endif
+			}
+			clear_buffer_uptodate(bh);
+			get_bh(bh); /* for end_buffer_read_sync() */
+			bh->b_end_io = end_buffer_read_sync;
+			if (flags & OCFS2_BH_READAHEAD)
+				submit_bh(READA, bh);
+			else
+				submit_bh(READ, bh);
+			continue;
+		}
+	}
+
+	status = 0;
+
+	for (i = (nr - 1); i >= 0; i--) {
+		bh = bhs[i];
+
+		/* We know this can't have changed as we hold the
+		 * inode sem. Avoid doing any work on the bh if the
+		 * journal has it. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
+
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. Don't need to
+			 * remove the clustered uptodate information
+			 * for this bh as it's not marked locally
+			 * uptodate. */
+			status = -EIO;
+			brelse(bh);
+			bhs[i] = NULL;
+			continue;
+		}
+
+		if (inode)
+			ocfs2_set_buffer_uptodate(inode, bh);
+	}
+	if (inode)
+		up(&OCFS2_I(inode)->ip_io_sem);
+
+	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
+	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+
+bail:
+
+	mlog_exit(status);
+	return status;
+}

+ 73 - 0
fs/ocfs2/buffer_head_io.h

@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * buffer_head_io.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+
+#include <linux/buffer_head.h>
+
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+			     int uptodate);
+
+static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+				   u64                  off,
+				   struct buffer_head **bh,
+				   int                  flags,
+				   struct inode        *inode);
+
+int ocfs2_write_block(struct ocfs2_super          *osb,
+		      struct buffer_head  *bh,
+		      struct inode        *inode);
+int ocfs2_read_blocks(struct ocfs2_super          *osb,
+		      u64                  block,
+		      int                  nr,
+		      struct buffer_head  *bhs[],
+		      int                  flags,
+		      struct inode        *inode);
+
+
+#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
+
+static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
+				   struct buffer_head **bh, int flags,
+				   struct inode *inode)
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_blocks(osb, off, 1, bh,
+				   flags, inode);
+
+bail:
+	return status;
+}
+
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
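
A minimal usage sketch of the single-block wrapper defined above, following the call sites in alloc.c and aops.c; osb, blkno, and inode stand in for the caller's context:

	struct buffer_head *bh = NULL;
	int status;

	status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, inode);
	if (status < 0)
		mlog_errno(status);
	else
		brelse(bh);	/* caller owns the bh reference on success */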

+ 4 - 0
fs/ocfs2/cluster/Makefile

@@ -0,0 +1,4 @@
+obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
+
+ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
+	quorum.o tcp.o ver.o

+ 30 - 0
fs/ocfs2/cluster/endian.h

@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_CLUSTER_ENDIAN_H
+#define OCFS2_CLUSTER_ENDIAN_H
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
+
+#endif /* OCFS2_CLUSTER_ENDIAN_H */
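
For example, bumping an on-disk big-endian counter in place (the variable is illustrative):

	__be32 refcount = cpu_to_be32(41);

	be32_add_cpu(&refcount, 1);	/* now holds cpu_to_be32(42) */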

+ 1797 - 0
fs/ocfs2/cluster/heartbeat.c

@@ -0,0 +1,1797 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "quorum.h"
+
+#include "masklog.h"
+
+
+/*
+ * The first heartbeat pass had one global thread that would serialize all hb
+ * callback calls.  This global serializing sem should only be removed once
+ * we've made sure that all callees can deal with being called concurrently
+ * from multiple hb region threads.
+ */
+static DECLARE_RWSEM(o2hb_callback_sem);
+
+/*
+ * multiple hb threads are watching multiple regions.  A node is live
+ * whenever any of the threads sees activity from the node in its region.
+ */
+static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
+static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+static LIST_HEAD(o2hb_node_events);
+static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+
+static LIST_HEAD(o2hb_all_regions);
+
+static struct o2hb_callback {
+	struct list_head list;
+} o2hb_callbacks[O2HB_NUM_CB];
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
+
+#define O2HB_DEFAULT_BLOCK_BITS       9
+
+unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+
+/* Only sets a new threshold if there are no active regions. 
+ *
+ * No locking or otherwise interesting code is required for reading
+ * o2hb_dead_threshold as it can't change once regions are active and
+ * it's not interesting to anyone until then anyway. */
+static void o2hb_dead_threshold_set(unsigned int threshold)
+{
+	if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions))
+			o2hb_dead_threshold = threshold;
+		spin_unlock(&o2hb_live_lock);
+	}
+}
+
+struct o2hb_node_event {
+	struct list_head        hn_item;
+	enum o2hb_callback_type hn_event_type;
+	struct o2nm_node        *hn_node;
+	int                     hn_node_num;
+};
+
+struct o2hb_disk_slot {
+	struct o2hb_disk_heartbeat_block *ds_raw_block;
+	u8			ds_node_num;
+	u64			ds_last_time;
+	u64			ds_last_generation;
+	u16			ds_equal_samples;
+	u16			ds_changed_samples;
+	struct list_head	ds_live_item;
+};
+
+/* each thread owns a region.. when we're asked to tear down the region
+ * we ask the thread to stop, which then cleans up the region */
+struct o2hb_region {
+	struct config_item	hr_item;
+
+	struct list_head	hr_all_item;
+	unsigned		hr_unclean_stop:1;
+
+	/* protected by the hr_callback_sem */
+	struct task_struct 	*hr_task;
+
+	unsigned int		hr_blocks;
+	unsigned long long	hr_start_block;
+
+	unsigned int		hr_block_bits;
+	unsigned int		hr_block_bytes;
+
+	unsigned int		hr_slots_per_page;
+	unsigned int		hr_num_pages;
+
+	struct page             **hr_slot_data;
+	struct block_device	*hr_bdev;
+	struct o2hb_disk_slot	*hr_slots;
+
+	/* let the person setting up hb wait until it
+	 * has reached a 'steady' state.  This will be fixed when we have
+	 * a more complete api that doesn't lead to this sort of fragility. */
+	atomic_t		hr_steady_iterations;
+
+	char			hr_dev_name[BDEVNAME_SIZE];
+
+	unsigned int		hr_timeout_ms;
+
+	/* randomized as the region goes up and down so that a node
+	 * recognizes a node going up and down in one iteration */
+	u64			hr_generation;
+
+	struct work_struct	hr_write_timeout_work;
+	unsigned long		hr_last_timeout_start;
+
+	/* Used during o2hb_check_slot to hold a copy of the block
+	 * being checked because we temporarily have to zero out the
+	 * crc field. */
+	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+};
+
+struct o2hb_bio_wait_ctxt {
+	atomic_t          wc_num_reqs;
+	struct completion wc_io_complete;
+};
+
+static void o2hb_write_timeout(void *arg)
+{
+	struct o2hb_region *reg = arg;
+
+	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
+	     "milliseconds\n", reg->hr_dev_name,
+	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 
+	o2quo_disk_timeout();
+}
+
+static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+{
+	mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	reg->hr_last_timeout_start = jiffies;
+	schedule_delayed_work(&reg->hr_write_timeout_work,
+			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+}
+
+static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+{
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	flush_scheduled_work();
+}
+
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int num_ios)
+{
+	atomic_set(&wc->wc_num_reqs, num_ios);
+	init_completion(&wc->wc_io_complete);
+}
+
+/* Used in error paths too */
+static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
+				     unsigned int num)
+{
+	/* sadly atomic_sub_and_test() isn't available on all platforms.  The
+	 * good news is that the fast path only completes one at a time */
+	while(num--) {
+		if (atomic_dec_and_test(&wc->wc_num_reqs)) {
+			BUG_ON(num > 0);
+			complete(&wc->wc_io_complete);
+		}
+	}
+}
+
+static void o2hb_wait_on_io(struct o2hb_region *reg,
+			    struct o2hb_bio_wait_ctxt *wc)
+{
+	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
+
+	blk_run_address_space(mapping);
+
+	wait_for_completion(&wc->wc_io_complete);
+}
+
+static int o2hb_bio_end_io(struct bio *bio,
+			   unsigned int bytes_done,
+			   int error)
+{
+	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
+
+	if (error)
+		mlog(ML_ERROR, "IO Error %d\n", error);
+
+	if (bio->bi_size)
+		return 1;
+
+	o2hb_bio_wait_dec(wc, 1);
+	return 0;
+}
+
+/* Set up a bio to cover I/O against num_slots slots starting at
+ * start_slot. */
+static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
+				      struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int start_slot,
+				      unsigned int num_slots)
+{
+	int i, nr_vecs, len, first_page, last_page;
+	unsigned int vec_len, vec_start;
+	unsigned int bits = reg->hr_block_bits;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct bio *bio;
+	struct page *page;
+
+	nr_vecs = (num_slots + spp - 1) / spp;
+
+	/* Testing has shown this allocation to take long enough under
+	 * GFP_KERNEL that the local node can get fenced. It would be
+	 * nicer if we could pre-allocate these bios and avoid this
+	 * altogether. */
+	bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+	if (!bio) {
+		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
+		bio = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	/* Must put everything in 512 byte sectors for the bio... */
+	bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+	bio->bi_bdev = reg->hr_bdev;
+	bio->bi_private = wc;
+	bio->bi_end_io = o2hb_bio_end_io;
+
+	first_page = start_slot / spp;
+	last_page = first_page + nr_vecs;
+	vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
+	for(i = first_page; i < last_page; i++) {
+		page = reg->hr_slot_data[i];
+
+		vec_len = PAGE_CACHE_SIZE;
+		/* last page might be short */
+		if (((i + 1) * spp) > (start_slot + num_slots))
+			vec_len = ((num_slots + start_slot) % spp) << bits;
+		vec_len -=  vec_start;
+
+		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
+		     i, vec_len, vec_start);
+
+		len = bio_add_page(bio, page, vec_len, vec_start);
+		if (len != vec_len) {
+			bio_put(bio);
+			bio = ERR_PTR(-EIO);
+
+			mlog(ML_ERROR, "Error adding page to bio i = %d, "
+			     "vec_len = %u, len = %d\n, start = %u\n",
+			     i, vec_len, len, vec_start);
+			goto bail;
+		}
+
+		vec_start = 0;
+	}
+
+bail:
+	return bio;
+}
+
+/*
+ * Compute the maximum number of sectors the bdev can handle in one bio,
+ * as a power of two.
+ *
+ * Stolen from oracleasm, thanks Joel!
+ */
+static int compute_max_sectors(struct block_device *bdev)
+{
+	int max_pages, max_sectors, pow_two_sectors;
+
+	struct request_queue *q;
+
+	q = bdev_get_queue(bdev);
+	max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
+	if (max_pages > q->max_phys_segments)
+		max_pages = q->max_phys_segments;
+	if (max_pages > q->max_hw_segments)
+		max_pages = q->max_hw_segments;
+	max_pages--; /* Handle I/Os that straddle a page */
+
+	max_sectors = max_pages << (PAGE_SHIFT - 9);
+
+	/* Why is fls() 1-based???? */
+	pow_two_sectors = 1 << (fls(max_sectors) - 1);
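+	/* e.g. fls(1000) == 10, so pow_two_sectors == 1 << 9 == 512,
+	 * the largest power of two not exceeding max_sectors */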
+
+	return pow_two_sectors;
+}
+
+static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
+					       unsigned int num_slots,
+					       unsigned int *num_bios,
+					       unsigned int *slots_per_bio)
+{
+	unsigned int max_sectors, io_sectors;
+
+	max_sectors = compute_max_sectors(reg->hr_bdev);
+
+	io_sectors = num_slots << (reg->hr_block_bits - 9);
+
+	*num_bios = (io_sectors + max_sectors - 1) / max_sectors;
+	*slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
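+	/* e.g. 512-byte slots (hr_block_bits == 9) and a 128 sector
+	 * limit give, for 255 slots: io_sectors == 255, num_bios == 2,
+	 * slots_per_bio == 128 */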
+
+	mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
+	     "device can handle %u sectors of I/O\n", io_sectors, num_slots,
+	     max_sectors);
+	mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
+	     *num_bios, *slots_per_bio);
+}
+
+static int o2hb_read_slots(struct o2hb_region *reg,
+			   unsigned int max_slots)
+{
+	unsigned int num_bios, slots_per_bio, start_slot, num_slots;
+	int i, status;
+	struct o2hb_bio_wait_ctxt wc;
+	struct bio **bios;
+	struct bio *bio;
+
+	o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
+
+	bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
+	if (!bios) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		return status;
+	}
+
+	o2hb_bio_wait_init(&wc, num_bios);
+
+	num_slots = slots_per_bio;
+	for(i = 0; i < num_bios; i++) {
+		start_slot = i * slots_per_bio;
+
+		/* adjust num_slots at last bio */
+		if (max_slots < (start_slot + num_slots))
+			num_slots = max_slots - start_slot;
+
+		bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
+		if (IS_ERR(bio)) {
+			o2hb_bio_wait_dec(&wc, num_bios - i);
+
+			status = PTR_ERR(bio);
+			mlog_errno(status);
+			goto bail_and_wait;
+		}
+		bios[i] = bio;
+
+		submit_bio(READ, bio);
+	}
+
+	status = 0;
+
+bail_and_wait:
+	o2hb_wait_on_io(reg, &wc);
+
+	if (bios) {
+		for(i = 0; i < num_bios; i++)
+			if (bios[i])
+				bio_put(bios[i]);
+		kfree(bios);
+	}
+
+	return status;
+}
+
+static int o2hb_issue_node_write(struct o2hb_region *reg,
+				 struct bio **write_bio,
+				 struct o2hb_bio_wait_ctxt *write_wc)
+{
+	int status;
+	unsigned int slot;
+	struct bio *bio;
+
+	o2hb_bio_wait_init(write_wc, 1);
+
+	slot = o2nm_this_node();
+
+	bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
+	if (IS_ERR(bio)) {
+		status = PTR_ERR(bio);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	submit_bio(WRITE, bio);
+
+	*write_bio = bio;
+	status = 0;
+bail:
+	return status;
+}
+
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+				     struct o2hb_disk_heartbeat_block *hb_block)
+{
+	__le32 old_cksum;
+	u32 ret;
+
+	/* We want to compute the block crc with a 0 value in the
+	 * hb_cksum field. Save it off here and replace after the
+	 * crc. */
+	old_cksum = hb_block->hb_cksum;
+	hb_block->hb_cksum = 0;
+
+	ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+
+	hb_block->hb_cksum = old_cksum;
+
+	return ret;
+}
+
+static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
+{
+	mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
+	     "cksum = 0x%x, generation 0x%"MLFx64"\n",
+	     le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
+	     le32_to_cpu(hb_block->hb_cksum),
+	     le64_to_cpu(hb_block->hb_generation));
+}
+
+static int o2hb_verify_crc(struct o2hb_region *reg,
+			   struct o2hb_disk_heartbeat_block *hb_block)
+{
+	u32 read, computed;
+
+	read = le32_to_cpu(hb_block->hb_cksum);
+	computed = o2hb_compute_block_crc_le(reg, hb_block);
+
+	return read == computed;
+}
+
+/* We want to make sure that nobody is heartbeating on top of us --
+ * this will help detect an invalid configuration. */
+static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+{
+	int node_num, ret;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	node_num = o2nm_this_node();
+
+	ret = 1;
+	slot = &reg->hr_slots[node_num];
+	/* Don't check on our 1st timestamp */
+	if (slot->ds_last_time) {
+		hb_block = slot->ds_raw_block;
+
+		if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
+			ret = 0;
+	}
+
+	return ret;
+}
+
+static inline void o2hb_prepare_block(struct o2hb_region *reg,
+				      u64 generation)
+{
+	int node_num;
+	u64 cputime;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	node_num = o2nm_this_node();
+	slot = &reg->hr_slots[node_num];
+
+	hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
+	memset(hb_block, 0, reg->hr_block_bytes);
+	/* TODO: time stuff */
+	cputime = CURRENT_TIME.tv_sec;
+	if (!cputime)
+		cputime = 1;
+
+	hb_block->hb_seq = cpu_to_le64(cputime);
+	hb_block->hb_node = node_num;
+	hb_block->hb_generation = cpu_to_le64(generation);
+
+	/* This step must always happen last! */
+	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
+								   hb_block));
+
+	mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
+	     cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum));
+}
+
+static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
+				struct o2nm_node *node,
+				int idx)
+{
+	struct list_head *iter;
+	struct o2hb_callback_func *f;
+
+	list_for_each(iter, &hbcall->list) {
+		f = list_entry(iter, struct o2hb_callback_func, hc_item);
+		mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
+		(f->hc_func)(node, idx, f->hc_data);
+	}
+}
+
+/* Will run the list in order until we process the passed event */
+static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
+{
+	int empty;
+	struct o2hb_callback *hbcall;
+	struct o2hb_node_event *event;
+
+	spin_lock(&o2hb_live_lock);
+	empty = list_empty(&queued_event->hn_item);
+	spin_unlock(&o2hb_live_lock);
+	if (empty)
+		return;
+
+	/* Holding callback sem assures we don't alter the callback
+	 * lists when doing this, and serializes ourselves with other
+	 * processes wanting callbacks. */
+	down_write(&o2hb_callback_sem);
+
+	spin_lock(&o2hb_live_lock);
+	while (!list_empty(&o2hb_node_events)
+	       && !list_empty(&queued_event->hn_item)) {
+		event = list_entry(o2hb_node_events.next,
+				   struct o2hb_node_event,
+				   hn_item);
+		list_del_init(&event->hn_item);
+		spin_unlock(&o2hb_live_lock);
+
+		mlog(ML_HEARTBEAT, "Node %s event for %d\n",
+		     event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
+		     event->hn_node_num);
+
+		hbcall = hbcall_from_type(event->hn_event_type);
+
+		/* We should *never* have gotten on to the list with a
+		 * bad type... This isn't something that we should try
+		 * to recover from. */
+		BUG_ON(IS_ERR(hbcall));
+
+		o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
+
+		spin_lock(&o2hb_live_lock);
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	up_write(&o2hb_callback_sem);
+}
+
+static void o2hb_queue_node_event(struct o2hb_node_event *event,
+				  enum o2hb_callback_type type,
+				  struct o2nm_node *node,
+				  int node_num)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	event->hn_event_type = type;
+	event->hn_node = node;
+	event->hn_node_num = node_num;
+
+	mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
+	     type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
+
+	list_add_tail(&event->hn_item, &o2hb_node_events);
+}
+
+static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
+{
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node)
+		return;
+
+	spin_lock(&o2hb_live_lock);
+	if (!list_empty(&slot->ds_live_item)) {
+		mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
+		     slot->ds_node_num);
+
+		list_del_init(&slot->ds_live_item);
+
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+					      slot->ds_node_num);
+		}
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	o2hb_run_event_list(&event);
+
+	o2nm_node_put(node);
+}
+
+static int o2hb_check_slot(struct o2hb_region *reg,
+			   struct o2hb_disk_slot *slot)
+{
+	int changed = 0, gen_changed = 0;
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
+	u64 cputime;
+
+	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
+
+	/* Is this correct? Do we assume that the node doesn't exist
+	 * if we're not configured for him? */
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node)
+		return 0;
+
+	if (!o2hb_verify_crc(reg, hb_block)) {
+		/* all paths from here will drop o2hb_live_lock for
+		 * us. */
+		spin_lock(&o2hb_live_lock);
+
+		/* Don't print an error on the console in this case -
+		 * a freshly formatted heartbeat area will not have a
+		 * crc set on it. */
+		if (list_empty(&slot->ds_live_item))
+			goto out;
+
+		/* The node is live but pushed out a bad crc. We
+		 * consider it a transient miss but don't populate any
+		 * other values as they may be junk. */
+		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+		     slot->ds_node_num, reg->hr_dev_name);
+		o2hb_dump_slot(hb_block);
+
+		slot->ds_equal_samples++;
+		goto fire_callbacks;
+	}
+
+	/* we don't care if these wrap.. the state transitions below
+	 * clear at the right places */
+	cputime = le64_to_cpu(hb_block->hb_seq);
+	if (slot->ds_last_time != cputime)
+		slot->ds_changed_samples++;
+	else
+		slot->ds_equal_samples++;
+	slot->ds_last_time = cputime;
+
+	/* The node changed heartbeat generations. We assume this to
+	 * mean it dropped off but came back before we timed out. We
+	 * want to consider it down for the time being but don't want
+	 * to lose any changed_samples state we might build up to
+	 * considering it live again. */
+	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+		gen_changed = 1;
+		slot->ds_equal_samples = 0;
+		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
+		     "to 0x%"MLFx64")\n", slot->ds_node_num,
+		     slot->ds_last_generation,
+		     le64_to_cpu(hb_block->hb_generation));
+	}
+
+	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+	mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
+	     "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
+	     slot->ds_node_num, slot->ds_last_generation,
+	     le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), 
+	     slot->ds_last_time, slot->ds_changed_samples,
+	     slot->ds_equal_samples);
+
+	spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
+	/* dead nodes only come to life after some number of
+	 * changes at any time during their dead time */
+	if (list_empty(&slot->ds_live_item) &&
+	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
+		mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
+		     "region\n", slot->ds_node_num, slot->ds_last_generation);
+
+		/* first on the list generates a callback */
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+		}
+
+		list_add_tail(&slot->ds_live_item,
+			      &o2hb_live_slots[slot->ds_node_num]);
+
+		slot->ds_equal_samples = 0;
+		goto out;
+	}
+
+	/* if the list is dead, we're done.. */
+	if (list_empty(&slot->ds_live_item))
+		goto out;
+
+	/* live nodes only go dead after enough consecutive missed
+	 * samples..  reset the missed counter whenever we see
+	 * activity */
+	if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
+		mlog(ML_HEARTBEAT, "Node %d left my region\n",
+		     slot->ds_node_num);
+
+		/* last off the live_slot generates a callback */
+		list_del_init(&slot->ds_live_item);
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+		}
+
+		/* We don't clear this because the node is still
+		 * actually writing new blocks. */
+		if (!gen_changed)
+			slot->ds_changed_samples = 0;
+		goto out;
+	}
+	if (slot->ds_changed_samples) {
+		slot->ds_changed_samples = 0;
+		slot->ds_equal_samples = 0;
+	}
+out:
+	spin_unlock(&o2hb_live_lock);
+
+	o2hb_run_event_list(&event);
+
+	o2nm_node_put(node);
+	return changed;
+}
+
+/* This could be faster if we just implemented a find_last_bit, but I
+ * don't think the circumstances warrant it. */
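+/* Returns the highest set bit, or numbits when no bits are set --
+ * callers treat a return >= numbits as "no configured nodes" (see
+ * o2hb_do_disk_heartbeat() below) */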
+static int o2hb_highest_node(unsigned long *nodes,
+			     int numbits)
+{
+	int highest, node;
+
+	highest = numbits;
+	node = -1;
+	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
+		if (node >= numbits)
+			break;
+
+		highest = node;
+	}
+
+	return highest;
+}
+
+static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+{
+	int i, ret, highest_node, change = 0;
+	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct bio *write_bio;
+	struct o2hb_bio_wait_ctxt write_wc;
+
+	if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
+		return;
+
+	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
+	if (highest_node >= O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
+		return;
+	}
+
+	/* No sense in reading the slots of nodes that don't exist
+	 * yet. Of course, if the node definitions have holes in them
+	 * then we're reading an empty slot anyway... Consider this
+	 * best-effort. */
+	ret = o2hb_read_slots(reg, highest_node + 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return;
+	}
+
+	/* With an up to date view of the slots, we can check that no
+	 * other node has been improperly configured to heartbeat in
+	 * our slot. */
+	if (!o2hb_check_last_timestamp(reg))
+		mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
+		     "in our slot!\n", reg->hr_dev_name);
+
+	/* fill in the proper info for our next heartbeat */
+	o2hb_prepare_block(reg, reg->hr_generation);
+
+	/* And fire off the write. Note that we don't wait on this I/O
+	 * until later. */
+	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return;
+	}
+
+	i = -1;
+	while ((i = find_next_bit(configured_nodes, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES)
+		change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+
+	/*
+	 * We have to be sure we've advertised ourselves on disk
+	 * before we can go to steady state.  This ensures that
+	 * people we find in our steady state have seen us.
+	 */
+	o2hb_wait_on_io(reg, &write_wc);
+	bio_put(write_bio);
+	o2hb_arm_write_timeout(reg);
+
+	/* let the person who launched us know when things are steady */
+	if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
+		if (atomic_dec_and_test(&reg->hr_steady_iterations))
+			wake_up(&o2hb_steady_queue);
+	}
+}
+
+/* Subtract b from a, storing the result in a. If a is earlier than b,
+ * the result is clamped to zero. */
+static void o2hb_tv_subtract(struct timeval *a,
+			     struct timeval *b)
+{
+	/* just return 0 when b is after a */
+	if (a->tv_sec < b->tv_sec ||
+	    (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
+		a->tv_sec = 0;
+		a->tv_usec = 0;
+		return;
+	}
+
+	a->tv_sec -= b->tv_sec;
+	a->tv_usec -= b->tv_usec;
+	while ( a->tv_usec < 0 ) {
+		a->tv_sec--;
+		a->tv_usec += 1000000;
+	}
+}
+
+static unsigned int o2hb_elapsed_msecs(struct timeval *start,
+				       struct timeval *end)
+{
+	struct timeval res = *end;
+
+	o2hb_tv_subtract(&res, start);
+
+	return res.tv_sec * 1000 + res.tv_usec / 1000;
+}
+
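+/* Worked example (hypothetical values): start = {5, 900000} and
+ * end = {7, 100000} leave {1, 200000} after the subtract, so this
+ * returns 1 * 1000 + 200000 / 1000 = 1200 msec. */
+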
+/*
+ * we ride the region ref that the region dir holds.  before the region
+ * dir is removed and drops its ref it will wait to tear down this
+ * thread.
+ */
+static int o2hb_thread(void *data)
+{
+	int i, ret;
+	struct o2hb_region *reg = data;
+	struct bio *write_bio;
+	struct o2hb_bio_wait_ctxt write_wc;
+	struct timeval before_hb, after_hb;
+	unsigned int elapsed_msec;
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
+
+	set_user_nice(current, -20);
+
+	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+		/* We track the time spent inside
+	 * o2hb_do_disk_heartbeat so that we avoid more than
+		 * hr_timeout_ms between disk writes. On busy systems
+		 * this should result in a heartbeat which is less
+		 * likely to time itself out. */
+		do_gettimeofday(&before_hb);
+
+		o2hb_do_disk_heartbeat(reg);
+
+		do_gettimeofday(&after_hb);
+		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+
+		mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+		     before_hb.tv_sec, before_hb.tv_usec,
+		     after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
+
+		if (elapsed_msec < reg->hr_timeout_ms) {
+			/* the kthread api has blocked signals for us so no
+			 * need to record the return value. */
+			msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
+		}
+	}
+
+	o2hb_disarm_write_timeout(reg);
+
+	/* unclean stop is only used in very bad situations */
+	for (i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
+		o2hb_shutdown_slot(&reg->hr_slots[i]);
+
+	/* Explicit down notification - avoid forcing the other nodes
+	 * to timeout on this region when we could just as easily
+	 * write a clear generation - thus indicating to them that
+	 * this node has left this region.
+	 *
+	 * XXX: Should we skip this on unclean_stop? */
+	o2hb_prepare_block(reg, 0);
+	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	if (ret == 0) {
+		o2hb_wait_on_io(reg, &write_wc);
+		bio_put(write_bio);
+	} else {
+		mlog_errno(ret);
+	}
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+
+	return 0;
+}
+
+void o2hb_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
+		INIT_LIST_HEAD(&o2hb_callbacks[i].list);
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
+		INIT_LIST_HEAD(&o2hb_live_slots[i]);
+
+	INIT_LIST_HEAD(&o2hb_node_events);
+
+	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+}
+
+/* if we're already in a callback then we're already serialized by the sem */
+static void o2hb_fill_node_map_from_callback(unsigned long *map,
+					     unsigned bytes)
+{
+	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+	memcpy(map, &o2hb_live_node_bitmap, bytes);
+}
+
+/*
+ * get a map of all nodes that are heartbeating in any regions
+ */
+void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+{
+	/* callers want to serialize this map and callbacks so that they
+	 * can trust that they don't miss nodes coming to the party */
+	down_read(&o2hb_callback_sem);
+	spin_lock(&o2hb_live_lock);
+	o2hb_fill_node_map_from_callback(map, bytes);
+	spin_unlock(&o2hb_live_lock);
+	up_read(&o2hb_callback_sem);
+}
+EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
+
+/*
+ * heartbeat configfs bits.  The heartbeat set is a default set under
+ * the cluster set in nodemanager.c.
+ */
+
+static struct o2hb_region *to_o2hb_region(struct config_item *item)
+{
+	return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
+}
+
+/* drop_item only drops its ref after killing the thread, so nothing
+ * should be using the region anymore.  this has to clean up any state
+ * attributes might have built up. */
+static void o2hb_region_release(struct config_item *item)
+{
+	int i;
+	struct page *page;
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	kfree(reg->hr_tmp_block);
+
+	if (reg->hr_slot_data) {
+		for (i = 0; i < reg->hr_num_pages; i++) {
+			page = reg->hr_slot_data[i];
+			if (page)
+				__free_page(page);
+		}
+		kfree(reg->hr_slot_data);
+	}
+
+	if (reg->hr_bdev)
+		blkdev_put(reg->hr_bdev);
+
+	kfree(reg->hr_slots);
+
+	spin_lock(&o2hb_live_lock);
+	list_del(&reg->hr_all_item);
+	spin_unlock(&o2hb_live_lock);
+
+	kfree(reg);
+}
+
+static int o2hb_read_block_input(struct o2hb_region *reg,
+				 const char *page,
+				 size_t count,
+				 unsigned long *ret_bytes,
+				 unsigned int *ret_bits)
+{
+	unsigned long bytes;
+	char *p = (char *)page;
+
+	bytes = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	/* Heartbeat and fs min / max block sizes are the same. */
+	if (bytes > 4096 || bytes < 512)
+		return -ERANGE;
+	if (hweight16(bytes) != 1)
+		return -EINVAL;
+
+	if (ret_bytes)
+		*ret_bytes = bytes;
+	if (ret_bits)
+		*ret_bits = ffs(bytes) - 1;
+
+	return 0;
+}
+
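+/* Example inputs (hypothetical): a write of "1024" yields *ret_bytes =
+ * 1024 and *ret_bits = ffs(1024) - 1 = 10; "1000" passes the range
+ * check but fails the hweight16() test since it isn't a power of two. */
+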
+static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+					    char *page)
+{
+	return sprintf(page, "%u\n", reg->hr_block_bytes);
+}
+
+static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	int status;
+	unsigned long block_bytes;
+	unsigned int block_bits;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	status = o2hb_read_block_input(reg, page, count,
+				       &block_bytes, &block_bits);
+	if (status)
+		return status;
+
+	reg->hr_block_bytes = (unsigned int)block_bytes;
+	reg->hr_block_bits = block_bits;
+
+	return count;
+}
+
+static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+					    char *page)
+{
+	return sprintf(page, "%llu\n", reg->hr_start_block);
+}
+
+static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	unsigned long long tmp;
+	char *p = (char *)page;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	tmp = simple_strtoull(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	reg->hr_start_block = tmp;
+
+	return count;
+}
+
+static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
+				       char *page)
+{
+	return sprintf(page, "%d\n", reg->hr_blocks);
+}
+
+static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+					const char *page,
+					size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > O2NM_MAX_NODES || tmp == 0)
+		return -ERANGE;
+
+	reg->hr_blocks = (unsigned int)tmp;
+
+	return count;
+}
+
+static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
+				    char *page)
+{
+	unsigned int ret = 0;
+
+	if (reg->hr_bdev)
+		ret = sprintf(page, "%s\n", reg->hr_dev_name);
+
+	return ret;
+}
+
+static void o2hb_init_region_params(struct o2hb_region *reg)
+{
+	reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+	reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
+
+	mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
+	     reg->hr_start_block, reg->hr_blocks);
+	mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
+	     reg->hr_block_bytes, reg->hr_block_bits);
+	mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
+	mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
+}
+
+static int o2hb_map_slot_data(struct o2hb_region *reg)
+{
+	int i, j;
+	unsigned int last_slot;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct page *page;
+	char *raw;
+	struct o2hb_disk_slot *slot;
+
+	reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
+	if (reg->hr_tmp_block == NULL) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	reg->hr_slots = kcalloc(reg->hr_blocks,
+				sizeof(struct o2hb_disk_slot), GFP_KERNEL);
+	if (reg->hr_slots == NULL) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < reg->hr_blocks; i++) {
+		slot = &reg->hr_slots[i];
+		slot->ds_node_num = i;
+		INIT_LIST_HEAD(&slot->ds_live_item);
+		slot->ds_raw_block = NULL;
+	}
+
+	reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
+	mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
+			   "at %u blocks per page\n",
+	     reg->hr_num_pages, reg->hr_blocks, spp);
+
+	reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
+				    GFP_KERNEL);
+	if (!reg->hr_slot_data) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < reg->hr_num_pages; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+
+		reg->hr_slot_data[i] = page;
+
+		last_slot = i * spp;
+		raw = page_address(page);
+		for (j = 0;
+		     (j < spp) && ((j + last_slot) < reg->hr_blocks);
+		     j++) {
+			BUG_ON((j + last_slot) >= reg->hr_blocks);
+
+			slot = &reg->hr_slots[j + last_slot];
+			slot->ds_raw_block =
+				(struct o2hb_disk_heartbeat_block *) raw;
+
+			raw += reg->hr_block_bytes;
+		}
+	}
+
+	return 0;
+}
+
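+/* Sizing example (hypothetical defaults): 512-byte heartbeat blocks on
+ * 4096-byte pages give spp = 8, so a 255-slot region maps to
+ * (255 + 8 - 1) / 8 = 32 pages, with slot i living at page i / 8,
+ * offset (i % 8) * 512 bytes. */
+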
+/* Read in all the slots available and populate the tracking
+ * structures so that we can start with a baseline idea of what's
+ * there. */
+static int o2hb_populate_slot_data(struct o2hb_region *reg)
+{
+	int ret, i;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	mlog_entry_void();
+
+	ret = o2hb_read_slots(reg, reg->hr_blocks);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* We only want to get an idea of the values initially in each
+	 * slot, so we do no verification - o2hb_check_slot will
+	 * actually determine if each configured slot is valid and
+	 * whether any values have changed. */
+	for (i = 0; i < reg->hr_blocks; i++) {
+		slot = &reg->hr_slots[i];
+		hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
+
+		/* Only fill the values that o2hb_check_slot uses to
+		 * determine changing slots */
+		slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
+		slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+	}
+
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+				     const char *page,
+				     size_t count)
+{
+	long fd;
+	int sectsize;
+	char *p = (char *)page;
+	struct file *filp = NULL;
+	struct inode *inode = NULL;
+	ssize_t ret = -EINVAL;
+
+	if (reg->hr_bdev)
+		goto out;
+
+	/* We can't heartbeat without having had our node number
+	 * configured yet. */
+	if (o2nm_this_node() == O2NM_MAX_NODES)
+		goto out;
+
+	fd = simple_strtol(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		goto out;
+
+	if (fd < 0 || fd >= INT_MAX)
+		goto out;
+
+	filp = fget(fd);
+	if (filp == NULL)
+		goto out;
+
+	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
+	    reg->hr_block_bytes == 0)
+		goto out;
+
+	inode = igrab(filp->f_mapping->host);
+	if (inode == NULL)
+		goto out;
+
+	if (!S_ISBLK(inode->i_mode))
+		goto out;
+
+	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+	if (ret) {
+		reg->hr_bdev = NULL;
+		goto out;
+	}
+	inode = NULL;
+
+	bdevname(reg->hr_bdev, reg->hr_dev_name);
+
+	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	if (sectsize != reg->hr_block_bytes) {
+		mlog(ML_ERROR,
+		     "blocksize %u incorrect for device, expected %d",
+		     reg->hr_block_bytes, sectsize);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	o2hb_init_region_params(reg);
+
+	/* Generation of zero is invalid */
+	do {
+		get_random_bytes(&reg->hr_generation,
+				 sizeof(reg->hr_generation));
+	} while (reg->hr_generation == 0);
+
+	ret = o2hb_map_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = o2hb_populate_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
+
+	/*
+	 * A node is considered live after it has beat LIVE_THRESHOLD
+	 * times.  We're not steady until we've given them a chance
+	 * _after_ our first read.
+	 */
+	atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
+
+	reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
+				   reg->hr_item.ci_name);
+	if (IS_ERR(reg->hr_task)) {
+		ret = PTR_ERR(reg->hr_task);
+		mlog_errno(ret);
+		reg->hr_task = NULL;
+		goto out;
+	}
+
+	ret = wait_event_interruptible(o2hb_steady_queue,
+				atomic_read(&reg->hr_steady_iterations) == 0);
+	if (ret) {
+		kthread_stop(reg->hr_task);
+		reg->hr_task = NULL;
+		goto out;
+	}
+
+	ret = count;
+out:
+	if (filp)
+		fput(filp);
+	if (inode)
+		iput(inode);
+	if (ret < 0) {
+		if (reg->hr_bdev) {
+			blkdev_put(reg->hr_bdev);
+			reg->hr_bdev = NULL;
+		}
+	}
+	return ret;
+}
+
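+/* Hypothetical userspace sketch of the commit protocol above: the
+ * writer passes the number of a file descriptor *it* holds open on the
+ * block device, and the attribute write blocks until the region goes
+ * steady (hr_steady_iterations drains to zero) or fails:
+ *
+ *	int fd = open("/dev/sdb1", O_RDWR);
+ *	char buf[16];
+ *	int len = snprintf(buf, sizeof(buf), "%d", fd);
+ *	write(dev_attr_fd, buf, len);	// fd open on the region's "dev" file
+ */
+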
+struct o2hb_region_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_region *, char *);
+	ssize_t (*store)(struct o2hb_region *, const char *, size_t);
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "block_bytes",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_block_bytes_read,
+	.store	= o2hb_region_block_bytes_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_start_block = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "start_block",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_start_block_read,
+	.store	= o2hb_region_start_block_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_blocks = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "blocks",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_blocks_read,
+	.store	= o2hb_region_blocks_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_dev = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dev",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_dev_read,
+	.store	= o2hb_region_dev_write,
+};
+
+static struct configfs_attribute *o2hb_region_attrs[] = {
+	&o2hb_region_attr_block_bytes.attr,
+	&o2hb_region_attr_start_block.attr,
+	&o2hb_region_attr_blocks.attr,
+	&o2hb_region_attr_dev.attr,
+	NULL,
+};
+
+static ssize_t o2hb_region_show(struct config_item *item,
+				struct configfs_attribute *attr,
+				char *page)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+	struct o2hb_region_attribute *o2hb_region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2hb_region_attr->show)
+		ret = o2hb_region_attr->show(reg, page);
+	return ret;
+}
+
+static ssize_t o2hb_region_store(struct config_item *item,
+				 struct configfs_attribute *attr,
+				 const char *page, size_t count)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+	struct o2hb_region_attribute *o2hb_region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (o2hb_region_attr->store)
+		ret = o2hb_region_attr->store(reg, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations o2hb_region_item_ops = {
+	.release		= o2hb_region_release,
+	.show_attribute		= o2hb_region_show,
+	.store_attribute	= o2hb_region_store,
+};
+
+static struct config_item_type o2hb_region_type = {
+	.ct_item_ops	= &o2hb_region_item_ops,
+	.ct_attrs	= o2hb_region_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* heartbeat set */
+
+struct o2hb_heartbeat_group {
+	struct config_group hs_group;
+	/* some stuff? */
+};
+
+static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2hb_heartbeat_group, hs_group)
+		: NULL;
+}
+
+static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
+							  const char *name)
+{
+	struct o2hb_region *reg = NULL;
+	struct config_item *ret = NULL;
+
+	reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
+	if (reg == NULL)
+		goto out; /* ENOMEM */
+
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+	ret = &reg->hr_item;
+
+	spin_lock(&o2hb_live_lock);
+	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
+	spin_unlock(&o2hb_live_lock);
+out:
+	if (ret == NULL)
+		kfree(reg);
+
+	return ret;
+}
+
+static void o2hb_heartbeat_group_drop_item(struct config_group *group,
+					   struct config_item *item)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	/* stop the thread when the user removes the region dir */
+	if (reg->hr_task) {
+		kthread_stop(reg->hr_task);
+		reg->hr_task = NULL;
+	}
+
+	config_item_put(item);
+}
+
+struct o2hb_heartbeat_group_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
+	ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
+};
+
+static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
+	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2hb_heartbeat_group_attr->show)
+		ret = o2hb_heartbeat_group_attr->show(reg, page);
+	return ret;
+}
+
+static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
+					  struct configfs_attribute *attr,
+					  const char *page, size_t count)
+{
+	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
+	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (o2hb_heartbeat_group_attr->store)
+		ret = o2hb_heartbeat_group_attr->store(reg, page, count);
+	return ret;
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
+						     char *page)
+{
+	return sprintf(page, "%u\n", o2hb_dead_threshold);
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
+						    const char *page,
+						    size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	/* this will validate ranges for us. */
+	o2hb_dead_threshold_set((unsigned int) tmp);
+
+	return count;
+}
+
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dead_threshold",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_heartbeat_group_threshold_show,
+	.store	= o2hb_heartbeat_group_threshold_store,
+};
+
+static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
+	&o2hb_heartbeat_group_attr_threshold.attr,
+	NULL,
+};
+
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
+	.show_attribute		= o2hb_heartbeat_group_show,
+	.store_attribute	= o2hb_heartbeat_group_store,
+};
+
+static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
+	.make_item	= o2hb_heartbeat_group_make_item,
+	.drop_item	= o2hb_heartbeat_group_drop_item,
+};
+
+static struct config_item_type o2hb_heartbeat_group_type = {
+	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
+	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
+	.ct_attrs	= o2hb_heartbeat_group_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* this is just here to avoid touching group in heartbeat.h which the
+ * entire damn world #includes */
+struct config_group *o2hb_alloc_hb_set(void)
+{
+	struct o2hb_heartbeat_group *hs = NULL;
+	struct config_group *ret = NULL;
+
+	hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
+	if (hs == NULL)
+		goto out;
+
+	config_group_init_type_name(&hs->hs_group, "heartbeat",
+				    &o2hb_heartbeat_group_type);
+
+	ret = &hs->hs_group;
+out:
+	if (ret == NULL)
+		kfree(hs);
+	return ret;
+}
+
+void o2hb_free_hb_set(struct config_group *group)
+{
+	struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
+	kfree(hs);
+}
+
+/* hb callback registration and issuing */
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
+{
+	if (type == O2HB_NUM_CB)
+		return ERR_PTR(-EINVAL);
+
+	return &o2hb_callbacks[type];
+}
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority)
+{
+	INIT_LIST_HEAD(&hc->hc_item);
+	hc->hc_func = func;
+	hc->hc_data = data;
+	hc->hc_priority = priority;
+	hc->hc_type = type;
+	hc->hc_magic = O2HB_CB_MAGIC;
+}
+EXPORT_SYMBOL_GPL(o2hb_setup_callback);
+
+int o2hb_register_callback(struct o2hb_callback_func *hc)
+{
+	struct o2hb_callback_func *tmp;
+	struct list_head *iter;
+	struct o2hb_callback *hbcall;
+	int ret;
+
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+	BUG_ON(!list_empty(&hc->hc_item));
+
+	hbcall = hbcall_from_type(hc->hc_type);
+	if (IS_ERR(hbcall)) {
+		ret = PTR_ERR(hbcall);
+		goto out;
+	}
+
+	down_write(&o2hb_callback_sem);
+
+	list_for_each(iter, &hbcall->list) {
+		tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
+		if (hc->hc_priority < tmp->hc_priority) {
+			list_add_tail(&hc->hc_item, iter);
+			break;
+		}
+	}
+	if (list_empty(&hc->hc_item))
+		list_add_tail(&hc->hc_item, &hbcall->list);
+
+	up_write(&o2hb_callback_sem);
+	ret = 0;
+out:
+	mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+	     ret, __builtin_return_address(0), hc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2hb_register_callback);
+
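+/* Minimal caller sketch (hypothetical names, not part of this commit):
+ * set a callback up once, then register it to hear about node
+ * transitions:
+ *
+ *	static void my_node_down(struct o2nm_node *node, int node_num,
+ *				 void *data)
+ *	{
+ *		printk("node %d went down\n", node_num);
+ *	}
+ *
+ *	static struct o2hb_callback_func my_hc;
+ *	o2hb_setup_callback(&my_hc, O2HB_NODE_DOWN_CB, my_node_down,
+ *			    NULL, 0);
+ *	if (o2hb_register_callback(&my_hc))
+ *		...
+ */
+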
+int o2hb_unregister_callback(struct o2hb_callback_func *hc)
+{
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+
+	mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+	     __builtin_return_address(0), hc);
+
+	if (list_empty(&hc->hc_item))
+		return 0;
+
+	down_write(&o2hb_callback_sem);
+
+	list_del_init(&hc->hc_item);
+
+	up_write(&o2hb_callback_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
+
+int o2hb_check_node_heartbeating(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+
+int o2hb_check_node_heartbeating_from_callback(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
+
+/* Makes sure our local node is configured with a node number, and is
+ * heartbeating. */
+int o2hb_check_local_node_heartbeating(void)
+{
+	u8 node_num;
+
+	/* if this node was set then we have networking */
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_MAX_NODES) {
+		mlog(ML_HEARTBEAT, "this node has not been configured.\n");
+		return 0;
+	}
+
+	return o2hb_check_node_heartbeating(node_num);
+}
+EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
+
+/*
+ * this is just a hack until we get the plumbing which flips file systems
+ * read only and drops the hb ref instead of killing the node dead.
+ */
+void o2hb_stop_all_regions(void)
+{
+	struct o2hb_region *reg;
+
+	mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
+
+	spin_lock(&o2hb_live_lock);
+
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
+		reg->hr_unclean_stop = 1;
+
+	spin_unlock(&o2hb_live_lock);
+}
+EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);

+ 82 - 0
fs/ocfs2/cluster/heartbeat.h

@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_HEARTBEAT_H
+#define O2CLUSTER_HEARTBEAT_H
+
+#include "ocfs2_heartbeat.h"
+
+#define O2HB_REGION_TIMEOUT_MS		2000
+
+/* number of changes to be seen as live */
+#define O2HB_LIVE_THRESHOLD	   2
+/* number of equal samples to be seen as dead */
+extern unsigned int o2hb_dead_threshold;
+#define O2HB_DEFAULT_DEAD_THRESHOLD	   7
+/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
+#define O2HB_MIN_DEAD_THRESHOLD	  2
+#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
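+
+/* Worked numbers with the defaults above: heartbeats are written every
+ * O2HB_REGION_TIMEOUT_MS = 2000 msec, a peer is declared dead after 7
+ * unchanged samples (roughly 14 seconds), and O2HB_MAX_WRITE_TIMEOUT_MS
+ * comes out to 2000 * (7 - 1) = 12000 msec. */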
+
+#define O2HB_CB_MAGIC		0x51d1e4ec
+
+/* callback stuff */
+enum o2hb_callback_type {
+	O2HB_NODE_DOWN_CB = 0,
+	O2HB_NODE_UP_CB,
+	O2HB_NUM_CB
+};
+
+struct o2nm_node;
+typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
+
+struct o2hb_callback_func {
+	u32			hc_magic;
+	struct list_head	hc_item;
+	o2hb_cb_func		*hc_func;
+	void			*hc_data;
+	int			hc_priority;
+	enum o2hb_callback_type hc_type;
+};
+
+struct config_group *o2hb_alloc_hb_set(void);
+void o2hb_free_hb_set(struct config_group *group);
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority);
+int o2hb_register_callback(struct o2hb_callback_func *hc);
+int o2hb_unregister_callback(struct o2hb_callback_func *hc);
+void o2hb_fill_node_map(unsigned long *map,
+			unsigned bytes);
+void o2hb_init(void);
+int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_from_callback(u8 node_num);
+int o2hb_check_local_node_heartbeating(void);
+void o2hb_stop_all_regions(void);
+
+#endif /* O2CLUSTER_HEARTBEAT_H */

+ 166 - 0
fs/ocfs2/cluster/masklog.c

@@ -0,0 +1,166 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+
+#include "masklog.h"
+
+struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
+EXPORT_SYMBOL_GPL(mlog_and_bits);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+EXPORT_SYMBOL_GPL(mlog_not_bits);
+
+static ssize_t mlog_mask_show(u64 mask, char *buf)
+{
+	char *state;
+
+	if (__mlog_test_u64(mask, mlog_and_bits))
+		state = "allow";
+	else if (__mlog_test_u64(mask, mlog_not_bits))
+		state = "deny";
+	else
+		state = "off";
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", state);
+}
+
+static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
+{
+	if (!strnicmp(buf, "allow", 5)) {
+		__mlog_set_u64(mask, mlog_and_bits);
+		__mlog_clear_u64(mask, mlog_not_bits);
+	} else if (!strnicmp(buf, "deny", 4)) {
+		__mlog_set_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else if (!strnicmp(buf, "off", 3)) {
+		__mlog_clear_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+struct mlog_attribute {
+	struct attribute attr;
+	u64 mask;
+};
+
+#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
+
+#define define_mask(_name) {			\
+	.attr = {				\
+		.name = #_name,			\
+		.mode = S_IRUGO | S_IWUSR,	\
+	},					\
+	.mask = ML_##_name,			\
+}
+
+static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
+	define_mask(ENTRY),
+	define_mask(EXIT),
+	define_mask(TCP),
+	define_mask(MSG),
+	define_mask(SOCKET),
+	define_mask(HEARTBEAT),
+	define_mask(HB_BIO),
+	define_mask(DLMFS),
+	define_mask(DLM),
+	define_mask(DLM_DOMAIN),
+	define_mask(DLM_THREAD),
+	define_mask(DLM_MASTER),
+	define_mask(DLM_RECOVERY),
+	define_mask(AIO),
+	define_mask(JOURNAL),
+	define_mask(DISK_ALLOC),
+	define_mask(SUPER),
+	define_mask(FILE_IO),
+	define_mask(EXTENT_MAP),
+	define_mask(DLM_GLUE),
+	define_mask(BH_IO),
+	define_mask(UPTODATE),
+	define_mask(NAMEI),
+	define_mask(INODE),
+	define_mask(VOTE),
+	define_mask(DCACHE),
+	define_mask(CONN),
+	define_mask(QUORUM),
+	define_mask(EXPORT),
+	define_mask(ERROR),
+	define_mask(NOTICE),
+	define_mask(KTHREAD),
+};
+
+static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
+
+static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
+			 char *buf)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_show(mlog_attr->mask, buf);
+}
+
+static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_store(mlog_attr->mask, buf, count);
+}
+
+static struct sysfs_ops mlog_attr_ops = {
+	.show  = mlog_show,
+	.store = mlog_store,
+};
+
+static struct kobj_type mlog_ktype = {
+	.default_attrs = mlog_attr_ptrs,
+	.sysfs_ops     = &mlog_attr_ops,
+};
+
+static struct kset mlog_kset = {
+	.kobj   = {.name = "logmask", .ktype = &mlog_ktype},
+};
+
+int mlog_sys_init(struct subsystem *o2cb_subsys)
+{
+	int i = 0;
+
+	while (mlog_attrs[i].attr.mode) {
+		mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+		i++;
+	}
+	mlog_attr_ptrs[i] = NULL;
+
+	mlog_kset.subsys = o2cb_subsys;
+	return kset_register(&mlog_kset);
+}
+
+void mlog_sys_shutdown(void)
+{
+	kset_unregister(&mlog_kset);
+}

+ 275 - 0
fs/ocfs2/cluster/masklog.h

@@ -0,0 +1,275 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_MASKLOG_H
+#define O2CLUSTER_MASKLOG_H
+
+/*
+ * For now this is a trivial wrapper around printk() that gives the critical
+ * ability to enable sets of debugging output at run-time.  In the future this
+ * will almost certainly be redirected to relayfs so that it can pay a
+ * substantially lower heisenberg tax.
+ *
+ * Callers associate the message with a bitmask and a global bitmask is
+ * maintained with help from /proc.  If any of the bits match the message is
+ * output.
+ *
+ * We must have efficient bit tests on i386 and it seems gcc still emits crazy
+ * code for the 64bit compare.  It emits very good code for the dual unsigned
+ * long tests, though, completely avoiding tests that can never pass if the
+ * caller gives a constant bitmask that fills one of the longs with all 0s.  So
+ * the desire is to have almost all of the calls decided on by comparing just
+ * one of the longs.  This leads to having infrequently given bits that are
+ * frequently matched in the high bits.
+ *
+ * _ERROR and _NOTICE are used for messages that always go to the console and
+ * have appropriate KERN_ prefixes.  We wrap these in our function instead of
+ * just calling printk() so that this can eventually make its way through
+ * relayfs along with the debugging messages.  Everything else gets KERN_INFO.
+ * The inline tests and macro dance give GCC the opportunity to quite cleverly
+ * only emit the appropriate printk() when the caller passes in a constant
+ * mask, as is almost always the case.
+ *
+ * All this bitmask nonsense is hidden from the /proc interface so that Joel
+ * doesn't have an aneurysm.  Reading the file gives a straightforward
+ * indication of which bits are on or off:
+ * 	ENTRY off
+ * 	EXIT off
+ * 	TCP off
+ * 	MSG off
+ * 	SOCKET off
+ * 	ERROR off
+ * 	NOTICE on
+ *
+ * Writing changes the state of a given bit and requires a strictly formatted
+ * single write() call:
+ *
+ * 	write(fd, "ENTRY on", 8);
+ *
+ * would turn the entry bit on.  "1" is also accepted in place of "on", and
+ * "off" and "0" behave as expected.
+ *
+ * Some trivial shell can flip all the bits on or off:
+ *
+ * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
+ * cat $log_mask | (
+ * 	while read bit status; do
+ * 		# $1 is "on" or "off", say
+ * 		echo "$bit $1" > $log_mask
+ * 	done
+ * )
+ */
+
+/* for task_struct */
+#include <linux/sched.h>
+
+/* bits that are frequently given and infrequently matched in the low word */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
+#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
+#define ML_EXIT		0x0000000000000002ULL /* func call exit */
+#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000008ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm recovery functions */
+#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
+#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
+#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
+#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
+#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
+#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
+#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
+#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
+#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
+#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
+#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
+#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
+#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
+#define ML_CONN		0x0000000004000000ULL /* net connection management */
+#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
+#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+/* bits that are infrequently given and frequently matched in the high word */
+#define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
+#define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
+
+#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
+#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
+#ifndef MLOG_MASK_PREFIX
+#define MLOG_MASK_PREFIX 0
+#endif
+
+#define MLOG_MAX_BITS 64
+
+struct mlog_bits {
+	unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
+};
+
+extern struct mlog_bits mlog_and_bits, mlog_not_bits;
+
+#if BITS_PER_LONG == 32
+
+#define __mlog_test_u64(mask, bits)			\
+	( (u32)(mask & 0xffffffff) & bits.words[0] || 	\
+	  ((u64)(mask) >> 32) & bits.words[1] )
+#define __mlog_set_u64(mask, bits) do {			\
+	bits.words[0] |= (u32)(mask & 0xffffffff);	\
+       	bits.words[1] |= (u64)(mask) >> 32;		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {		\
+	bits.words[0] &= ~((u32)(mask & 0xffffffff));	\
+       	bits.words[1] &= ~((u64)(mask) >> 32);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) {				\
+	{						\
+		[0] = (u32)(mask & 0xffffffff),		\
+		[1] = (u64)(mask) >> 32,		\
+	}						\
+}
+
+#else /* 32bit long above, 64bit long below */
+
+#define __mlog_test_u64(mask, bits)	((mask) & bits.words[0])
+#define __mlog_set_u64(mask, bits) do {		\
+	bits.words[0] |= (mask);		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {	\
+	bits.words[0] &= ~(mask);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) { { (mask) } }
+
+#endif
+
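+/* Illustration: on a 32-bit build a constant mask like ML_ERROR (bit 32)
+ * has an all-zero low word, so the test above constant-folds down to a
+ * single check of bits.words[1] -- the "compare just one of the longs"
+ * property described at the top of this file. */
+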
+/*
+ * smp_processor_id() "helpfully" screams when called from preemptible
+ * regions in current kernels.  sles doesn't have the variants that don't
+ * scream.  just do this instead of trying to guess which we're building
+ * against.. *sigh*.
+ */
+#define __mlog_cpu_guess ({		\
+	unsigned long _cpu = get_cpu();	\
+	put_cpu();			\
+	_cpu;				\
+})
+
+/* In the following two macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define __mlog_printk(level, fmt, args...)				\
+	printk(level "(%u,%lu):%s:%d " fmt, current->pid,		\
+	       __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ ,	\
+	       ##args)
+
+#define mlog(mask, fmt, args...) do {					\
+	u64 __m = MLOG_MASK_PREFIX | (mask);				\
+	if (__mlog_test_u64(__m, mlog_and_bits) &&			\
+	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
+		if (__m & ML_ERROR)					\
+			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
+		else if (__m & ML_NOTICE)				\
+			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
+		else __mlog_printk(KERN_INFO, fmt , ##args);		\
+	}								\
+} while (0)
+
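+/* Usage sketch (hypothetical caller): a .c file picks its mask before
+ * including this header, then logs against it:
+ *
+ *	#define MLOG_MASK_PREFIX ML_HEARTBEAT
+ *	#include "masklog.h"
+ *
+ *	mlog(0, "sampled slot %d\n", i);	(gated by ML_HEARTBEAT)
+ *	mlog(ML_ERROR, "bad checksum\n");	(KERN_ERR, on by default)
+ */
+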
+#define mlog_errno(st) do {						\
+	int _st = (st);							\
+	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
+	    _st != AOP_TRUNCATED_PAGE)					\
+		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
+} while (0)
+
+#define mlog_entry(fmt, args...) do {					\
+	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
+} while (0)
+
+#define mlog_entry_void() do {						\
+	mlog(ML_ENTRY, "ENTRY:\n");					\
+} while (0)
+
+/* We disable this for old compilers since they don't have support for
+ * __builtin_types_compatible_p.
+ */
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
+    !defined(__CHECKER__)
+#define mlog_exit(st) do {						     \
+	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
+		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), signed long))      \
+		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), unsigned int)	     \
+		 || __builtin_types_compatible_p(typeof(st), unsigned short) \
+		 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
+		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), signed int)	     \
+		 || __builtin_types_compatible_p(typeof(st), signed short)   \
+		 || __builtin_types_compatible_p(typeof(st), signed char))   \
+		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		     \
+	else if (__builtin_types_compatible_p(typeof(st), long long))	     \
+		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	     \
+	else								     \
+		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st));    \
+} while (0)
+#else
+#define mlog_exit(st) do {						     \
+	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		     \
+} while (0)
+#endif
+
+#define mlog_exit_ptr(ptr) do {						\
+	mlog(ML_EXIT, "EXIT: %p\n", ptr);				\
+} while (0)
+
+#define mlog_exit_void() do {						\
+	mlog(ML_EXIT, "EXIT\n");					\
+} while (0)
+
+#define mlog_bug_on_msg(cond, fmt, args...) do {			\
+	if (cond) {							\
+		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
+		mlog(ML_ERROR, fmt, ##args);				\
+		BUG();							\
+	}								\
+} while (0)
+
+#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
+#define MLFi64 "lld"
+#define MLFu64 "llu"
+#define MLFx64 "llx"
+#else
+#define MLFi64 "ld"
+#define MLFu64 "lu"
+#define MLFx64 "lx"
+#endif
+
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+int mlog_sys_init(struct subsystem *o2cb_subsys);
+void mlog_sys_shutdown(void);
+
+#endif /* O2CLUSTER_MASKLOG_H */

+ 791 - 0
fs/ocfs2/cluster/nodemanager.c

@@ -0,0 +1,791 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/configfs.h>
+
+#include "endian.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+#include "masklog.h"
+#include "sys.h"
+#include "ver.h"
+
+/* for now we operate under the assumption that there can be only one
+ * cluster active at a time.  Changing this will require trickling
+ * cluster references throughout where nodes are looked up */
+static struct o2nm_cluster *o2nm_single_cluster = NULL;
+
+#define OCFS2_MAX_HB_CTL_PATH 256
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
+
+static ctl_table ocfs2_nm_table[] = {
+	{
+		.ctl_name	= 1,
+		.procname	= "hb_ctl_path",
+		.data		= ocfs2_hb_ctl_path,
+		.maxlen		= OCFS2_MAX_HB_CTL_PATH,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ocfs2_mod_table[] = {
+	{
+		.ctl_name	= KERN_OCFS2_NM,
+		.procname	= "nm",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_nm_table
+	},
+	{ .ctl_name = 0}
+};
+
+static ctl_table ocfs2_kern_table[] = {
+	{
+		.ctl_name	= KERN_OCFS2,
+		.procname	= "ocfs2",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_mod_table
+	},
+	{ .ctl_name = 0}
+};
+
+static ctl_table ocfs2_root_table[] = {
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_kern_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ocfs2_table_header = NULL;
+
+const char *o2nm_get_hb_ctl_path(void)
+{
+	return ocfs2_hb_ctl_path;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
+
+struct o2nm_cluster {
+	struct config_group	cl_group;
+	unsigned		cl_has_local:1;
+	u8			cl_local_node;
+	rwlock_t		cl_nodes_lock;
+	struct o2nm_node  	*cl_nodes[O2NM_MAX_NODES];
+	struct rb_root		cl_node_ip_tree;
+	/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
+	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
+{
+	struct o2nm_node *node = NULL;
+
+	if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
+		goto out;
+
+	read_lock(&o2nm_single_cluster->cl_nodes_lock);
+	node = o2nm_single_cluster->cl_nodes[node_num];
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&o2nm_single_cluster->cl_nodes_lock);
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
+{
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
+
+	if (cluster == NULL)
+		return -EINVAL;
+
+	read_lock(&cluster->cl_nodes_lock);
+	memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+	read_unlock(&cluster->cl_nodes_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
+
+static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
+						  __be32 ip_needle,
+						  struct rb_node ***ret_p,
+						  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2nm_node *node, *ret = NULL;
+
+	while (*p) {
+		parent = *p;
+		node = rb_entry(parent, struct o2nm_node, nd_ip_node);
+
+		if (memcmp(&ip_needle, &node->nd_ipv4_address,
+		           sizeof(ip_needle)) < 0)
+			p = &(*p)->rb_left;
+		else if (memcmp(&ip_needle, &node->nd_ipv4_address,
+			        sizeof(ip_needle)) > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = node;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
+{
+	struct o2nm_node *node = NULL;
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	if (cluster == NULL)
+		goto out;
+
+	read_lock(&cluster->cl_nodes_lock);
+	node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&cluster->cl_nodes_lock);
+
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
+
+void o2nm_node_put(struct o2nm_node *node)
+{
+	config_item_put(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_put);
+
+void o2nm_node_get(struct o2nm_node *node)
+{
+	config_item_get(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_get);
+
+u8 o2nm_this_node(void)
+{
+	u8 node_num = O2NM_MAX_NODES;
+
+	if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
+		node_num = o2nm_single_cluster->cl_local_node;
+
+	return node_num;
+}
+EXPORT_SYMBOL_GPL(o2nm_this_node);
+
+/* node configfs bits */
+
+static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct o2nm_cluster,
+			     cl_group)
+		: NULL;
+}
+
+static struct o2nm_node *to_o2nm_node(struct config_item *item)
+{
+	return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
+}
+
+static void o2nm_node_release(struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	kfree(node);
+}
+
+static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_num);
+}
+
+static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
+{
+	/* through the first node_set .parent
+	 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
+	return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+}
+
+enum {
+	O2NM_NODE_ATTR_NUM = 0,
+	O2NM_NODE_ATTR_PORT,
+	O2NM_NODE_ATTR_ADDRESS,
+	O2NM_NODE_ATTR_LOCAL,
+};
+
+static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+				   size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp >= O2NM_MAX_NODES)
+		return -ERANGE;
+
+	/* once we're in the cl_nodes tree networking can look us up by
+	 * node number and try to use our address and port attributes
+	 * to connect to this node.. make sure that they've been set
+	 * before writing the node attribute? */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	write_lock(&cluster->cl_nodes_lock);
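+	/* reuse 'p' as an in-use flag so -EEXIST can be reported after
+	 * the lock is dropped */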
+	if (cluster->cl_nodes[tmp])
+		p = NULL;
+	else  {
+		cluster->cl_nodes[tmp] = node;
+		node->nd_num = tmp;
+		set_bit(tmp, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (p == NULL)
+		return -EEXIST;
+
+	return count;
+}
+static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+}
+
+static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+					 const char *page, size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp == 0)
+		return -EINVAL;
+	if (tmp >= (u16)-1)
+		return -ERANGE;
+
+	node->nd_ipv4_port = htons(tmp);
+
+	return count;
+}
+
+static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
+}
+
+static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+					    const char *page,
+					    size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	int ret, i;
+	struct rb_node **p, *parent;
+	unsigned int octets[4];
+	__be32 ipv4_addr = 0;
+
+	ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
+		     &octets[1], &octets[0]);
+	if (ret != 4)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(octets); i++) {
+		if (octets[i] > 255)
+			return -ERANGE;
+		be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
+	}
+
+	ret = 0;
+	write_lock(&cluster->cl_nodes_lock);
+	if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
+		ret = -EEXIST;
+	else {
+		rb_link_node(&node->nd_ip_node, parent, p);
+		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (ret)
+		return ret;
+
+	memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
+
+	return count;
+}
+
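+/* Worked example (hypothetical input): writing "192.168.0.1" scans the
+ * octets in reverse (octets[3] = 192 ... octets[0] = 1), so the summed
+ * value is 0xc0a80001 and be32_add_cpu() leaves the address in network
+ * byte order -- the same layout o2nm_node_ip_tree_lookup() memcmp()s
+ * against. */
+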
+static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_local);
+}
+
+static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+				     size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+	ssize_t ret;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	tmp = !!tmp; /* boolean of whether this node wants to be local */
+
+	/* setting local turns on networking rx for now so we require having
+	 * set everything else first */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	/* the only failure case is trying to set a new local node
+	 * when a different one is already set */
+	if (tmp && tmp == cluster->cl_has_local &&
+	    cluster->cl_local_node != node->nd_num)
+		return -EBUSY;
+
+	/* bring up the rx thread if we're setting the new local node. */
+	if (tmp && !cluster->cl_has_local) {
+		ret = o2net_start_listening(node);
+		if (ret)
+			return ret;
+	}
+
+	if (!tmp && cluster->cl_has_local &&
+	    cluster->cl_local_node == node->nd_num) {
+		o2net_stop_listening(node);
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	node->nd_local = tmp;
+	if (node->nd_local) {
+		cluster->cl_has_local = tmp;
+		cluster->cl_local_node = node->nd_num;
+	}
+
+	return count;
+}
+
+struct o2nm_node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2nm_node *, char *);
+	ssize_t (*store)(struct o2nm_node *, const char *, size_t);
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_num = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "num",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_num_read,
+	.store	= o2nm_node_num_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_port",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_port_read,
+	.store	= o2nm_node_ipv4_port_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_address",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_address_read,
+	.store	= o2nm_node_ipv4_address_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_local = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "local",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_local_read,
+	.store	= o2nm_node_local_write,
+};
+
+static struct configfs_attribute *o2nm_node_attrs[] = {
+	[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
+	[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
+	[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
+	[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+	NULL,
+};
+
+static int o2nm_attr_index(struct configfs_attribute *attr)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
+		if (attr == o2nm_node_attrs[i])
+			return i;
+	}
+	BUG();
+	return 0;
+}
+
+static ssize_t o2nm_node_show(struct config_item *item,
+			      struct configfs_attribute *attr,
+			      char *page)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2nm_node_attr->show)
+		ret = o2nm_node_attr->show(node, page);
+	return ret;
+}
+
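+/* node attributes are effectively write-once: a successful store sets
+ * the attribute's bit in nd_set_attributes and any later write returns
+ * -EBUSY, so reconfiguring a node means dropping and re-creating it */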
+static ssize_t o2nm_node_store(struct config_item *item,
+			       struct configfs_attribute *attr,
+			       const char *page, size_t count)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	ssize_t ret;
+	int attr_index = o2nm_attr_index(attr);
+
+	if (o2nm_node_attr->store == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (test_bit(attr_index, &node->nd_set_attributes))
+		return -EBUSY;
+
+	ret = o2nm_node_attr->store(node, page, count);
+	if (ret < count)
+		goto out;
+
+	set_bit(attr_index, &node->nd_set_attributes);
+out:
+	return ret;
+}
+
+static struct configfs_item_operations o2nm_node_item_ops = {
+	.release		= o2nm_node_release,
+	.show_attribute		= o2nm_node_show,
+	.store_attribute	= o2nm_node_store,
+};
+
+static struct config_item_type o2nm_node_type = {
+	.ct_item_ops	= &o2nm_node_item_ops,
+	.ct_attrs	= o2nm_node_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
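+/*
+ * A sketch of how userspace drives this interface (the cluster and node
+ * names are examples; configfs is assumed mounted at /config as in
+ * Documentation/filesystems/configfs):
+ *
+ *	mkdir /config/cluster/mycluster
+ *	mkdir /config/cluster/mycluster/node/node7
+ *	echo 7           > /config/cluster/mycluster/node/node7/num
+ *	echo 7777        > /config/cluster/mycluster/node/node7/ipv4_port
+ *	echo 192.168.0.7 > /config/cluster/mycluster/node/node7/ipv4_address
+ *	echo 1           > /config/cluster/mycluster/node/node7/local
+ *
+ * "local" is written last, and only on the node itself: it brings up
+ * the network rx side, so num, ipv4_port and ipv4_address must already
+ * be set (see o2nm_node_local_write()).
+ */
+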
+/* node set */
+
+struct o2nm_node_group {
+	struct config_group ns_group;
+	/* some stuff? */
+};
+
+#if 0
+static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2nm_node_group, ns_group)
+		: NULL;
+}
+#endif
+
+static struct config_item *o2nm_node_group_make_item(struct config_group *group,
+						     const char *name)
+{
+	struct o2nm_node *node = NULL;
+	struct config_item *ret = NULL;
+
+	if (strlen(name) > O2NM_MAX_NAME_LEN)
+		goto out; /* ENAMETOOLONG */
+
+	node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
+	if (node == NULL)
+		goto out; /* ENOMEM */
+
+	strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
+	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
+	spin_lock_init(&node->nd_lock);
+
+	ret = &node->nd_item;
+
+out:
+	if (ret == NULL)
+		kfree(node);
+
+	return ret;
+}
+
+static void o2nm_node_group_drop_item(struct config_group *group,
+				      struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
+
+	o2net_disconnect_node(node);
+
+	if (cluster->cl_has_local &&
+	    (cluster->cl_local_node == node->nd_num)) {
+		cluster->cl_has_local = 0;
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+		o2net_stop_listening(node);
+	}
+
+	/* XXX call into net to stop this node from trading messages */
+
+	write_lock(&cluster->cl_nodes_lock);
+
+	/* XXX sloppy */
+	if (node->nd_ipv4_address)
+		rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+
+	/* nd_num might be 0 if the node number hasn't been set.. */
+	if (cluster->cl_nodes[node->nd_num] == node) {
+		cluster->cl_nodes[node->nd_num] = NULL;
+		clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_node_group_group_ops = {
+	.make_item	= o2nm_node_group_make_item,
+	.drop_item	= o2nm_node_group_drop_item,
+};
+
+static struct config_item_type o2nm_node_group_type = {
+	.ct_group_ops	= &o2nm_node_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster */
+
+static void o2nm_cluster_release(struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	kfree(cluster->cl_group.default_groups);
+	kfree(cluster);
+}
+
+static struct configfs_item_operations o2nm_cluster_item_ops = {
+	.release	= o2nm_cluster_release,
+};
+
+static struct config_item_type o2nm_cluster_type = {
+	.ct_item_ops	= &o2nm_cluster_item_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster set */
+
+struct o2nm_cluster_group {
+	struct configfs_subsystem cs_subsys;
+	/* some stuff? */
+};
+
+#if 0
+static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
+{
+	return group ?
+		container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
+	       : NULL;
+}
+#endif
+
+static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
+							  const char *name)
+{
+	struct o2nm_cluster *cluster = NULL;
+	struct o2nm_node_group *ns = NULL;
+	struct config_group *o2hb_group = NULL, *ret = NULL;
+	void *defs = NULL;
+
+	/* this runs under the parent dir's i_sem; there can be only
+	 * one caller in here at a time */
+	if (o2nm_single_cluster)
+		goto out; /* ENOSPC */
+
+	cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
+	ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
+	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	o2hb_group = o2hb_alloc_hb_set();
+	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+		goto out;
+
+	config_group_init_type_name(&cluster->cl_group, name,
+				    &o2nm_cluster_type);
+	config_group_init_type_name(&ns->ns_group, "node",
+				    &o2nm_node_group_type);
+
+	cluster->cl_group.default_groups = defs;
+	cluster->cl_group.default_groups[0] = &ns->ns_group;
+	cluster->cl_group.default_groups[1] = o2hb_group;
+	cluster->cl_group.default_groups[2] = NULL;
+	rwlock_init(&cluster->cl_nodes_lock);
+	cluster->cl_node_ip_tree = RB_ROOT;
+
+	ret = &cluster->cl_group;
+	o2nm_single_cluster = cluster;
+
+out:
+	if (ret == NULL) {
+		kfree(cluster);
+		kfree(ns);
+		o2hb_free_hb_set(o2hb_group);
+		kfree(defs);
+	}
+
+	return ret;
+}
+
+static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+	int i;
+	struct config_item *killme;
+
+	BUG_ON(o2nm_single_cluster != cluster);
+	o2nm_single_cluster = NULL;
+
+	for (i = 0; cluster->cl_group.default_groups[i]; i++) {
+		killme = &cluster->cl_group.default_groups[i]->cg_item;
+		cluster->cl_group.default_groups[i] = NULL;
+		config_item_put(killme);
+	}
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_cluster_group_group_ops = {
+	.make_group	= o2nm_cluster_group_make_group,
+	.drop_item	= o2nm_cluster_group_drop_item,
+};
+
+static struct config_item_type o2nm_cluster_group_type = {
+	.ct_group_ops	= &o2nm_cluster_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct o2nm_cluster_group o2nm_cluster_group = {
+	.cs_subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "cluster",
+				.ci_type = &o2nm_cluster_group_type,
+			},
+		},
+	},
+};
+
+static void __exit exit_o2nm(void)
+{
+	if (ocfs2_table_header)
+		unregister_sysctl_table(ocfs2_table_header);
+
+	/* XXX sync with hb callbacks and shut down hb? */
+	o2net_unregister_hb_callbacks();
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+	o2cb_sys_shutdown();
+
+	o2net_exit();
+}
+
+static int __init init_o2nm(void)
+{
+	int ret = -1;
+
+	cluster_print_version();
+
+	o2hb_init();
+	o2net_init();
+
+	ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
+	if (!ocfs2_table_header) {
+		printk(KERN_ERR "nodemanager: unable to register sysctl\n");
+		ret = -ENOMEM; /* or something. */
+		goto out;
+	}
+
+	ret = o2net_register_hb_callbacks();
+	if (ret)
+		goto out_sysctl;
+
+	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
+	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
+	if (ret) {
+		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
+		goto out_callbacks;
+	}
+
+	ret = o2cb_sys_init();
+	if (!ret)
+		goto out;
+
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+out_callbacks:
+	o2net_unregister_hb_callbacks();
+out_sysctl:
+	unregister_sysctl_table(ocfs2_table_header);
+out:
+	return ret;
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_o2nm)
+module_exit(exit_o2nm)

+ 64 - 0
fs/ocfs2/cluster/nodemanager.h

@@ -0,0 +1,64 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_NODEMANAGER_H
+#define O2CLUSTER_NODEMANAGER_H
+
+#include "ocfs2_nodemanager.h"
+
+/* This totally doesn't belong here. */
+#include <linux/configfs.h>
+#include <linux/rbtree.h>
+
+#define KERN_OCFS2		988
+#define KERN_OCFS2_NM		1
+
+const char *o2nm_get_hb_ctl_path(void);
+
+struct o2nm_node {
+	spinlock_t		nd_lock;
+	struct config_item	nd_item;
+	char			nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
+	__u8			nd_num;
+	/* only one address per node, as attributes, for now. */
+	__be32			nd_ipv4_address;
+	__be16			nd_ipv4_port;
+	struct rb_node		nd_ip_node;
+	/* there can be only one local node for now */
+	int			nd_local;
+
+	unsigned long		nd_set_attributes;
+};
+
+u8 o2nm_this_node(void);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
+void o2nm_node_get(struct o2nm_node *node);
+void o2nm_node_put(struct o2nm_node *node);
+
+#endif /* O2CLUSTER_NODEMANAGER_H */

+ 37 - 0
fs/ocfs2/cluster/ocfs2_heartbeat.h

@@ -0,0 +1,37 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_heartbeat.h
+ *
+ * On-disk structures for ocfs2_heartbeat
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _OCFS2_HEARTBEAT_H
+#define _OCFS2_HEARTBEAT_H
+
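+/* Each heartbeating node periodically writes a block like this into its
+ * own slot of a heartbeat region.  Roughly: hb_seq changes with every
+ * write, hb_node is the writer's node number, hb_cksum covers the
+ * block, and hb_generation identifies this incarnation of the node so
+ * a peer that died and came back can be told apart. */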
+struct o2hb_disk_heartbeat_block {
+	__le64 hb_seq;
+	__u8  hb_node;
+	__u8  hb_pad1[3];
+	__le32 hb_cksum;
+	__le64 hb_generation;
+};
+
+#endif /* _OCFS2_HEARTBEAT_H */

+ 39 - 0
fs/ocfs2/cluster/ocfs2_nodemanager.h

@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_nodemanager.h
+ *
+ * Header describing the interface between userspace and the kernel
+ * for the ocfs2_nodemanager module.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef _OCFS2_NODEMANAGER_H
+#define _OCFS2_NODEMANAGER_H
+
+#define O2NM_API_VERSION	5
+
+#define O2NM_MAX_NODES		255
+#define O2NM_INVALID_NODE_NUM	255
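+/* valid node numbers are 0 through O2NM_MAX_NODES - 1; 255 is reserved
+ * as the "no such node" marker */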
+
+/* host name, group name, cluster name all 64 bytes */
+#define O2NM_MAX_NAME_LEN        64    /* __NEW_UTS_LEN */
+
+#endif /* _OCFS2_NODEMANAGER_H */

+ 315 - 0
fs/ocfs2/cluster/quorum.c

@@ -0,0 +1,315 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* This quorum hack is only here until we transition to some more rational
+ * approach that is driven from userspace.  Honest.  No foolin'.
+ *
+ * Imagine two nodes lose network connectivity to each other but they're still
+ * up and operating in every other way.  Presumably a network timeout indicates
+ * that a node is broken and should be recovered.  They can't both recover each
+ * other and both carry on without serialising their access to the file system.
+ * They need to decide who is authoritative.  Now extend that problem to
+ * arbitrary groups of nodes losing connectivity between each other.
+ *
+ * So we declare that a node which has given up on connecting to a majority
+ * of nodes who are still heartbeating will fence itself.
+ *
+ * There are huge opportunities for races here.  After we give up on a node's
+ * connection we need to wait long enough to give heartbeat an opportunity
+ * to declare the node as truly dead.  We also need to be careful with the
+ * race between when we see a node start heartbeating and when we connect
+ * to it.
+ *
+ * So nodes that are in this transition put a hold on the quorum decision
+ * with a counter.  As they fall out of this transition they drop the count
+ * and if they're the last, they fire off the decision.
+ */
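+
+/*
+ * To make the arithmetic in o2quo_make_decision() below concrete: with
+ * five nodes heartbeating, quorum is (5 + 1) / 2 = 3 and a node fences
+ * itself once qs_connected drops below 3.  With four nodes, quorum is
+ * 4 / 2 = 2; a node fences below 2, and a node holding exactly 2 (a
+ * possible half-and-half split) also fences unless it can reach the
+ * lowest numbered heartbeating node, so at most one half survives.
+ */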
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_QUORUM
+#include "masklog.h"
+#include "quorum.h"
+
+static struct o2quo_state {
+	spinlock_t		qs_lock;
+	struct work_struct	qs_work;
+	int			qs_pending;
+	int			qs_heartbeating;
+	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_connected;
+	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_holds;
+	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+} o2quo_state;
+
+/* this is horribly heavy-handed.  It should instead flip the file
+ * system RO and call some userspace script. */
+static void o2quo_fence_self(void)
+{
+	/* panic spins with interrupts enabled.  with preempt
+	 * threads can still schedule, etc, etc */
+	o2hb_stop_all_regions();
+	panic("ocfs2 is very sorry to be fencing this system by panicking\n");
+}
+
+/* Indicate that a timeout occurred on a heartbeat region write. The
+ * other nodes in the cluster may consider us dead at that time so we
+ * want to "fence" ourselves so that we don't scribble on the disk
+ * after they think they've recovered us. This can't solve all
+ * problems related to writeout after recovery but this hack can at
+ * least close some of those gaps. When we have real fencing, this can
+ * go away as our node would be fenced externally before other nodes
+ * begin recovery. */
+void o2quo_disk_timeout(void)
+{
+	o2quo_fence_self();
+}
+
+static void o2quo_make_decision(void *arg)
+{
+	int quorum;
+	int lowest_hb, lowest_reachable = 0, fence = 0;
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
+	if (lowest_hb != O2NM_MAX_NODES)
+		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
+
+	mlog(0, "heartbeating: %d, connected: %d, "
+	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
+	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
+
+	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
+	    qs->qs_heartbeating == 1)
+		goto out;
+
+	if (qs->qs_heartbeating & 1) {
+		/* the odd numbered cluster case is straightforward --
+		 * if we can't talk to the majority we're hosed */
+		quorum = (qs->qs_heartbeating + 1)/2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+	} else {
+		/* the even numbered cluster adds the possibility of each half
+		 * of the cluster being able to talk amongst themselves.. in
+		 * that case we're hosed if we can't talk to the group that has
+		 * the lowest numbered node */
+		quorum = qs->qs_heartbeating / 2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+		else if ((qs->qs_connected == quorum) &&
+			 !lowest_reachable) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "connected to a half-quorum of %u out of %u "
+			     "nodes which doesn't include the lowest active "
+			     "node %u\n", quorum, qs->qs_heartbeating,
+			     lowest_hb);
+			fence = 1;
+		}
+	}
+
+out:
+	spin_unlock(&qs->qs_lock);
+	if (fence)
+		o2quo_fence_self();
+}
+
+static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
+		qs->qs_holds++;
+		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
+			        "node %u\n", node);
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
+	}
+}
+
+static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
+		if (--qs->qs_holds == 0) {
+			if (qs->qs_pending) {
+				qs->qs_pending = 0;
+				schedule_work(&qs->qs_work);
+			}
+		}
+		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
+				node, qs->qs_holds);
+	}
+}
+
+/* as a node comes up we delay the quorum decision until we know the fate of
+ * the connection.  the hold will be dropped in conn_up or hb_down.  it might be
+ * perpetuated by conn_err until hb_down.  if we already have a conn, we might
+ * be dropping a hold that conn_up got. */
+void o2quo_hb_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating++;
+	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	set_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	if (!test_bit(node, qs->qs_conn_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* hb going down releases any holds we might have had due to this node from
+ * conn_up, conn_err, or hb_up */
+void o2quo_hb_down(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating--;
+	mlog_bug_on_msg(qs->qs_heartbeating < 0,
+			"node %u, %d heartbeating\n",
+			node, qs->qs_heartbeating);
+	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	clear_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* this tells us that we've decided that the node is still heartbeating
+ * even though we've lost its conn.  it must only be called after conn_err
+ * and indicates that we must now make a quorum decision in the future,
+ * though we might be doing so after waiting for holds to drain.  Here
+ * we'll be dropping the hold from conn_err. */
+void o2quo_hb_still_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	mlog(0, "node %u\n", node);
+
+	qs->qs_pending = 1;
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* This is analogous to hb_up.  as a node's connection comes up we delay the
+ * quorum decision until we see it heartbeating.  the hold will be dropped in
+ * hb_up or hb_down.  it might be perpetuated by conn_err until hb_down.  if
+ * it's already heartbeating we might be dropping a hold that conn_up got.
+ */
+void o2quo_conn_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_connected++;
+	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
+	set_bit(node, qs->qs_conn_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (!test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* we've decided that we won't ever be connecting to the node again.  if it's
+ * still heartbeating we grab a hold that will delay decisions until either the
+ * node stops heartbeating from hb_down or the caller decides that the node is
+ * still up and calls still_up */
+void o2quo_conn_err(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	if (test_bit(node, qs->qs_conn_bm)) {
+		qs->qs_connected--;
+		mlog_bug_on_msg(qs->qs_connected < 0,
+				"node %u, connected %d\n",
+				node, qs->qs_connected);
+
+		clear_bit(node, qs->qs_conn_bm);
+	}
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+void o2quo_init(void)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock_init(&qs->qs_lock);
+	INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+}
+
+void o2quo_exit(void)
+{
+	flush_scheduled_work();
+}

+ 36 - 0
fs/ocfs2/cluster/quorum.h

@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_QUORUM_H
+#define O2CLUSTER_QUORUM_H
+
+void o2quo_init(void);
+void o2quo_exit(void);
+
+void o2quo_hb_up(u8 node);
+void o2quo_hb_down(u8 node);
+void o2quo_hb_still_up(u8 node);
+void o2quo_conn_up(u8 node);
+void o2quo_conn_err(u8 node);
+void o2quo_disk_timeout(void);
+
+#endif /* O2CLUSTER_QUORUM_H */

+ 124 - 0
fs/ocfs2/cluster/sys.c

@@ -0,0 +1,124 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.c
+ *
+ * OCFS2 cluster sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "ocfs2_nodemanager.h"
+#include "masklog.h"
+#include "sys.h"
+
+struct o2cb_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(char *buf);
+	ssize_t (*store)(const char *buf, size_t count);
+};
+
+#define O2CB_ATTR(_name, _mode, _show, _store)	\
+struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
+#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
+
+static ssize_t o2cb_interface_revision_show(char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
+}
+
+static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
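+
+/* Once the subsystem below is registered this appears as
+ * /sys/o2cb/interface_revision; userspace can sanity-check it against
+ * the O2NM_API_VERSION it was built for, e.g. (illustrative):
+ *
+ *	$ cat /sys/o2cb/interface_revision
+ *	5
+ */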
+
+static struct attribute *o2cb_attrs[] = {
+	&o2cb_attr_interface_revision.attr,
+	NULL,
+};
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+	   const char * buffer, size_t count);
+static struct sysfs_ops o2cb_sysfs_ops = {
+	.show	= o2cb_show,
+	.store	= o2cb_store,
+};
+
+static struct kobj_type o2cb_subsys_type = {
+	.default_attrs	= o2cb_attrs,
+	.sysfs_ops	= &o2cb_sysfs_ops,
+};
+
+/* gives us o2cb_subsys */
+static decl_subsys(o2cb, NULL, NULL);
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
+{
+	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+	struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+	BUG_ON(sbs != &o2cb_subsys);
+
+	if (o2cb_attr->show)
+		return o2cb_attr->show(buffer);
+	return -EIO;
+}
+
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+	     const char * buffer, size_t count)
+{
+	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+	struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+	BUG_ON(sbs != &o2cb_subsys);
+
+	if (o2cb_attr->store)
+		return o2cb_attr->store(buffer, count);
+	return -EIO;
+}
+
+void o2cb_sys_shutdown(void)
+{
+	mlog_sys_shutdown();
+	subsystem_unregister(&o2cb_subsys);
+}
+
+int o2cb_sys_init(void)
+{
+	int ret;
+
+	o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
+	ret = subsystem_register(&o2cb_subsys);
+	if (ret)
+		return ret;
+
+	ret = mlog_sys_init(&o2cb_subsys);
+	if (ret)
+		subsystem_unregister(&o2cb_subsys);
+	return ret;
+}

+ 33 - 0
fs/ocfs2/cluster/sys.h

@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.h
+ *
+ * Function prototypes for o2cb sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_SYS_H
+#define O2CLUSTER_SYS_H
+
+void o2cb_sys_shutdown(void);
+int o2cb_sys_init(void);
+
+#endif /* O2CLUSTER_SYS_H */

+ 1829 - 0
fs/ocfs2/cluster/tcp.c

@@ -0,0 +1,1829 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * ----
+ *
+ * Callers for this were originally written against a very simple synchronous
+ * API.  This implementation reflects those simple callers.  Some day I'm sure
+ * we'll need to move to a more robust posting/callback mechanism.
+ *
+ * Transmit calls pass in kernel virtual addresses and block copying this into
+ * the socket's tx buffers via a usual blocking sendmsg.  They'll block waiting
+ * for a failed socket to timeout.  TX callers can also pass in a pointer to an
+ * 'int' which gets filled with an errno off the wire in response to the
+ * message they send.
+ *
+ * Handlers for unsolicited messages are registered.  Each socket has a page
+ * that incoming data is copied into.  First the header, then the data.
+ * Handlers are called from only one thread with a reference to this per-socket
+ * page.  This page is destroyed after the handler call, so it can't be
+ * referenced beyond the call.  Handlers may block but are discouraged from
+ * doing so.
+ *
+ * Any framing errors (bad magic, large payload lengths) close a connection.
+ *
+ * Our sock_container holds the state we associate with a socket.  Its current
+ * framing state is held there as well as the refcounting we do around when it
+ * is safe to tear down the socket.  The socket is only finally torn down from
+ * the container when the container loses all of its references -- so as long
+ * as you hold a ref on the container you can trust that the socket is valid
+ * for use with kernel socket APIs.
+ *
+ * Connections are initiated between a pair of nodes when the node with the
+ * higher node number gets a heartbeat callback which indicates that the lower
+ * numbered node has started heartbeating.  The lower numbered node is passive
+ * and only accepts the connection if the higher numbered node is heartbeating.
+ */
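+
+/*
+ * A minimal sketch of a caller of this API (the msg type, key, handler
+ * and buffer names are invented for illustration; the entry points are
+ * the ones implemented below):
+ *
+ *	static int my_handler(struct o2net_msg *msg, u32 len, void *data)
+ *	{
+ *		return 0;	(handler status handed back to the sender)
+ *	}
+ *
+ *	static LIST_HEAD(my_unreg_list);
+ *	int status;
+ *
+ *	o2net_register_handler(MY_MSG_TYPE, MY_KEY, MY_MAX_LEN,
+ *			       my_handler, NULL, &my_unreg_list);
+ *	o2net_send_message(MY_MSG_TYPE, MY_KEY, buf, buflen,
+ *			   target_node, &status);
+ *	o2net_unregister_handler_list(&my_unreg_list);
+ */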
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <net/tcp.h>
+
+#include <asm/uaccess.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+#include "quorum.h"
+
+#include "tcp_internal.h"
+
+/* 
+ * The linux network stack isn't sparse endian clean.. It has macros like
+ * ntohs() which perform the endian checks and structs like sockaddr_in
+ * which aren't annotated.  So __force is found here to get the build
+ * clean.  When they emerge from the dark ages and annotate the code
+ * we can remove these.
+ */
+
+#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
+			  NIPQUAD(sc->sc_node->nd_ipv4_address),	\
+			  ntohs(sc->sc_node->nd_ipv4_port)
+
+/*
+ * In the following two log macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define msglog(hdr, fmt, args...) do {					\
+	typeof(hdr) __hdr = (hdr);					\
+	mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d "	\
+	     "key %08x num %u] " fmt,					\
+	     be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), 	\
+	     be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status),	\
+	     be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key),	\
+	     be32_to_cpu(__hdr->msg_num) ,  ##args);			\
+} while (0)
+
+#define sclog(sc, fmt, args...) do {					\
+	typeof(sc) __sc = (sc);						\
+	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
+	     "pg_off %zu] " fmt, __sc,					\
+	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
+	    ##args);							\
+} while (0)
+
+static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static struct rb_root o2net_handler_tree = RB_ROOT;
+
+static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
+
+/* XXX someday we'll need better accounting */
+static struct socket *o2net_listen_sock = NULL;
+
+/*
+ * listen work is only queued by the listening socket callbacks on the
+ * o2net_wq.  teardown detaches the callbacks before destroying the workqueue.
+ * quorum work is queued as sock containers are shutdown.. stop_listening
+ * tears down all the node's sock containers, preventing future shutdowns
+ * and queued quorum work, before canceling delayed quorum work and
+ * destroying the work queue.
+ */
+static struct workqueue_struct *o2net_wq;
+static struct work_struct o2net_listen_work;
+
+static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
+#define O2NET_HB_PRI 0x1
+
+static struct o2net_handshake *o2net_hand;
+static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
+
+static int o2net_sys_err_translations[O2NET_ERR_MAX] =
+		{[O2NET_ERR_NONE]	= 0,
+		 [O2NET_ERR_NO_HNDLR]	= -ENOPROTOOPT,
+		 [O2NET_ERR_OVERFLOW]	= -EOVERFLOW,
+		 [O2NET_ERR_DIED]	= -EHOSTDOWN,};
+
+/* can't quite avoid *all* internal declarations :/ */
+static void o2net_sc_connect_completed(void *arg);
+static void o2net_rx_until_empty(void *arg);
+static void o2net_shutdown_sc(void *arg);
+static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_sc_send_keep_req(void *arg);
+static void o2net_idle_timer(unsigned long data);
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+
+static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
+{
+	int trans;
+	BUG_ON(err >= O2NET_ERR_MAX);
+	trans = o2net_sys_err_translations[err];
+
+	/* Just in case we mess up the translation table above */
+	BUG_ON(err != O2NET_ERR_NONE && trans == 0);
+	return trans;
+}
+
+static struct o2net_node * o2net_nn_from_num(u8 node_num)
+{
+	BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
+	return &o2net_nodes[node_num];
+}
+
+static u8 o2net_num_from_nn(struct o2net_node *nn)
+{
+	BUG_ON(nn == NULL);
+	return nn - o2net_nodes;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
+{
+	int ret = 0;
+
+	do {
+		if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
+			ret = -EAGAIN;
+			break;
+		}
+		spin_lock(&nn->nn_lock);
+		ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
+		if (ret == 0)
+			list_add_tail(&nsw->ns_node_item,
+				      &nn->nn_status_list);
+		spin_unlock(&nn->nn_lock);
+	} while (ret == -EAGAIN);
+
+	if (ret == 0)  {
+		init_waitqueue_head(&nsw->ns_wq);
+		nsw->ns_sys_status = O2NET_ERR_NONE;
+		nsw->ns_status = 0;
+	}
+
+	return ret;
+}
+
+static void o2net_complete_nsw_locked(struct o2net_node *nn,
+				      struct o2net_status_wait *nsw,
+				      enum o2net_system_error sys_status,
+				      s32 status)
+{
+	assert_spin_locked(&nn->nn_lock);
+
+	if (!list_empty(&nsw->ns_node_item)) {
+		list_del_init(&nsw->ns_node_item);
+		nsw->ns_sys_status = sys_status;
+		nsw->ns_status = status;
+		idr_remove(&nn->nn_status_idr, nsw->ns_id);
+		wake_up(&nsw->ns_wq);
+	}
+}
+
+static void o2net_complete_nsw(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw,
+			       u64 id, enum o2net_system_error sys_status,
+			       s32 status)
+{
+	spin_lock(&nn->nn_lock);
+	if (nsw == NULL) {
+		if (id > INT_MAX)
+			goto out;
+
+		nsw = idr_find(&nn->nn_status_idr, id);
+		if (nsw == NULL)
+			goto out;
+	}
+
+	o2net_complete_nsw_locked(nn, nsw, sys_status, status);
+
+out:
+	spin_unlock(&nn->nn_lock);
+	return;
+}
+
+static void o2net_complete_nodes_nsw(struct o2net_node *nn)
+{
+	struct list_head *iter, *tmp;
+	unsigned int num_kills = 0;
+	struct o2net_status_wait *nsw;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	list_for_each_safe(iter, tmp, &nn->nn_status_list) {
+		nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+		o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
+		num_kills++;
+	}
+
+	mlog(0, "completed %d messages for node %u\n", num_kills,
+	     o2net_num_from_nn(nn));
+}
+
+static int o2net_nsw_completed(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw)
+{
+	int completed;
+	spin_lock(&nn->nn_lock);
+	completed = list_empty(&nsw->ns_node_item);
+	spin_unlock(&nn->nn_lock);
+	return completed;
+}
+
+/* ------------------------------------------------------------ */
+
+static void sc_kref_release(struct kref *kref)
+{
+	struct o2net_sock_container *sc = container_of(kref,
+					struct o2net_sock_container, sc_kref);
+	sclog(sc, "releasing\n");
+
+	if (sc->sc_sock) {
+		sock_release(sc->sc_sock);
+		sc->sc_sock = NULL;
+	}
+
+	o2nm_node_put(sc->sc_node);
+	sc->sc_node = NULL;
+
+	kfree(sc);
+}
+
+static void sc_put(struct o2net_sock_container *sc)
+{
+	sclog(sc, "put\n");
+	kref_put(&sc->sc_kref, sc_kref_release);
+}
+static void sc_get(struct o2net_sock_container *sc)
+{
+	sclog(sc, "get\n");
+	kref_get(&sc->sc_kref);
+}
+static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
+{
+	struct o2net_sock_container *sc, *ret = NULL;
+	struct page *page = NULL;
+
+	page = alloc_page(GFP_NOFS);
+	sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
+	if (sc == NULL || page == NULL)
+		goto out;
+
+	kref_init(&sc->sc_kref);
+	o2nm_node_get(node);
+	sc->sc_node = node;
+
+	INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
+	INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
+	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
+	INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
+
+	init_timer(&sc->sc_idle_timeout);
+	sc->sc_idle_timeout.function = o2net_idle_timer;
+	sc->sc_idle_timeout.data = (unsigned long)sc;
+
+	sclog(sc, "alloced\n");
+
+	ret = sc;
+	sc->sc_page = page;
+	sc = NULL;
+	page = NULL;
+
+out:
+	if (page)
+		__free_page(page);
+	kfree(sc);
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
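+/* work queued against an sc pins it: we take a ref before queueing and
+ * drop it again if the work was already pending; the work functions
+ * drop their ref (see e.g. o2net_shutdown_sc()) when they complete */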
+static void o2net_sc_queue_work(struct o2net_sock_container *sc,
+				struct work_struct *work)
+{
+	sc_get(sc);
+	if (!queue_work(o2net_wq, work))
+		sc_put(sc);
+}
+static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
+					struct work_struct *work,
+					int delay)
+{
+	sc_get(sc);
+	if (!queue_delayed_work(o2net_wq, work, delay))
+		sc_put(sc);
+}
+static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
+					 struct work_struct *work)
+{
+	if (cancel_delayed_work(work))
+		sc_put(sc);
+}
+
+static void o2net_set_nn_state(struct o2net_node *nn,
+			       struct o2net_sock_container *sc,
+			       unsigned valid, int err)
+{
+	int was_valid = nn->nn_sc_valid;
+	int was_err = nn->nn_persistent_error;
+	struct o2net_sock_container *old_sc = nn->nn_sc;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	/* the node num comparison and single connect/accept path should stop
+	 * a non-null sc from being overwritten with another */
+	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
+	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
+	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
+
+	/* we won't reconnect after our valid conn goes away for
+	 * this hb iteration.. here so it shows up in the logs */
+	if (was_valid && !valid && err == 0)
+		err = -ENOTCONN;
+
+	mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
+	     o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
+	     nn->nn_persistent_error, err);
+
+	nn->nn_sc = sc;
+	nn->nn_sc_valid = valid ? 1 : 0;
+	nn->nn_persistent_error = err;
+
+	/* mirrors o2net_tx_can_proceed() */
+	if (nn->nn_persistent_error || nn->nn_sc_valid)
+		wake_up(&nn->nn_sc_wq);
+
+	if (!was_err && nn->nn_persistent_error) {
+		o2quo_conn_err(o2net_num_from_nn(nn));
+		queue_delayed_work(o2net_wq, &nn->nn_still_up,
+				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+	}
+
+	if (was_valid && !valid) {
+		mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
+		     SC_NODEF_ARGS(old_sc));
+		o2net_complete_nodes_nsw(nn);
+	}
+
+	if (!was_valid && valid) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		/* this is a bit of a hack.  we only try reconnecting
+		 * when heartbeating starts until we get a connection.
+		 * if that connection then dies we don't try reconnecting.
+		 * the only way to start connecting again is to down
+		 * heartbeat and bring it back up. */
+		cancel_delayed_work(&nn->nn_connect_expired);
+		mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", 
+		     o2nm_this_node() > sc->sc_node->nd_num ?
+		     	"connected to" : "accepted connection from",
+		     SC_NODEF_ARGS(sc));
+	}
+
+	/* trigger the connecting worker func as long as we're not valid,
+	 * it will back off if it shouldn't connect.  This can be called
+	 * from node config teardown and so needs to be careful about
+	 * the work queue actually being up. */
+	if (!valid && o2net_wq) {
+		unsigned long delay;
+		/* delay if we're within a RECONNECT_DELAY of the
+		 * last attempt */
+		delay = (nn->nn_last_connect_attempt +
+			 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			- jiffies;
+		if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			delay = 0;
+		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
+		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+	}
+
+	/* keep track of the nn's sc ref for the caller */
+	if ((old_sc == NULL) && sc)
+		sc_get(sc);
+	if (old_sc && (old_sc != sc)) {
+		o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
+		sc_put(old_sc);
+	}
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	read_lock(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		struct o2net_sock_container *sc = sk->sk_user_data;
+		sclog(sc, "data_ready hit\n");
+		do_gettimeofday(&sc->sc_tv_data_ready);
+		o2net_sc_queue_work(sc, &sc->sc_rx_work);
+		ready = sc->sc_data_ready;
+	} else {
+		ready = sk->sk_data_ready;
+	}
+	read_unlock(&sk->sk_callback_lock);
+
+	ready(sk, bytes);
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_state_change(struct sock *sk)
+{
+	void (*state_change)(struct sock *sk);
+	struct o2net_sock_container *sc;
+
+	read_lock(&sk->sk_callback_lock);
+	sc = sk->sk_user_data;
+	if (sc == NULL) {
+		state_change = sk->sk_state_change;
+		goto out;
+	}
+
+	sclog(sc, "state_change to %d\n", sk->sk_state);
+
+	state_change = sc->sc_state_change;
+
+	switch(sk->sk_state) {
+		/* ignore connecting sockets as they make progress */
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			break;
+		case TCP_ESTABLISHED:
+			o2net_sc_queue_work(sc, &sc->sc_connect_work);
+			break;
+		default:
+			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+			break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+	state_change(sk);
+}
+
+/*
+ * we register callbacks so we can queue work on events before calling
+ * the original callbacks.  our callbacks are careful to test user_data
+ * to discover when they've raced with o2net_unregister_callbacks().
+ */
+static void o2net_register_callbacks(struct sock *sk,
+				     struct o2net_sock_container *sc)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+
+	/* accepted sockets inherit the old listen socket data ready */
+	if (sk->sk_data_ready == o2net_listen_data_ready) {
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+
+	BUG_ON(sk->sk_user_data != NULL);
+	sk->sk_user_data = sc;
+	sc_get(sc);
+
+	sc->sc_data_ready = sk->sk_data_ready;
+	sc->sc_state_change = sk->sk_state_change;
+	sk->sk_data_ready = o2net_data_ready;
+	sk->sk_state_change = o2net_state_change;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int o2net_unregister_callbacks(struct sock *sk,
+			           struct o2net_sock_container *sc)
+{
+	int ret = 0;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data == sc) {
+		ret = 1;
+		sk->sk_user_data = NULL;
+		sk->sk_data_ready = sc->sc_data_ready;
+		sk->sk_state_change = sc->sc_state_change;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	return ret;
+}
+
+/*
+ * this is a little helper that is called by callers who have seen a problem
+ * with an sc and want to detach it from the nn if someone hasn't already beaten
+ * them to it.  if an error is given then the shutdown will be persistent
+ * and pending transmits will be canceled.
+ */
+static void o2net_ensure_shutdown(struct o2net_node *nn,
+			           struct o2net_sock_container *sc,
+				   int err)
+{
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc == sc)
+		o2net_set_nn_state(nn, NULL, 0, err);
+	spin_unlock(&nn->nn_lock);
+}
+
+/*
+ * This work queue function performs the blocking parts of socket shutdown.  A
+ * few paths lead here.  set_nn_state will trigger this callback if it sees an
+ * sc detached from the nn.  state_change will also trigger this callback
+ * directly when it sees errors.  In that case we need to call set_nn_state
+ * ourselves as state_change couldn't get the nn_lock and call set_nn_state
+ * itself.
+ */
+static void o2net_shutdown_sc(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	sclog(sc, "shutting down\n");
+
+	/* drop the callbacks ref and call shutdown only once */
+	if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
+		/* we shouldn't flush as we're in the thread, the
+		 * races with pending sc work structs are harmless */
+		del_timer_sync(&sc->sc_idle_timeout);
+		o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+		sc_put(sc);
+		sc->sc_sock->ops->shutdown(sc->sc_sock,
+					   RCV_SHUTDOWN|SEND_SHUTDOWN);
+	}
+
+	/* not fatal so failed connects before the other guy has our
+	 * heartbeat can be retried */
+	o2net_ensure_shutdown(nn, sc, 0);
+	sc_put(sc);
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
+			     u32 key)
+{
+	int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
+
+	if (ret == 0)
+		ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
+
+	return ret;
+}
+
+static struct o2net_msg_handler *
+o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
+			  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &o2net_handler_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2net_msg_handler *nmh, *ret = NULL;
+	int cmp;
+
+	while (*p) {
+		parent = *p;
+		nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+		cmp = o2net_handler_cmp(nmh, msg_type, key);
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = nmh;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+static void o2net_handler_kref_release(struct kref *kref)
+{
+	struct o2net_msg_handler *nmh;
+	nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
+
+	kfree(nmh);
+}
+
+static void o2net_handler_put(struct o2net_msg_handler *nmh)
+{
+	kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+}
+
+/* max_len is protection for the handler func.  incoming messages won't
+ * be given to the handler if their payload is longer than the max. */
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   struct list_head *unreg_list)
+{
+	struct o2net_msg_handler *nmh = NULL;
+	struct rb_node **p, *parent;
+	int ret = 0;
+
+	if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "max_len for message handler out of range: %u\n",
+			max_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!msg_type) {
+		mlog(0, "no message type provided: %u, %p\n", msg_type, func);
+		ret = -EINVAL;
+		goto out;
+
+	}
+	if (!func) {
+		mlog(0, "no message handler provided: %u, %p\n",
+		       msg_type, func);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
+	if (nmh == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nmh->nh_func = func;
+	nmh->nh_func_data = data;
+	nmh->nh_msg_type = msg_type;
+	nmh->nh_max_len = max_len;
+	nmh->nh_key = key;
+	/* the tree and list get this ref.. they're both removed in
+	 * unregister when this ref is dropped */
+	kref_init(&nmh->nh_kref);
+	INIT_LIST_HEAD(&nmh->nh_unregister_item);
+
+	write_lock(&o2net_handler_lock);
+	if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
+		ret = -EEXIST;
+	else {
+	        rb_link_node(&nmh->nh_node, parent, p);
+		rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
+		list_add_tail(&nmh->nh_unregister_item, unreg_list);
+
+		mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
+		     func, msg_type, key);
+		/* we've had some trouble with handlers seemingly vanishing. */
+		mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
+							  &parent) == NULL,
+			        "couldn't find handler we *just* registered "
+				"for type %u key %08x\n", msg_type, key);
+	}
+	write_unlock(&o2net_handler_lock);
+
+out:
+	if (ret)
+		kfree(nmh);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_register_handler);
+
+void o2net_unregister_handler_list(struct list_head *list)
+{
+	struct list_head *pos, *n;
+	struct o2net_msg_handler *nmh;
+
+	write_lock(&o2net_handler_lock);
+	list_for_each_safe(pos, n, list) {
+		nmh = list_entry(pos, struct o2net_msg_handler,
+				 nh_unregister_item);
+		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
+		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
+		rb_erase(&nmh->nh_node, &o2net_handler_tree);
+		list_del_init(&nmh->nh_unregister_item);
+		kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+	}
+	write_unlock(&o2net_handler_lock);
+}
+EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
+
+static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
+{
+	struct o2net_msg_handler *nmh;
+
+	read_lock(&o2net_handler_lock);
+	nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
+	if (nmh)
+		kref_get(&nmh->nh_kref);
+	read_unlock(&o2net_handler_lock);
+
+	return nmh;
+}
+
+/* ------------------------------------------------------------ */
+
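+/* sock_recvmsg() and sock_sendmsg() check their iovecs against the
+ * current address limit, so these helpers widen it with
+ * set_fs(get_ds()) for the duration of the call to let kernel-space
+ * buffers through */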
+static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct kvec vec = {
+		.iov_len = len,
+		.iov_base = data,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&vec,
+		.msg_flags = MSG_DONTWAIT,
+	};
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
+			      size_t veclen, size_t total)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct msghdr msg = {
+		.msg_iov = (struct iovec *)vec,
+		.msg_iovlen = veclen,
+	};
+
+	if (sock == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ret = sock_sendmsg(sock, &msg, total);
+	set_fs(oldfs);
+	if (ret != total) {
+		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
+		     total);
+		if (ret >= 0)
+			ret = -EPIPE; /* should be smarter, I bet */
+		goto out;
+	}
+
+	ret = 0;
+out:
+	if (ret < 0)
+		mlog(0, "returning error: %d\n", ret);
+	return ret;
+}
+
+static void o2net_sendpage(struct o2net_sock_container *sc,
+			   void *kmalloced_virt,
+			   size_t size)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	ssize_t ret;
+
+
+	ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
+					 virt_to_page(kmalloced_virt),
+					 (long)kmalloced_virt & ~PAGE_MASK,
+					 size, MSG_DONTWAIT);
+	if (ret != size) {
+		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 
+		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+}
+
+static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
+{
+	memset(msg, 0, sizeof(struct o2net_msg));
+	msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
+	msg->data_len = cpu_to_be16(data_len);
+	msg->msg_type = cpu_to_be16(msg_type);
+	msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
+	msg->status = 0;
+	msg->key = cpu_to_be32(key);
+}
+
+static int o2net_tx_can_proceed(struct o2net_node *nn,
+			        struct o2net_sock_container **sc_ret,
+				int *error)
+{
+	int ret = 0;
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_persistent_error) {
+		ret = 1;
+		*sc_ret = NULL;
+		*error = nn->nn_persistent_error;
+	} else if (nn->nn_sc_valid) {
+		kref_get(&nn->nn_sc->sc_kref);
+
+		ret = 1;
+		*sc_ret = nn->nn_sc;
+		*error = 0;
+	}
+	spin_unlock(&nn->nn_lock);
+
+	return ret;
+}
+
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
+			   size_t caller_veclen, u8 target_node, int *status)
+{
+	int ret, error = 0;
+	struct o2net_msg *msg = NULL;
+	size_t veclen, caller_bytes = 0;
+	struct kvec *vec = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn = o2net_nn_from_num(target_node);
+	struct o2net_status_wait nsw = {
+		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
+	};
+
+	if (o2net_wq == NULL) {
+		mlog(0, "attempt to tx without o2netd running\n");
+		ret = -ESRCH;
+		goto out;
+	}
+
+	if (caller_veclen == 0) {
+		mlog(0, "bad kvec array length\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
+	if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "total payload len %zu too large\n", caller_bytes);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (target_node == o2nm_this_node()) {
+		ret = -ELOOP;
+		goto out;
+	}
+
+	ret = wait_event_interruptible(nn->nn_sc_wq,
+				       o2net_tx_can_proceed(nn, &sc, &error));
+	if (!ret && error)
+		ret = error;
+	if (ret)
+		goto out;
+
+	veclen = caller_veclen + 1;
+	vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
+	if (vec == NULL) {
+		mlog(0, "failed to allocate a %zu element kvec!\n", veclen);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
+	if (!msg) {
+		mlog(0, "failed to allocate an o2net_msg!\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	o2net_init_msg(msg, caller_bytes, msg_type, key);
+
+	vec[0].iov_len = sizeof(struct o2net_msg);
+	vec[0].iov_base = msg;
+	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
+
+	ret = o2net_prep_nsw(nn, &nsw);
+	if (ret)
+		goto out;
+
+	msg->msg_num = cpu_to_be32(nsw.ns_id);
+
+	/* finally, convert the message header to network byte-order
+	 * and send */
+	ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
+				 sizeof(struct o2net_msg) + caller_bytes);
+	msglog(msg, "sending returned %d\n", ret);
+	if (ret < 0) {
+		mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
+		goto out;
+	}
+
+	/* wait on other node's handler */
+	wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+
+	/* Note that we avoid overwriting the callers status return
+	 * variable if a system error was reported on the other
+	 * side. Callers beware. */
+	ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
+	if (status && !ret)
+		*status = nsw.ns_status;
+
+	mlog(0, "woken, returning system status %d, user status %d\n",
+	     ret, nsw.ns_status);
+out:
+	if (sc)
+		sc_put(sc);
+	kfree(vec);
+	kfree(msg);
+	o2net_complete_nsw(nn, &nsw, 0, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_send_message_vec);
+
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status)
+{
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+	return o2net_send_message_vec(msg_type, key, &vec, 1,
+				      target_node, status);
+}
+EXPORT_SYMBOL_GPL(o2net_send_message);
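The vectored send above relies on a classic scatter/gather trick: the header is prepended as an extra kvec slot so header and payload leave in a single transmission without copying the payload. A hedged userspace analogue using writev(), all names illustrative:

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

static ssize_t send_with_header(int fd, void *hdr, size_t hdrlen,
				const struct iovec *caller_vec,
				int caller_veclen)
{
	int veclen = caller_veclen + 1;
	struct iovec *vec = calloc(veclen, sizeof(*vec));
	ssize_t ret;

	if (vec == NULL)
		return -1;
	vec[0].iov_base = hdr;		/* header always rides in slot 0 */
	vec[0].iov_len = hdrlen;
	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(*vec));
	ret = writev(fd, vec, veclen);	/* one syscall, payload not copied */
	free(vec);
	return ret;
}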
+
+static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
+				   enum o2net_system_error syserr, int err)
+{
+	struct kvec vec = {
+		.iov_base = hdr,
+		.iov_len = sizeof(struct o2net_msg),
+	};
+
+	BUG_ON(syserr >= O2NET_ERR_MAX);
+
+	/* leave other fields intact from the incoming message, msg_num
+	 * in particular */
+	hdr->sys_status = cpu_to_be32(syserr);
+	hdr->status = cpu_to_be32(err);
+	hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); /* twiddle the magic */
+	hdr->data_len = 0;
+
+	msglog(hdr, "about to send status magic %d\n", err);
+	/* hdr has been in network byte order this whole time */
+	return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
+}
+
+/* this returns -errno if the header was unknown or too large, etc.
+ * after this is called the buffer is reused for the next message */
+static int o2net_process_message(struct o2net_sock_container *sc,
+				 struct o2net_msg *hdr)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	int ret = 0, handler_status;
+	enum  o2net_system_error syserr;
+	struct o2net_msg_handler *nmh = NULL;
+
+	msglog(hdr, "processing message\n");
+
+	o2net_sc_postpone_idle(sc);
+
+	switch (be16_to_cpu(hdr->magic)) {
+		case O2NET_MSG_STATUS_MAGIC:
+			/* special type for returning message status */
+			o2net_complete_nsw(nn, NULL,
+					   be32_to_cpu(hdr->msg_num),
+					   be32_to_cpu(hdr->sys_status),
+					   be32_to_cpu(hdr->status));
+			goto out;
+		case O2NET_MSG_KEEP_REQ_MAGIC:
+			o2net_sendpage(sc, o2net_keep_resp,
+				       sizeof(*o2net_keep_resp));
+			goto out;
+		case O2NET_MSG_KEEP_RESP_MAGIC:
+			goto out;
+		case O2NET_MSG_MAGIC:
+			break;
+		default:
+			msglog(hdr, "bad magic\n");
+			ret = -EINVAL;
+			goto out;
+	}
+
+	/* find a handler for it */
+	handler_status = 0;
+	nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
+				be32_to_cpu(hdr->key));
+	if (!nmh) {
+		mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
+		     be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
+		syserr = O2NET_ERR_NO_HNDLR;
+		goto out_respond;
+	}
+
+	syserr = O2NET_ERR_NONE;
+
+	if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
+		syserr = O2NET_ERR_OVERFLOW;
+
+	if (syserr != O2NET_ERR_NONE)
+		goto out_respond;
+
+	do_gettimeofday(&sc->sc_tv_func_start);
+	sc->sc_msg_key = be32_to_cpu(hdr->key);
+	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
+	handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
+					     be16_to_cpu(hdr->data_len),
+					nmh->nh_func_data);
+	do_gettimeofday(&sc->sc_tv_func_stop);
+
+out_respond:
+	/* this destroys the hdr, so don't use it after this */
+	ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
+				      handler_status);
+	hdr = NULL;
+	mlog(0, "sending handler status %d, syserr %d returned %d\n",
+	     handler_status, syserr, ret);
+
+out:
+	if (nmh)
+		o2net_handler_put(nmh);
+	return ret;
+}
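Dispatch hinges entirely on the 16-bit magic: status replies, keepalive traffic, and real messages are separated before any handler lookup happens. A small userspace sketch of that demux (the constants mirror the ones defined in tcp_internal.h; the helper itself is illustrative):

#include <stdint.h>
#include <arpa/inet.h>

enum {
	MSG_MAGIC	= 0xfa55,	/* a normal message */
	STATUS_MAGIC	= 0xfa56,	/* completes a waiting sender */
	KEEP_REQ_MAGIC	= 0xfa57,	/* answered with a keepalive resp */
	KEEP_RESP_MAGIC	= 0xfa58,	/* nothing to do */
};

/* 0: process as a normal message, 1: consumed internally,
 * -1: unknown magic, drop the connection */
static int demux_magic(uint16_t wire_magic)
{
	switch (ntohs(wire_magic)) {
	case STATUS_MAGIC:
	case KEEP_REQ_MAGIC:
	case KEEP_RESP_MAGIC:
		return 1;
	case MSG_MAGIC:
		return 0;
	default:
		return -1;
	}
}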
+
+static int o2net_check_handshake(struct o2net_sock_container *sc)
+{
+	struct o2net_handshake *hand = page_address(sc->sc_page);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
+		     "version %llu but %llu is required, disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     (unsigned long long)be64_to_cpu(hand->protocol_version),
+		     O2NET_PROTOCOL_VERSION);
+
+		/* don't bother reconnecting if it's the wrong version. */
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	sc->sc_handshake_ok = 1;
+
+	spin_lock(&nn->nn_lock);
+	/* set valid and queue the idle timers only if it hasn't been
+	 * shut down already */
+	if (nn->nn_sc == sc) {
+		o2net_sc_postpone_idle(sc);
+		o2net_set_nn_state(nn, sc, 1, 0);
+	}
+	spin_unlock(&nn->nn_lock);
+
+	/* shift everything up as though it wasn't there */
+	sc->sc_page_off -= sizeof(struct o2net_handshake);
+	if (sc->sc_page_off)
+		memmove(hand, hand + 1, sc->sc_page_off);
+
+	return 0;
+}
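The memmove at the end is worth noting: once the fixed-size handshake at the front of the rx buffer has been validated, the remaining bytes are shifted down so the message parser never sees it. A userspace sketch of that consume-a-prefix step (names illustrative):

#include <stddef.h>
#include <string.h>

/* caller guarantees used >= prefix; returns the new fill level */
static size_t consume_prefix(char *buf, size_t used, size_t prefix)
{
	used -= prefix;
	if (used)
		memmove(buf, buf + prefix, used);
	return used;
}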
+
+/* this demuxes the queued rx bytes into header or payload bits and calls
+ * handlers as each full message is read off the socket.  it returns -error,
+ * == 0 eof, or > 0 for progress made. */
+static int o2net_advance_rx(struct o2net_sock_container *sc)
+{
+	struct o2net_msg *hdr;
+	int ret = 0;
+	void *data;
+	size_t datalen;
+
+	sclog(sc, "receiving\n");
+	do_gettimeofday(&sc->sc_tv_advance_start);
+
+	/* do we need more header? */
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0) {
+			sc->sc_page_off += ret;
+
+			/* this working relies on the handshake being
+			 * smaller than the normal message header */
+			if (sc->sc_page_off >= sizeof(struct o2net_handshake) &&
+			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
+				ret = -EPROTO;
+				goto out;
+			}
+
+			/* the length is only validated here.. we can
+			 * only get here once as we cross from
+			 * being under to over */
+			if (sc->sc_page_off == sizeof(struct o2net_msg)) {
+				hdr = page_address(sc->sc_page);
+				if (be16_to_cpu(hdr->data_len) >
+				    O2NET_MAX_PAYLOAD_BYTES)
+					ret = -EOVERFLOW;
+			}
+		}
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		/* oof, still don't have a header */
+		goto out;
+	}
+
+	/* the header stays in network byte order; its length was
+	 * validated above when we first read it */
+	hdr = page_address(sc->sc_page);
+
+	msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
+
+	/* do we need more payload? */
+	if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
+		/* need more payload */
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
+			  sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0)
+			sc->sc_page_off += ret;
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
+		/* we can only get here once, the first time we read
+		 * the payload.. so set ret to progress if the handler
+		 * works out. after calling this the message is toast */
+		ret = o2net_process_message(sc, hdr);
+		if (ret == 0)
+			ret = 1;
+		sc->sc_page_off = 0;
+	}
+
+out:
+	sclog(sc, "ret = %d\n", ret);
+	do_gettimeofday(&sc->sc_tv_advance_stop);
+	return ret;
+}
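Because the socket can return short reads at any point, progress is tracked in sc_page_off and the function simply resumes where it left off: header first, then the payload length learned from the header. A hedged userspace sketch of the same two-phase parser, assuming the 24-byte header laid out in tcp.h (all names illustrative):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>

struct rx_state {
	char	buf[4096];
	size_t	off;		/* bytes accumulated so far */
};

#define HDR_LEN	24		/* size of the fixed wire header */

static int rx_advance(int fd, struct rx_state *rx)
{
	size_t want;
	ssize_t n;

	if (rx->off < HDR_LEN) {
		want = HDR_LEN - rx->off;	/* still collecting header */
	} else {
		uint16_t dlen_be, dlen;

		memcpy(&dlen_be, rx->buf + 2, sizeof(dlen_be));
		dlen = ntohs(dlen_be);
		if (dlen > sizeof(rx->buf) - HDR_LEN)
			return -1;		/* oversized payload */
		if (rx->off == HDR_LEN + dlen) {
			/* full message: dispatch to a handler here */
			rx->off = 0;
			return 1;
		}
		want = HDR_LEN + dlen - rx->off;
	}

	n = recv(fd, rx->buf + rx->off, want, 0);
	if (n <= 0)
		return (int)n;			/* 0 == eof, < 0 == error */
	rx->off += (size_t)n;
	return 1;				/* made progress */
}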
+
+/* this work func is triggered by data ready.  it reads until it can read no
+ * more.  it interprets 0, eof, as fatal.  if data_ready hits while we're doing
+ * our work the work struct will be marked and we'll be called again. */
+static void o2net_rx_until_empty(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+	int ret;
+
+	do {
+		ret = o2net_advance_rx(sc);
+	} while (ret > 0);
+
+	if (ret <= 0 && ret != -EAGAIN) {
+		struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+		sclog(sc, "saw error %d, closing\n", ret);
+		/* not permanent so read failed handshake can retry */
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+
+	sc_put(sc);
+}
+
+static int o2net_set_nodelay(struct socket *sock)
+{
+	int ret, val = 1;
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Dear unsuspecting programmer,
+	 *
+	 * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
+	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
+	 * silently turn into SO_DEBUG.
+	 *
+	 * Yours,
+	 * Keeper of hilariously fragile interfaces.
+	 */
+	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				    (char __user *)&val, sizeof(val));
+
+	set_fs(oldfs);
+	return ret;
+}
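For contrast, the userspace API is far less fragile: setsockopt() takes the level explicitly, so TCP_NODELAY cannot be misrouted the way the comment above warns. A minimal sketch:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int set_nodelay(int fd)
{
	int val = 1;

	/* disable Nagle so small cluster messages aren't batched */
	return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
}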
+
+/* ------------------------------------------------------------ */
+
+/* called when a connect completes and after a sock is accepted.  the
+ * rx path will see the response and mark the sc valid */
+static void o2net_sc_connect_completed(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+
+	mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
+              (unsigned long long)O2NET_PROTOCOL_VERSION,
+	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
+
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+	sc_put(sc);
+}
+
+/* this is called as a work_struct func. */
+static void o2net_sc_send_keep_req(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+
+	o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
+	sc_put(sc);
+}
+
+/* socket shutdown does a del_timer_sync against this as it tears down.
+ * we can't start this timer until we've got to the point in sc buildup
+ * where shutdown is going to be involved */
+static void o2net_idle_timer(unsigned long data)
+{
+	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct timeval now;
+
+	do_gettimeofday(&now);
+
+	mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
+	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
+	mlog(ML_NOTICE, "here are some times that might help debug the "
+	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
+	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
+	     sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
+	     now.tv_sec, now.tv_usec,
+	     sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
+	     sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
+	     sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
+	     sc->sc_msg_key, sc->sc_msg_type,
+	     sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
+	     sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
+
+	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+				    O2NET_KEEPALIVE_DELAY_SECS * HZ);
+	do_gettimeofday(&sc->sc_tv_timer);
+	mod_timer(&sc->sc_idle_timeout,
+		  jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
+}
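The pattern here is a deadline that is pushed back on every sign of life, so only a genuinely quiet connection ever fires the idle timer. A userspace sketch of the same idea using monotonic time instead of a kernel timer (names and the 10-second constant mirror the code above but are illustrative):

#include <time.h>

#define IDLE_TIMEOUT_SECS 10

static time_t idle_deadline;

/* call on every data_ready / message processed */
static void postpone_idle(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	idle_deadline = now.tv_sec + IDLE_TIMEOUT_SECS;
}

/* a periodic checker shuts the connection down once this is true */
static int connection_is_idle(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return now.tv_sec >= idle_deadline;
}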
+
+/* this work func is kicked whenever a path sets the nn state which doesn't
+ * have valid set.  This includes seeing hb come up, losing a connection,
+ * having a connect attempt fail, etc. This centralizes the logic which decides
+ * if a connect attempt should be made or if we should give up and all future
+ * transmit attempts should fail */
+static void o2net_start_connect(void *arg)
+{
+	struct o2net_node *nn = arg;
+	struct o2net_sock_container *sc = NULL;
+	struct o2nm_node *node = NULL;
+	struct socket *sock = NULL;
+	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
+	int ret = 0;
+
+	/* if we're greater we initiate tx, otherwise we accept */
+	if (o2nm_this_node() <= o2net_num_from_nn(nn))
+		goto out;
+
+	/* watch for racing with tearing a node down */
+	node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
+	if (node == NULL) {
+		ret = 0;
+		goto out;
+	}
+
+	spin_lock(&nn->nn_lock);
+	/* see if we already have one pending or have given up */
+	if (nn->nn_sc || nn->nn_persistent_error)
+		arg = NULL;
+	spin_unlock(&nn->nn_lock);
+	if (arg == NULL) /* *shrug*, needed some indicator */
+		goto out;
+
+	nn->nn_last_connect_attempt = jiffies;
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		mlog(0, "couldn't allocate sc\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(0, "can't create socket: %d\n", ret);
+		goto out;
+	}
+	sc->sc_sock = sock; /* freed by sc_kref_release */
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_port = (__force u16)htons(0); /* any port */
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
+			      sizeof(myaddr));
+	if (ret) {
+		mlog(0, "bind failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = o2net_set_nodelay(sc->sc_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+
+	spin_lock(&nn->nn_lock);
+	/* handshake completion will set nn->nn_sc_valid */
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	remoteaddr.sin_family = AF_INET;
+	remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
+	remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
+
+	ret = sc->sc_sock->ops->connect(sc->sc_sock,
+					(struct sockaddr *)&remoteaddr,
+					sizeof(remoteaddr),
+					O_NONBLOCK);
+	if (ret == -EINPROGRESS)
+		ret = 0;
+
+out:
+	if (ret) {
+		mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
+		     "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+		/* 0 err so that another will be queued and attempted
+		 * from set_nn_state */
+		if (sc)
+			o2net_ensure_shutdown(nn, sc, 0);
+	}
+	if (sc)
+		sc_put(sc);
+	if (node)
+		o2nm_node_put(node);
+
+	return;
+}
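The connect is deliberately nonblocking: -EINPROGRESS is treated as success, and completion is reported later through the socket callbacks registered above. A userspace sketch of the same pattern, where EINPROGRESS plays the same role (names illustrative):

#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int start_connect(const char *ip, uint16_t port)
{
	struct sockaddr_in remote;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	fcntl(fd, F_SETFL, O_NONBLOCK);

	memset(&remote, 0, sizeof(remote));
	remote.sin_family = AF_INET;
	remote.sin_port = htons(port);
	inet_pton(AF_INET, ip, &remote.sin_addr);

	if (connect(fd, (struct sockaddr *)&remote, sizeof(remote)) < 0 &&
	    errno != EINPROGRESS) {
		close(fd);		/* immediate, real failure */
		return -1;
	}
	return fd;	/* in flight: poll the fd for writability */
}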
+
+static void o2net_connect_expired(void *arg)
+{
+	struct o2net_node *nn = arg;
+
+	spin_lock(&nn->nn_lock);
+	if (!nn->nn_sc_valid) {
+		mlog(ML_ERROR, "no connection established with node %u after "
+		     "%u seconds, giving up and returning errors.\n",
+		     o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
+
+		o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	}
+	spin_unlock(&nn->nn_lock);
+}
+
+static void o2net_still_up(void *arg)
+{
+	struct o2net_node *nn = arg;
+
+	o2quo_hb_still_up(o2net_num_from_nn(nn));
+}
+
+/* ------------------------------------------------------------ */
+
+void o2net_disconnect_node(struct o2nm_node *node)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
+
+	/* don't reconnect until it's heartbeating again */
+	spin_lock(&nn->nn_lock);
+	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	spin_unlock(&nn->nn_lock);
+
+	if (o2net_wq) {
+		cancel_delayed_work(&nn->nn_connect_expired);
+		cancel_delayed_work(&nn->nn_connect_work);
+		cancel_delayed_work(&nn->nn_still_up);
+		flush_workqueue(o2net_wq);
+	}
+}
+
+static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
+				  void *data)
+{
+	o2quo_hb_down(node_num);
+
+	if (node_num != o2nm_this_node())
+		o2net_disconnect_node(node);
+}
+
+static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
+				void *data)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node_num);
+
+	o2quo_hb_up(node_num);
+
+	/* ensure an immediate connect attempt */
+	nn->nn_last_connect_attempt = jiffies -
+		(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
+
+	if (node_num != o2nm_this_node()) {
+		/* heartbeat doesn't work unless a local node number is
+		 * configured and doing so brings up the o2net_wq, so we can
+		 * use it.. */
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
+				   O2NET_IDLE_TIMEOUT_SECS * HZ);
+
+		/* believe it or not, the accept path and node heartbeat
+		 * checks can succeed for this node before we get here.. so
+		 * only use set_nn_state to clear the persistent error
+		 * if that hasn't already happened */
+		spin_lock(&nn->nn_lock);
+		if (nn->nn_persistent_error)
+			o2net_set_nn_state(nn, NULL, 0, 0);
+		spin_unlock(&nn->nn_lock);
+	}
+}
+
+void o2net_unregister_hb_callbacks(void)
+{
+	int ret;
+
+	ret = o2hb_unregister_callback(&o2net_hb_up);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+		     "callback!\n", ret);
+
+	ret = o2hb_unregister_callback(&o2net_hb_down);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+		     "callback!\n", ret);
+}
+
+int o2net_register_hb_callbacks(void)
+{
+	int ret;
+
+	o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
+			    o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
+	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
+			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
+
+	ret = o2hb_register_callback(&o2net_hb_up);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2net_hb_down);
+
+	if (ret)
+		o2net_unregister_hb_callbacks();
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_accept_one(struct socket *sock)
+{
+	int ret, slen;
+	struct sockaddr_in sin;
+	struct socket *new_sock = NULL;
+	struct o2nm_node *node = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn;
+
+	BUG_ON(sock == NULL);
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	new_sock->sk->sk_allocation = GFP_ATOMIC;
+
+	ret = o2net_set_nodelay(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	slen = sizeof(sin);
+	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
+				       &slen, 1);
+	if (ret < 0)
+		goto out;
+
+	node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+	if (node == NULL) {
+		mlog(ML_NOTICE, "attempt to connect from unknown node at "
+		     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (o2nm_this_node() > node->nd_num) {
+		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
+		     "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port), node->nd_num);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* this happens all the time when the other node sees our heartbeat
+	 * and tries to connect before we see their heartbeat */
+	if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
+		mlog(ML_CONN, "attempt to connect from node '%s' at "
+		     "%u.%u.%u.%u:%d but it isn't heartbeating\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nn = o2net_nn_from_num(node->nd_num);
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc)
+		ret = -EBUSY;
+	else
+		ret = 0;
+	spin_unlock(&nn->nn_lock);
+	if (ret) {
+		mlog(ML_NOTICE, "attempt to connect from node '%s' at "
+		     "%u.%u.%u.%u:%d but it already has an open connection\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		goto out;
+	}
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sc->sc_sock = new_sock;
+	new_sock = NULL;
+
+	spin_lock(&nn->nn_lock);
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+	o2net_sc_queue_work(sc, &sc->sc_rx_work);
+
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+
+out:
+	if (new_sock)
+		sock_release(new_sock);
+	if (node)
+		o2nm_node_put(node);
+	if (sc)
+		sc_put(sc);
+	return ret;
+}
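A userspace sketch of the accept step for comparison; the peer address it returns is what the checks above (known node, node-number ordering, heartbeat) would then validate. accept4() here is a Linux extension; everything is illustrative:

#define _GNU_SOURCE		/* accept4() */
#include <sys/socket.h>
#include <netinet/in.h>

static int accept_one(int listen_fd, struct sockaddr_in *peer)
{
	socklen_t slen = sizeof(*peer);

	/* returns the new fd, or -1 with errno == EAGAIN when no
	 * connection is pending */
	return accept4(listen_fd, (struct sockaddr *)peer, &slen,
		       SOCK_NONBLOCK);
}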
+
+static void o2net_accept_many(void *arg)
+{
+	struct socket *sock = arg;
+	while (o2net_accept_one(sock) == 0)
+		cond_resched();
+}
+
+static void o2net_listen_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	read_lock(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (ready == NULL) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	/* ->sk_data_ready is also called for a newly established child socket
+	 * before it has been accepted and the acceptor has set up their
+	 * data_ready.. we only want to queue listen work for our listening
+	 * socket */
+	if (sk->sk_state == TCP_LISTEN) {
+		mlog(ML_TCP, "bytes: %d\n", bytes);
+		queue_work(o2net_wq, &o2net_listen_work);
+	}
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+	ready(sk, bytes);
+}
+
+static int o2net_open_listening_sock(__be16 port)
+{
+	struct socket *sock = NULL;
+	int ret;
+	struct sockaddr_in sin = {
+		.sin_family = AF_INET,
+		.sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
+		.sin_port = (__force u16)port,
+	};
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+		goto out;
+	}
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = o2net_listen_data_ready;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	o2net_listen_sock = sock;
+	INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
+
+	sock->sk->sk_reuse = 1;
+	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
+		     ntohs(port), ret);
+		goto out;
+	}
+
+	ret = sock->ops->listen(sock, 64);
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
+		     ntohs(port), ret);
+	}
+
+out:
+	if (ret) {
+		o2net_listen_sock = NULL;
+		if (sock)
+			sock_release(sock);
+	}
+	return ret;
+}
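A userspace sketch of the same listener setup, mirroring the reuse-bind-listen sequence and the backlog of 64 used above (names illustrative):

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int open_listener(uint16_t port)
{
	struct sockaddr_in sin;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(port);

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
	    listen(fd, 64) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}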
+
+/*
+ * called from node manager when we should bring up our network listening
+ * socket.  node manager handles all the serialization to only call this
+ * once and to match it with o2net_stop_listening().  note,
+ * o2nm_this_node() doesn't work yet as we're being called while it
+ * is being set up.
+ */
+int o2net_start_listening(struct o2nm_node *node)
+{
+	int ret = 0;
+
+	BUG_ON(o2net_wq != NULL);
+	BUG_ON(o2net_listen_sock != NULL);
+
+	mlog(ML_KTHREAD, "starting o2net thread...\n");
+	o2net_wq = create_singlethread_workqueue("o2net");
+	if (o2net_wq == NULL) {
+		mlog(ML_ERROR, "unable to launch o2net thread\n");
+		return -ENOMEM; /* ? */
+	}
+
+	ret = o2net_open_listening_sock(node->nd_ipv4_port);
+	if (ret) {
+		destroy_workqueue(o2net_wq);
+		o2net_wq = NULL;
+	} else
+		o2quo_conn_up(node->nd_num);
+
+	return ret;
+}
+
+/* again, o2nm_this_node() doesn't work here as we're involved in
+ * tearing it down */
+void o2net_stop_listening(struct o2nm_node *node)
+{
+	struct socket *sock = o2net_listen_sock;
+	size_t i;
+
+	BUG_ON(o2net_wq == NULL);
+	BUG_ON(o2net_listen_sock == NULL);
+
+	/* stop the listening socket from generating work */
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready = sock->sk->sk_user_data;
+	sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		/* don't shadow the node argument used below */
+		struct o2nm_node *peer = o2nm_get_node_by_num(i);
+		if (peer) {
+			o2net_disconnect_node(peer);
+			o2nm_node_put(peer);
+		}
+	}
+
+	/* finish all work and tear down the work queue */
+	mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
+	destroy_workqueue(o2net_wq);
+	o2net_wq = NULL;
+
+	sock_release(o2net_listen_sock);
+	o2net_listen_sock = NULL;
+
+	o2quo_conn_err(node->nd_num);
+}
+
+/* ------------------------------------------------------------ */
+
+int o2net_init(void)
+{
+	unsigned long i;
+
+	o2quo_init();
+
+	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
+	o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
+		kfree(o2net_hand);
+		kfree(o2net_keep_req);
+		kfree(o2net_keep_resp);
+		return -ENOMEM;
+	}
+
+	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
+	o2net_hand->connector_id = cpu_to_be64(1);
+
+	o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
+	o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		struct o2net_node *nn = o2net_nn_from_num(i);
+
+		spin_lock_init(&nn->nn_lock);
+		INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
+		INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
+		INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
+		/* until we see hb from a node we'll return -ENOTCONN */
+		nn->nn_persistent_error = -ENOTCONN;
+		init_waitqueue_head(&nn->nn_sc_wq);
+		idr_init(&nn->nn_status_idr);
+		INIT_LIST_HEAD(&nn->nn_status_list);
+	}
+
+	return 0;
+}
+
+void o2net_exit(void)
+{
+	o2quo_exit();
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+}

+ 113 - 0
fs/ocfs2/cluster/tcp.h

@@ -0,0 +1,113 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_TCP_H
+#define O2CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#include <linux/tcp.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+struct o2net_msg
+{
+	__be16 magic;
+	__be16 data_len;
+	__be16 msg_type;
+	__be16 pad1;
+	__be32 sys_status;
+	__be32 status;
+	__be32 key;
+	__be32 msg_num;
+	__u8  buf[0];
+};
+
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
+
+#define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
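A hedged compile-time check of the layout this define depends on: the header packs to 24 bytes with no padding, so a 4096-byte receive page leaves 4072 payload bytes. The struct below is an illustrative mirror of o2net_msg, not part of the header file:

#include <stddef.h>

struct check_msg {		/* illustrative mirror of o2net_msg */
	unsigned short	magic, data_len, msg_type, pad1;
	unsigned int	sys_status, status, key, msg_num;
};

_Static_assert(sizeof(struct check_msg) == 24, "header is 24 bytes");
_Static_assert(offsetof(struct check_msg, data_len) == 2, "len follows magic");
_Static_assert(4096 - sizeof(struct check_msg) == 4072, "max payload bytes");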
+
+/* TODO: figure this out.... */
+static inline int o2net_link_down(int err, struct socket *sock)
+{
+	if (sock) {
+		if (sock->sk->sk_state != TCP_ESTABLISHED &&
+		    sock->sk->sk_state != TCP_CLOSE_WAIT)
+			return 1;
+	}
+
+	if (err >= 0)
+		return 0;
+	switch (err) {
+		/* it isn't clear which other errors indicate link death */
+		case -ERESTARTSYS:
+		case -EBADF:
+		/* When the server has died, an ICMP port unreachable
+		 * message prompts ECONNREFUSED. */
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+			return 1;
+	}
+	return 0;
+}
+
+enum {
+	O2NET_DRIVER_UNINITED,
+	O2NET_DRIVER_READY,
+};
+
+int o2net_init_tcp_sock(struct inode *inode);
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status);
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
+			   size_t veclen, u8 target_node, int *status);
+int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
+			    struct inode *group);
+
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   struct list_head *unreg_list);
+void o2net_unregister_handler_list(struct list_head *list);
+
+struct o2nm_node;
+int o2net_register_hb_callbacks(void);
+void o2net_unregister_hb_callbacks(void);
+int o2net_start_listening(struct o2nm_node *node);
+void o2net_stop_listening(struct o2nm_node *node);
+void o2net_disconnect_node(struct o2nm_node *node);
+
+int o2net_init(void);
+void o2net_exit(void);
+int o2net_proc_init(struct proc_dir_entry *parent);
+void o2net_proc_exit(struct proc_dir_entry *parent);
+
+#endif /* O2CLUSTER_TCP_H */

+ 174 - 0
fs/ocfs2/cluster/tcp_internal.h

@@ -0,0 +1,174 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_TCP_INTERNAL_H
+#define O2CLUSTER_TCP_INTERNAL_H
+
+#define O2NET_MSG_MAGIC           ((u16)0xfa55)
+#define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
+#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
+
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
+
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
+#define O2NET_KEEPALIVE_DELAY_SECS	5
+#define O2NET_IDLE_TIMEOUT_SECS		10
+
+/*
+ * This version number represents quite a lot, unfortunately.  It not
+ * only represents the raw network message protocol on the wire but also
+ * locking semantics of the file system using the protocol.  It should
+ * be somewhere else, I'm sure, but right now it isn't.
+ *
+ * New in version 2:
+ * 	- full 64 bit i_size in the metadata lock lvbs
+ * 	- introduction of "rw" lock and pushing meta/data locking down
+ */
+#define O2NET_PROTOCOL_VERSION 2ULL
+struct o2net_handshake {
+	__be64	protocol_version;
+	__be64	connector_id;
+};
+
+struct o2net_node {
+	/* this is never called from int/bh */
+	spinlock_t			nn_lock;
+
+	/* set the moment an sc is allocated and a connect is started */
+	struct o2net_sock_container	*nn_sc;
+	/* _valid is only set after the handshake passes and tx can happen */
+	unsigned			nn_sc_valid:1;
+	/* if this is set tx just returns it */
+	int				nn_persistent_error;
+
+	/* threads waiting for an sc to arrive wait on the wq for generation
+	 * to increase.  it is increased when a connecting socket succeeds
+	 * or fails or when an accepted socket is attached. */
+	wait_queue_head_t		nn_sc_wq;
+
+	struct idr			nn_status_idr;
+	struct list_head		nn_status_list;
+
+	/* connects are attempted from when heartbeat comes up until either hb
+	 * goes down, the node is unconfigured, no connect attempts succeed
+	 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds.  connect_work
+	 * is queued from set_nn_state both from hb up and from itself if a
+	 * connect attempt fails and so can be self-arming.  shutdown is
+	 * careful to first mark the nn such that no connects will be attempted
+	 * before canceling delayed connect work and flushing the queue. */
+	struct work_struct		nn_connect_work;
+	unsigned long			nn_last_connect_attempt;
+
+	/* this is queued as nodes come up and is canceled when a connection is
+	 * established.  this expiring gives up on the node and errors out
+	 * transmits */
+	struct work_struct		nn_connect_expired;
+
+	/* after we give up on a socket we wait a while before deciding
+	 * that it is still heartbeating and that we should do some
+	 * quorum work */
+	struct work_struct		nn_still_up;
+};
+
+struct o2net_sock_container {
+	struct kref		sc_kref;
+	/* the next two are valid for the lifetime of the sc */
+	struct socket		*sc_sock;
+	struct o2nm_node	*sc_node;
+
+	/* all of these sc work structs hold refs on the sc while they are
+	 * queued.  they should not be able to ref a freed sc.  the teardown
+	 * race is with o2net_wq destruction in o2net_stop_listening() */
+
+	/* rx and connect work are generated from socket callbacks.  sc
+	 * shutdown removes the callbacks and then flushes the work queue */
+	struct work_struct	sc_rx_work;
+	struct work_struct	sc_connect_work;
+	/* shutdown work is triggered in two ways.  the simple way is
+	 * for a code path to call ensure_shutdown, which takes a lock,
+	 * removes the sc from the nn, and queues the work.  in this case the
+	 * work is single-shot.  the work is also queued from a sock
+	 * callback, though, and in this case the work will find the sc
+	 * still on the nn and will call ensure_shutdown itself.. this
+	 * ends up triggering the shutdown work again, though nothing
+	 * will be done in that second iteration.  so work queue teardown
+	 * has to be careful to remove the sc from the nn before waiting
+	 * on the work queue so that the shutdown work doesn't remove the
+	 * sc and rearm itself.
+	 */
+	struct work_struct	sc_shutdown_work;
+
+	struct timer_list	sc_idle_timeout;
+	struct work_struct	sc_keepalive_work;
+
+	unsigned		sc_handshake_ok:1;
+
+	struct page 		*sc_page;
+	size_t			sc_page_off;
+
+	/* original handlers for the sockets */
+	void			(*sc_state_change)(struct sock *sk);
+	void			(*sc_data_ready)(struct sock *sk, int bytes);
+
+	struct timeval 		sc_tv_timer;
+	struct timeval 		sc_tv_data_ready;
+	struct timeval 		sc_tv_advance_start;
+	struct timeval 		sc_tv_advance_stop;
+	struct timeval 		sc_tv_func_start;
+	struct timeval 		sc_tv_func_stop;
+	u32			sc_msg_key;
+	u16			sc_msg_type;
+};
+
+struct o2net_msg_handler {
+	struct rb_node		nh_node;
+	u32			nh_max_len;
+	u32			nh_msg_type;
+	u32			nh_key;
+	o2net_msg_handler_func	*nh_func;
+	void			*nh_func_data;
+	struct kref		nh_kref;
+	struct list_head	nh_unregister_item;
+};
+
+enum o2net_system_error {
+	O2NET_ERR_NONE = 0,
+	O2NET_ERR_NO_HNDLR,
+	O2NET_ERR_OVERFLOW,
+	O2NET_ERR_DIED,
+	O2NET_ERR_MAX
+};
+
+struct o2net_status_wait {
+	enum o2net_system_error	ns_sys_status;
+	s32			ns_status;
+	int			ns_id;
+	wait_queue_head_t	ns_wq;
+	struct list_head	ns_node_item;
+};
+
+#endif /* O2CLUSTER_TCP_INTERNAL_H */

+ 42 - 0
fs/ocfs2/cluster/ver.c

@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "ver.h"
+
+#define CLUSTER_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
+
+void cluster_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(CLUSTER_BUILD_VERSION);

+ 31 - 0
fs/ocfs2/cluster/ver.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_VER_H
+#define O2CLUSTER_VER_H
+
+void cluster_print_version(void);
+
+#endif /* O2CLUSTER_VER_H */

+ 91 - 0
fs/ocfs2/dcache.c

@@ -0,0 +1,91 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.c
+ *
+ * dentry cache handling code
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+
+#define MLOG_MASK_PREFIX ML_DCACHE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "file.h"
+#include "inode.h"
+
+static int ocfs2_dentry_revalidate(struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	int ret = 0;    /* if all else fails, just return false */
+	struct ocfs2_super *osb;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	/* Never trust a negative dentry - force a new lookup. */
+	if (inode == NULL) {
+		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
+		     dentry->d_name.name);
+		goto bail;
+	}
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!osb);
+
+	if (inode != osb->root_inode) {
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		/* did we or someone else delete this inode? */
+		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			mlog(0, "inode (%"MLFu64") deleted, returning false\n",
+			     OCFS2_I(inode)->ip_blkno);
+			goto bail;
+		}
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		if (!inode->i_nlink) {
+			mlog(0, "Inode %"MLFu64" orphaned, returning false "
+			     "dir = %d\n", OCFS2_I(inode)->ip_blkno,
+			     S_ISDIR(inode->i_mode));
+			goto bail;
+		}
+	}
+
+	ret = 1;
+
+bail:
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct dentry_operations ocfs2_dentry_ops = {
+	.d_revalidate		= ocfs2_dentry_revalidate,
+};

+ 31 - 0
fs/ocfs2/dcache.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DCACHE_H
+#define OCFS2_DCACHE_H
+
+extern struct dentry_operations ocfs2_dentry_ops;
+
+#endif /* OCFS2_DCACHE_H */

+ 618 - 0
fs/ocfs2/dir.c

@@ -0,0 +1,618 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c
+ *
+ * Creates, reads, walks and deletes directory-nodes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static unsigned char ocfs2_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh);
+/*
+ * ocfs2_readdir() - iterate the directory for the getdents family of
+ * syscalls, calling filldir on each live entry.
+ */
+int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned long offset, blk;
+	int i, num, stored;
+	struct buffer_head * bh, * tmp;
+	struct ocfs2_dir_entry * de;
+	int err;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block * sb = inode->i_sb;
+	int have_disk_lock = 0;
+
+	mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	stored = 0;
+	bh = NULL;
+
+	error = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		stored = error;
+		goto bail;
+	}
+	have_disk_lock = 1;
+
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+
+	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
+		blk = (filp->f_pos) >> sb->s_blocksize_bits;
+		bh = ocfs2_bread(inode, blk, &err, 0);
+		if (!bh) {
+			mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
+				       "at offset %lld\n",
+			     OCFS2_I(inode)->ip_blkno,
+			     filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+		/*
+		 * Do the readahead (8k)
+		 */
+		if (!offset) {
+			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+			     i > 0; i--) {
+				tmp = ocfs2_bread(inode, ++blk, &err, 1);
+				if (tmp)
+					brelse(tmp);
+			}
+		}
+
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+
+		while (!error && filp->f_pos < i_size_read(inode)
+		       && offset < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
+			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+				/* On error, skip the f_pos to the
+				   next block. */
+				filp->f_pos = (filp->f_pos |
+					       (sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				goto bail;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				unsigned long version = filp->f_version;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (de->file_type < OCFS2_FT_MAX)
+					d_type = ocfs2_filetype_table[de->file_type];
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						ino_from_blkno(sb, le64_to_cpu(de->inode)),
+						d_type);
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored++;
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse(bh);
+	}
+
+	stored = 0;
+bail:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(inode, 0);
+
+	mlog_exit(stored);
+
+	return stored;
+}
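Directory blocks here are ext3-style chains of variable-length records: each entry's rec_len gives the offset of the next, inode 0 marks an empty slot, and an undersized rec_len is the corruption signal ocfs2_check_dir_entry() guards against. A userspace sketch of that walk, with illustrative types; alignment and on-disk byte order are ignored for brevity:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct dirent_rec {		/* illustrative; on-disk is little-endian */
	uint64_t inode;		/* 0 marks an empty slot */
	uint16_t rec_len;	/* distance to the next record */
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];	/* name_len bytes, not NUL-terminated */
};

#define MIN_REC_LEN 16		/* smallest legal record (1-char name) */

static int walk_dir_block(const char *block, size_t blocksize)
{
	size_t off = 0;

	while (off < blocksize) {
		const struct dirent_rec *de =
			(const struct dirent_rec *)(block + off);

		if (de->rec_len < MIN_REC_LEN ||
		    off + de->rec_len > blocksize)
			return -1;	/* corrupt block */
		if (de->inode)
			printf("%.*s\n", de->name_len, de->name);
		off += de->rec_len;
	}
	return 0;
}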
+
+/*
+ * NOTE: this should always be called with parent dir i_sem taken.
+ */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent)
+{
+	int status = -ENOENT;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
+		   "inode=%p)\n",
+		   osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
+
+	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
+	if (!*dirent_bh || !*dirent) {
+		status = -ENOENT;
+		goto leave;
+	}
+
+	*blkno = le64_to_cpu((*dirent)->inode);
+
+	status = 0;
+leave:
+	if (status < 0) {
+		*dirent = NULL;
+		if (*dirent_bh) {
+			brelse(*dirent_bh);
+			*dirent_bh = NULL;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Check for a name within a directory.
+ *
+ * Return 0 if the name does not exist
+ * Return -EEXIST if the directory contains the name
+ *
+ * Callers should have i_sem + a cluster lock on dir
+ */
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen)
+{
+	int ret;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+
+	mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
+		   namelen, name);
+
+	ret = -EEXIST;
+	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
+	if (dirent_bh)
+		goto bail;
+
+	ret = 0;
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ocfs2_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ocfs2_dir_entry * de, * de1;
+	struct super_block * sb;
+	int err;
+
+	sb = inode->i_sb;
+	if ((i_size_read(inode) <
+	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
+	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no data block\n",
+		     OCFS2_I(inode)->ip_blkno);
+		return 1;
+	}
+
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	de1 = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
+			!le64_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no `.' or `..'\n",
+		     OCFS2_I(inode)->ip_blkno);
+		brelse(bh);
+		return 1;
+	}
+	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
+	while (offset < i_size_read(inode)) {
+		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
+			brelse(bh);
+			bh = ocfs2_bread(inode,
+					 offset >> sb->s_blocksize_bits, &err, 0);
+			if (!bh) {
+				mlog(ML_ERROR, "directory #%"MLFu64" contains "
+					       "a hole at offset %lu\n",
+				     OCFS2_I(inode)->ip_blkno, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+			brelse(bh);
+			return 1;
+		}
+		if (le64_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	}
+	brelse(bh);
+	return 1;
+}
+
+/* returns a bh of the 1st new block in the allocation. */
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh)
+{
+	int status;
+	int extend;
+	u64 p_blkno;
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+	if (extend) {
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
+		BUG_ON(status == -EAGAIN);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
+						   (sb->s_blocksize_bits - 9)),
+					     1, &p_blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_bh = sb_getblk(sb, p_blkno);
+	if (!*new_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* assumes you already have a cluster lock on the directory. */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	int credits, num_free_extents;
+	loff_t dir_i_size;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+
+	mlog_entry_void();
+
+	dir_i_size = i_size_read(dir);
+	mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
+	     OCFS2_I(dir)->ip_blkno, dir_i_size);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* dir->i_size is always block aligned. */
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!num_free_extents) {
+			status = ocfs2_reserve_new_metadata(osb, handle,
+							    fe, &meta_ac);
+			if (status < 0) {
+				if (status != -ENOSPC)
+					mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+	} else {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				     data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
+	status = ocfs2_journal_access(handle, dir, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;
+	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	dir_i_size += dir->i_sb->s_blocksize;
+	i_size_write(dir, dir_i_size);
+	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Search the dir for a good spot, extending it if necessary. The
+ * block containing an appropriate record is returned in ret_de_bh.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head * bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb;
+	int status;
+
+	mlog_entry_void();
+
+	mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
+	     namelen, OCFS2_I(dir)->ip_blkno);
+
+	BUG_ON(!S_ISDIR(dir->i_mode));
+	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+
+	sb = dir->i_sb;
+
+	if (!namelen) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bh = ocfs2_bread(dir, 0, &status, 0);
+	if (!bh) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+			brelse(bh);
+			bh = NULL;
+
+			if (i_size_read(dir) <= offset) {
+				status = ocfs2_extend_dir(osb,
+							  dir,
+							  parent_fe_bh,
+							  &bh);
+				if (status < 0) {
+					mlog_errno(status);
+					goto bail;
+				}
+				BUG_ON(!bh);
+				*ret_de_bh = bh;
+				get_bh(*ret_de_bh);
+				goto bail;
+			}
+			bh = ocfs2_bread(dir,
+					 offset >> sb->s_blocksize_bits,
+					 &status,
+					 0);
+			if (!bh) {
+				mlog_errno(status);
+				goto bail;
+			}
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);
+			status = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}

+ 54 - 0
fs/ocfs2/dir.h

@@ -0,0 +1,54 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DIR_H
+#define OCFS2_DIR_H
+
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen);
+int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent);
+int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh);
+struct ocfs2_alloc_context;
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh);
+#endif /* OCFS2_DIR_H */

+ 8 - 0
fs/ocfs2/dlm/Makefile

@@ -0,0 +1,8 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+
+ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
+	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o

+ 214 - 0
fs/ocfs2/dlm/dlmapi.h

@@ -0,0 +1,214 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmapi.h
+ *
+ * externally exported dlm interfaces
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMAPI_H
+#define DLMAPI_H
+
+struct dlm_lock;
+struct dlm_ctxt;
+
+/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
+enum dlm_status {
+	DLM_NORMAL = 0,           /*  0: request in progress */
+	DLM_GRANTED,              /*  1: request granted */
+	DLM_DENIED,               /*  2: request denied */
+	DLM_DENIED_NOLOCKS,       /*  3: request denied, out of system resources */
+	DLM_WORKING,              /*  4: async request in progress */
+	DLM_BLOCKED,              /*  5: lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /*  6: lock request blocked by an orphan lock */
+	DLM_DENIED_GRACE_PERIOD,  /*  7: topological change in progress */
+	DLM_SYSERR,               /*  8: system error */
+	DLM_NOSUPPORT,            /*  9: unsupported */
+	DLM_CANCELGRANT,          /* 10: can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* 11: bad lockid */
+	DLM_SYNC,                 /* 12: synchronous request granted */
+	DLM_BADTYPE,              /* 13: bad resource type */
+	DLM_BADRESOURCE,          /* 14: bad resource handle */
+	DLM_MAXHANDLES,           /* 15: no more resource handles */
+	DLM_NOCLINFO,             /* 16: can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* 17: can't contact lock manager */
+	DLM_NOPURGED,             /* 18: can't contact purge daemon */
+	DLM_BADARGS,              /* 19: bad api args */
+	DLM_VOID,                 /* 20: no status */
+	DLM_NOTQUEUED,            /* 21: NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* 22: invalid resource name length */
+	DLM_CVTUNGRANT,           /* 23: attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* 24: invalid lock mode specified */
+	DLM_VALNOTVALID,          /* 25: value block has been invalidated */
+	DLM_REJECTED,             /* 26: request rejected, unrecognized client */
+	DLM_ABORT,                /* 27: blocked lock request cancelled */
+	DLM_CANCEL,               /* 28: conversion request cancelled */
+	DLM_IVRESHANDLE,          /* 29: invalid resource handle */
+	DLM_DEADLOCK,             /* 30: deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* 31: failed to allocate AST */
+	DLM_FORWARD,              /* 32: request must wait for primary's response */
+	DLM_TIMEOUT,              /* 33: timeout value for lock has expired */
+	DLM_IVGROUPID,            /* 34: invalid group specification */
+	DLM_VERS_CONFLICT,        /* 35: version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* 36: Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */
+	DLM_NO_CONTROL_DEVICE,    /* 38: Cannot set options on opened device */
+
+	DLM_RECOVERING,           /* 39: extension, allows caller to fail a lock
+				     request if it is being recovered */
+	DLM_MIGRATING,            /* 40: extension, allows caller to fail a lock
+				     request if it is being migrated */
+	DLM_MAXSTATS,             /* 41: upper limit for return code validation */
+};
+
+/* for pretty-printing dlm_status error messages */
+const char *dlm_errmsg(enum dlm_status err);
+/* for pretty-printing dlm_status error names */
+const char *dlm_errname(enum dlm_status err);
+
+/* Eventually the DLM will use standard errno values, but in the
+ * meantime this lets us track dlm errors as they bubble up. When we
+ * bring its error reporting into line with the rest of the stack,
+ * these can just be replaced with calls to mlog_errno. */
+#define dlm_error(st) do {						\
+	if ((st) != DLM_RECOVERING &&					\
+	    (st) != DLM_MIGRATING &&					\
+	    (st) != DLM_FORWARD)					\
+		mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st)));	\
+} while (0)
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+#define DLM_LVB_LEN  64
+
+/* Callers are only allowed access to the lvb and status members of
+ * this struct. */
+struct dlm_lockstatus {
+	enum dlm_status status;
+	u32 flags;
+	struct dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];
+};
+
+/* Valid lock modes. */
+#define LKM_IVMODE      (-1)            /* invalid mode */
+#define LKM_NLMODE      0               /* null lock */
+#define LKM_CRMODE      1               /* concurrent read    unsupported */
+#define LKM_CWMODE      2               /* concurrent write   unsupported */
+#define LKM_PRMODE      3               /* protected read */
+#define LKM_PWMODE      4               /* protected write    unsupported */
+#define LKM_EXMODE      5               /* exclusive */
+#define LKM_MAXMODE     5
+#define LKM_MODEMASK    0xff
+
+/* Flags passed to dlmlock and dlmunlock:
+ * reserved: flags used by the "real" dlm
+ * only a few are supported by this dlm
+ * (U) = unsupported by ocfs2 dlm */
+#define LKM_ORPHAN       0x00000010  /* this lock is orphanable (U) */
+#define LKM_PARENTABLE   0x00000020  /* this lock was orphaned (U) */
+#define LKM_BLOCK        0x00000040  /* blocking lock request (U) */
+#define LKM_LOCAL        0x00000080  /* local lock request */
+#define LKM_VALBLK       0x00000100  /* lock value block request */
+#define LKM_NOQUEUE      0x00000200  /* non blocking request */
+#define LKM_CONVERT      0x00000400  /* conversion request */
+#define LKM_NODLCKWT     0x00000800  /* this lock won't deadlock (U) */
+#define LKM_UNLOCK       0x00001000  /* deallocate this lock */
+#define LKM_CANCEL       0x00002000  /* cancel conversion request */
+#define LKM_DEQALL       0x00004000  /* remove all locks held by proc (U) */
+#define LKM_INVVALBLK    0x00008000  /* invalidate lock value block */
+#define LKM_SYNCSTS      0x00010000  /* return synchronous status if possible (U) */
+#define LKM_TIMEOUT      0x00020000  /* lock request contains timeout (U) */
+#define LKM_SNGLDLCK     0x00040000  /* request can self-deadlock (U) */
+#define LKM_FINDLOCAL    0x00080000  /* find local lock request (U) */
+#define LKM_PROC_OWNED   0x00100000  /* owned by process, not group (U) */
+#define LKM_XID          0x00200000  /* use transaction id for deadlock (U) */
+#define LKM_XID_CONFLICT 0x00400000  /* do not allow lock inheritance (U) */
+#define LKM_FORCE        0x00800000  /* force unlock flag */
+#define LKM_REVVALBLK    0x01000000  /* temporary solution: re-validate
+					lock value block (U) */
+/* unused */
+#define LKM_UNUSED1      0x00000001  /* unused */
+#define LKM_UNUSED2      0x00000002  /* unused */
+#define LKM_UNUSED3      0x00000004  /* unused */
+#define LKM_UNUSED4      0x00000008  /* unused */
+#define LKM_UNUSED5      0x02000000  /* unused */
+#define LKM_UNUSED6      0x04000000  /* unused */
+#define LKM_UNUSED7      0x08000000  /* unused */
+
+/* ocfs2 extensions: internal only
+ * should never be used by caller */
+#define LKM_MIGRATION    0x10000000  /* extension: lockres is to be migrated
+					to another node */
+#define LKM_PUT_LVB      0x20000000  /* extension: lvb is being passed
+					should be applied to lockres */
+#define LKM_GET_LVB      0x40000000  /* extension: lvb should be copied
+					from lockres when lock is granted */
+#define LKM_RECOVERY     0x80000000  /* extension: flag for recovery lock
+					used to avoid recovery rwsem */
+
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm,
+			int mode,
+			struct dlm_lockstatus *lksb,
+			int flags,
+			const char *name,
+			dlm_astlockfunc_t *ast,
+			void *data,
+			dlm_bastlockfunc_t *bast);
+
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
+			  struct dlm_lockstatus *lksb,
+			  int flags,
+			  dlm_astunlockfunc_t *unlockast,
+			  void *data);
+
+struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm);
+
+void dlm_print_one_lock(struct dlm_lock *lockid);
+
+typedef void (dlm_eviction_func)(int, void *);
+struct dlm_eviction_cb {
+	struct list_head        ec_item;
+	dlm_eviction_func       *ec_func;
+	void                    *ec_data;
+};
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data);
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb);
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
+
+#endif /* DLMAPI_H */
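
Taken together, the prototypes above describe a small asynchronous API: a caller joins a domain, issues dlmlock() with completion callbacks, and learns of the grant through the ast rather than the return value. A hedged sketch of that round trip; the names ex_lock, ex_ast, ex_bast, ex_unlock_ast, take_and_drop and "my-lock-name" are invented for illustration, and error handling is pared down:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/completion.h>

#include "dlmapi.h"

struct ex_lock {
	struct dlm_lockstatus lksb;
	struct completion granted;
	struct completion dropped;
};

static void ex_ast(void *astdata)
{
	struct ex_lock *el = astdata;
	/* el->lksb.status now holds the grant status (DLM_NORMAL on success) */
	complete(&el->granted);
}

static void ex_bast(void *astdata, int blocked_type)
{
	/* another node wants a mode incompatible with ours (blocked_type);
	 * a real caller would arrange to downconvert or unlock soon */
}

static void ex_unlock_ast(void *astdata, enum dlm_status status)
{
	complete(&((struct ex_lock *)astdata)->dropped);
}

static int take_and_drop(const char *domain, u32 key)
{
	struct dlm_ctxt *dlm;
	struct ex_lock el;
	enum dlm_status status;

	dlm = dlm_register_domain(domain, key);
	if (IS_ERR(dlm))
		return PTR_ERR(dlm);

	memset(&el, 0, sizeof(el));
	init_completion(&el.granted);
	init_completion(&el.dropped);

	status = dlmlock(dlm, LKM_EXMODE, &el.lksb, 0, "my-lock-name",
			 ex_ast, &el, ex_bast);
	if (status != DLM_NORMAL)
		goto out;
	wait_for_completion(&el.granted);

	status = dlmunlock(dlm, &el.lksb, 0, ex_unlock_ast, &el);
	if (status == DLM_NORMAL)
		wait_for_completion(&el.dropped);
out:
	dlm_unregister_domain(dlm);
	return status == DLM_NORMAL ? 0 : -EIO;
}

Note that the bast is only advisory: the dlm never revokes a lock on its own, it merely tells the holder that someone is blocked behind it.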

+ 466 - 0
fs/ocfs2/dlm/dlmast.c

@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmast.c
+ *
+ * AST and BAST functionality for local and remote nodes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+#include "cluster/endian.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock);
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+/* Should be called as an ast gets queued to see if the new
+ * lock level will obsolete a pending bast.
+ * For example, if dlm_thread queued a bast for an EX lock that
+ * was blocking another EX, but before sending the bast the
+ * lock owner downconverted to NL, the bast is now obsolete.
+ * Only the ast should be sent.
+ * This is needed because the lock and convert paths can queue
+ * asts out-of-band (not waiting for dlm_thread) in order to
+ * allow for LKM_NOQUEUE to get immediate responses. */
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	assert_spin_locked(&dlm->ast_lock);
+	assert_spin_locked(&lock->spinlock);
+
+	if (lock->ml.highest_blocked == LKM_IVMODE)
+		return 0;
+	BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
+
+	if (lock->bast_pending &&
+	    list_empty(&lock->bast_list))
+		/* old bast already sent, ok */
+		return 0;
+
+	if (lock->ml.type == LKM_EXMODE)
+		/* EX blocks anything left, any bast still valid */
+		return 0;
+	else if (lock->ml.type == LKM_NLMODE)
+		/* NL blocks nothing, no reason to send any bast, cancel it */
+		return 1;
+	else if (lock->ml.highest_blocked != LKM_EXMODE)
+		/* PR only blocks EX */
+		return 1;
+
+	return 0;
+}
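+
+/* Editor's note -- the decision above, condensed (a sketch, not part of the
+ * original source).  "type" is the lock's newly granted mode and "blocked"
+ * is the highest mode the pending bast was queued against:
+ *
+ *	type == EX                -> keep the bast (EX still blocks everything)
+ *	type == NL                -> cancel it (NL blocks nothing)
+ *	type == PR, blocked == EX -> keep it (PR still blocks EX)
+ *	type == PR, blocked <  EX -> cancel it (PR no longer blocks PR)
+ */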
+
+static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	assert_spin_locked(&dlm->ast_lock);
+	if (!list_empty(&lock->ast_list)) {
+		mlog(ML_ERROR, "ast list not empty!!  pending=%d, newlevel=%d\n",
+		     lock->ast_pending, lock->ml.type);
+		BUG();
+	}
+	BUG_ON(!list_empty(&lock->ast_list));
+	if (lock->ast_pending)
+		mlog(0, "lock has an ast getting flushed right now\n");
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+
+	/* check to see if this ast obsoletes the bast */
+	if (dlm_should_cancel_bast(dlm, lock)) {
+		struct dlm_lock_resource *res = lock->lockres;
+		mlog(0, "%s: cancelling bast for %.*s\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		lock->bast_pending = 0;
+		list_del_init(&lock->bast_list);
+		lock->ml.highest_blocked = LKM_IVMODE;
+		/* removing lock from list, remove a ref.  guaranteed
+		 * this won't be the last ref because of the get above,
+		 * so res->spinlock will not be taken here */
+		dlm_lock_put(lock);
+		/* free up the reserved bast that we are cancelling.
+		 * guaranteed that this will not be the last reserved
+		 * ast because *both* an ast and a bast were reserved 
+		 * to get to this point.  the res->spinlock will not be
+		 * taken here */
+		dlm_lockres_release_ast(dlm, res);
+	}
+	list_add_tail(&lock->ast_list, &dlm->pending_asts);
+	lock->ast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_ast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+	assert_spin_locked(&dlm->ast_lock);
+
+	BUG_ON(!list_empty(&lock->bast_list));
+	if (lock->bast_pending)
+		mlog(0, "lock has a bast getting flushed right now\n");
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+	list_add_tail(&lock->bast_list, &dlm->pending_basts);
+	lock->bast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_bast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock)
+{
+	struct dlm_lockstatus *lksb = lock->lksb;
+	BUG_ON(!lksb);
+
+	/* only updates if this node masters the lockres */
+	if (res->owner == dlm->node_num) {
+
+		spin_lock(&res->spinlock);
+		/* check the lksb flags for the direction */
+		if (lksb->flags & DLM_LKSB_GET_LVB) {
+			mlog(0, "getting lvb from lockres for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
+		} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
+			mlog(0, "setting lvb from lockres for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		}
+		spin_unlock(&res->spinlock);
+	}
+
+	/* reset any lvb flags on the lksb */
+	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+}
+
+void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	dlm_astlockfunc_t *fn;
+	struct dlm_lockstatus *lksb;
+
+	mlog_entry_void();
+
+	lksb = lock->lksb;
+	fn = lock->ast;
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	dlm_update_lvb(dlm, res, lock);
+	(*fn)(lock->astdata);
+}
+
+
+int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	int ret;
+	struct dlm_lockstatus *lksb;
+	int lksbflags;
+
+	mlog_entry_void();
+
+	lksb = lock->lksb;
+	BUG_ON(lock->ml.node == dlm->node_num);
+
+	lksbflags = lksb->flags;
+	dlm_update_lvb(dlm, res, lock);
+
+	/* lock request came from another node
+	 * go do the ast over there */
+	ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
+	return ret;
+}
+
+void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		       struct dlm_lock *lock, int blocked_type)
+{
+	dlm_bastlockfunc_t *fn = lock->bast;
+
+	mlog_entry_void();
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	(*fn)(lock->astdata, blocked_type);
+}
+
+
+
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	int ret;
+	unsigned int locklen;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
+	char *name;
+	struct list_head *iter, *head=NULL;
+	u64 cookie;
+	u32 flags;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = past->name;
+	locklen = past->namelen;
+	cookie = be64_to_cpu(past->cookie);
+	flags = be32_to_cpu(past->flags);
+
+	if (locklen > DLM_LOCKID_NAME_MAX) {
+		ret = DLM_IVBUFLEN;
+		mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+		goto leave;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		ret = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+		  (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	if (past->type != DLM_AST &&
+	    past->type != DLM_BAST) {
+		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
+		     "name=%.*s\n", past->type, cookie, locklen, name);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	res = dlm_lookup_lockres(dlm, name, locklen);
+	if (!res) {
+		mlog(ML_ERROR, "got %sast for unknown lockres! "
+			       "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
+		     past->type == DLM_AST ? "" : "b",
+		     cookie, locklen, name, locklen);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	/* cannot get a proxy ast message if this node owns it */
+	BUG_ON(res->owner == dlm->node_num);
+
+	mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "responding with DLM_RECOVERING!\n");
+		ret = DLM_RECOVERING;
+		goto unlock_out;
+	}
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "responding with DLM_MIGRATING!\n");
+		ret = DLM_MIGRATING;
+		goto unlock_out;
+	}
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each(iter, head) {
+		lock = list_entry (iter, struct dlm_lock, list);
+		if (be64_to_cpu(lock->ml.cookie) == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else
+		head = &res->granted;
+
+	list_for_each(iter, head) {
+		lock = list_entry (iter, struct dlm_lock, list);
+		if (be64_to_cpu(lock->ml.cookie) == cookie)
+			goto do_ast;
+	}
+
+	mlog(ML_ERROR, "got %sast for unknown lock!  cookie=%"MLFu64", "
+		       "name=%.*s, namelen=%u\n",
+	     past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
+
+	ret = DLM_NORMAL;
+unlock_out:
+	spin_unlock(&res->spinlock);
+	goto leave;
+
+do_ast:
+	ret = DLM_NORMAL;
+	if (past->type == DLM_AST) {
+		/* do not alter lock refcount.  switching lists. */
+		list_del_init(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		mlog(0, "ast: adding to granted list... type=%d, "
+			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+		if (lock->ml.convert_type != LKM_IVMODE) {
+			lock->ml.type = lock->ml.convert_type;
+			lock->ml.convert_type = LKM_IVMODE;
+		} else {
+			// should already be there....
+		}
+
+		lock->lksb->status = DLM_NORMAL;
+
+		/* if we requested the lvb, fetch it into our lksb now */
+		if (flags & LKM_GET_LVB) {
+			BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
+			memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	if (past->type == DLM_AST)
+		dlm_do_local_ast(dlm, res, lock);
+	else
+		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
+
+leave:
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+	return ret;
+}
+
+
+
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock, int msg_type,
+			   int blocked_type, int flags)
+{
+	int ret = 0;
+	struct dlm_proxy_ast past;
+	struct kvec vec[2];
+	size_t veclen = 1;
+	int status;
+
+	mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
+		   res->lockname.len, res->lockname.name, lock->ml.node,
+		   msg_type, blocked_type);
+
+	memset(&past, 0, sizeof(struct dlm_proxy_ast));
+	past.node_idx = dlm->node_num;
+	past.type = msg_type;
+	past.blocked_type = blocked_type;
+	past.namelen = res->lockname.len;
+	memcpy(past.name, res->lockname.name, past.namelen);
+	past.cookie = lock->ml.cookie;
+
+	vec[0].iov_len = sizeof(struct dlm_proxy_ast);
+	vec[0].iov_base = &past;
+	if (flags & DLM_LKSB_GET_LVB) {
+		mlog(0, "returning requested LVB data\n");
+		be32_add_cpu(&past.flags, LKM_GET_LVB);
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
+				     lock->ml.node, &status);
+	if (ret < 0)
+		mlog_errno(ret);
+	else {
+		if (status == DLM_RECOVERING) {
+			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
+			     "node is dead!\n", lock->ml.node);
+			BUG();
+		} else if (status == DLM_MIGRATING) {
+			mlog(ML_ERROR, "sent AST to node %u, it returned "
+			     "DLM_MIGRATING!\n", lock->ml.node);
+			BUG();
+		} else if (status != DLM_NORMAL) {
+			mlog(ML_ERROR, "AST to node %u returned %d!\n",
+			     lock->ml.node, status);
+			/* ignore it */
+		}
+		ret = 0;
+	}
+	return ret;
+}
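
The sender above packs the fixed dlm_proxy_ast header and the optional 64-byte LVB as two kvec segments, so the wire length alone implies whether an LVB rode along. The receiving handler trusts the flags field rather than checking len; a minimal sketch of the length validation one could layer on (the helper name is invented for this sketch):

#include <linux/types.h>

#include "dlmapi.h"	/* LKM_PUT_LVB, LKM_GET_LVB, DLM_LVB_LEN */
#include "dlmcommon.h"	/* struct dlm_proxy_ast */

static inline int dlm_proxy_ast_len_ok(u32 len, u32 flags)
{
	u32 expect = sizeof(struct dlm_proxy_ast);

	if (flags & (LKM_PUT_LVB | LKM_GET_LVB))
		expect += DLM_LVB_LEN;	/* the second kvec segment */
	return len >= expect;
}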

+ 884 - 0
fs/ocfs2/dlm/dlmcommon.h

@@ -0,0 +1,884 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMCOMMON_H
+#define DLMCOMMON_H
+
+#include <linux/kref.h>
+
+#define DLM_HB_NODE_DOWN_PRI     (0xf000000)
+#define DLM_HB_NODE_UP_PRI       (0x8000000)
+
+#define DLM_LOCKID_NAME_MAX    32
+
+#define DLM_DOMAIN_NAME_MAX_LEN    255
+#define DLM_LOCK_RES_OWNER_UNKNOWN     O2NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
+#define DLM_THREAD_MS                  200   // flush at least every 200 ms
+
+#define DLM_HASH_BITS     7
+#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
+
+enum dlm_ast_type {
+	DLM_AST = 0,
+	DLM_BAST,
+	DLM_ASTUNLOCK
+};
+
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+			 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+			 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN   9
+
+static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
+{
+	if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+	    memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0)
+		return 1;
+	return 0;
+}
+
+#define DLM_RECO_STATE_ACTIVE  0x0001
+
+struct dlm_recovery_ctxt
+{
+	struct list_head resources;
+	struct list_head received;
+	struct list_head node_data;
+	u8  new_master;
+	u8  dead_node;
+	u16 state;
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	wait_queue_head_t event;
+};
+
+enum dlm_ctxt_state {
+	DLM_CTXT_NEW = 0,
+	DLM_CTXT_JOINED,
+	DLM_CTXT_IN_SHUTDOWN,
+	DLM_CTXT_LEAVING,
+};
+
+struct dlm_ctxt
+{
+	struct list_head list;
+	struct list_head *resources;
+	struct list_head dirty_list;
+	struct list_head purge_list;
+	struct list_head pending_asts;
+	struct list_head pending_basts;
+	unsigned int purge_count;
+	spinlock_t spinlock;
+	spinlock_t ast_lock;
+	char *name;
+	u8 node_num;
+	u32 key;
+	u8  joining_node;
+	wait_queue_head_t dlm_join_events;
+	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct dlm_recovery_ctxt reco;
+	spinlock_t master_lock;
+	struct list_head master_list;
+	struct list_head mle_hb_events;
+
+	/* these give a really vague idea of the system load */
+	atomic_t local_resources;
+	atomic_t remote_resources;
+	atomic_t unknown_resources;
+
+	/* NOTE: Next three are protected by dlm_domain_lock */
+	struct kref dlm_refs;
+	enum dlm_ctxt_state dlm_state;
+	unsigned int num_joins;
+
+	struct o2hb_callback_func dlm_hb_up;
+	struct o2hb_callback_func dlm_hb_down;
+	struct task_struct *dlm_thread_task;
+	struct task_struct *dlm_reco_thread_task;
+	wait_queue_head_t dlm_thread_wq;
+	wait_queue_head_t dlm_reco_thread_wq;
+	wait_queue_head_t ast_wq;
+	wait_queue_head_t migration_wq;
+
+	struct work_struct dispatched_work;
+	struct list_head work_list;
+	spinlock_t work_lock;
+	struct list_head dlm_domain_handlers;
+	struct list_head	dlm_eviction_callbacks;
+};
+
+/* these keventd work queue items are for less-frequently
+ * called functions that cannot be directly called from the
+ * net message handlers for some reason, usually because
+ * they need to send net messages of their own. */
+void dlm_dispatch_work(void *data);
+
+struct dlm_lock_resource;
+struct dlm_work_item;
+
+typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
+
+struct dlm_request_all_locks_priv
+{
+	u8 reco_master;
+	u8 dead_node;
+};
+
+struct dlm_mig_lockres_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 real_master;
+};
+
+struct dlm_assert_master_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 request_from;
+	u32 flags;
+	unsigned ignore_higher:1;
+};
+
+
+struct dlm_work_item
+{
+	struct list_head list;
+	dlm_workfunc_t *func;
+	struct dlm_ctxt *dlm;
+	void *data;
+	union {
+		struct dlm_request_all_locks_priv ral;
+		struct dlm_mig_lockres_priv ml;
+		struct dlm_assert_master_priv am;
+	} u;
+};
+
+static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
+				      struct dlm_work_item *i,
+				      dlm_workfunc_t *f, void *data)
+{
+	memset(i, 0, sizeof(*i));
+	i->func = f;
+	INIT_LIST_HEAD(&i->list);
+	i->data = data;
+	i->dlm = dlm;  /* must have already done a dlm_grab on this! */
+}
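+
+/* Editor's note -- a sketch (not part of the original source) of how a
+ * message handler queues such an item; my_workfunc is hypothetical, the
+ * rest are the fields above.  keventd later runs dispatched_work, which
+ * calls dlm_dispatch_work to drain work_list and invoke item->func:
+ *
+ *	dlm_grab(dlm);			// hold a ref until the work runs
+ *	dlm_init_work_item(dlm, item, my_workfunc, NULL);
+ *	spin_lock(&dlm->work_lock);
+ *	list_add_tail(&item->list, &dlm->work_list);
+ *	spin_unlock(&dlm->work_lock);
+ *	schedule_work(&dlm->dispatched_work);
+ */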
+
+
+
+static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
+					  u8 node)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	dlm->joining_node = node;
+	wake_up(&dlm->dlm_join_events);
+}
+
+#define DLM_LOCK_RES_UNINITED             0x00000001
+#define DLM_LOCK_RES_RECOVERING           0x00000002
+#define DLM_LOCK_RES_READY                0x00000004
+#define DLM_LOCK_RES_DIRTY                0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS          0x00000010
+#define DLM_LOCK_RES_MIGRATING            0x00000020
+
+#define DLM_PURGE_INTERVAL_MS   (8 * 1000)
+
+struct dlm_lock_resource
+{
+	/* WARNING: Please see the comment in dlm_init_lockres before
+	 * adding fields here. */
+	struct list_head list;
+	struct kref      refs;
+
+	/* please keep these next 3 in this order
+	 * some funcs want to iterate over all lists */
+	struct list_head granted;
+	struct list_head converting;
+	struct list_head blocked;
+
+	struct list_head dirty;
+	struct list_head recovering; // dlm_recovery_ctxt.resources list
+
+	/* unused lock resources have their last_used stamped and are
+	 * put on a list for the dlm thread to run. */
+	struct list_head purge;
+	unsigned long    last_used;
+
+	unsigned migration_pending:1;
+	atomic_t asts_reserved;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	u8  owner;              //node which owns the lock resource, or unknown
+	u16 state;
+	struct qstr lockname;
+	char lvb[DLM_LVB_LEN];
+};
+
+struct dlm_migratable_lock
+{
+	__be64 cookie;
+
+	/* these 3 are just padding for the in-memory structure, but
+	 * list and flags are actually used when sent over the wire */
+	__be16 pad1;
+	u8 list;  // 0=granted, 1=converting, 2=blocked
+	u8 flags;
+
+	s8 type;
+	s8 convert_type;
+	s8 highest_blocked;
+	u8 node;
+};  // 16 bytes
+
+struct dlm_lock
+{
+	struct dlm_migratable_lock ml;
+
+	struct list_head list;
+	struct list_head ast_list;
+	struct list_head bast_list;
+	struct dlm_lock_resource *lockres;
+	spinlock_t spinlock;
+	struct kref lock_refs;
+
+	// ast and bast must be callable while holding a spinlock!
+	dlm_astlockfunc_t *ast;
+	dlm_bastlockfunc_t *bast;
+	void *astdata;
+	struct dlm_lockstatus *lksb;
+	unsigned ast_pending:1,
+		 bast_pending:1,
+		 convert_pending:1,
+		 lock_pending:1,
+		 cancel_pending:1,
+		 unlock_pending:1,
+		 lksb_kernel_allocated:1;
+};
+
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+
+enum dlm_lockres_list {
+	DLM_GRANTED_LIST = 0,
+	DLM_CONVERTING_LIST,
+	DLM_BLOCKED_LIST
+};
+
+static inline struct list_head *
+dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
+{
+	struct list_head *ret = NULL;
+	if (idx == DLM_GRANTED_LIST)
+		ret = &res->granted;
+	else if (idx == DLM_CONVERTING_LIST)
+		ret = &res->converting;
+	else if (idx == DLM_BLOCKED_LIST)
+		ret = &res->blocked;
+	else
+		BUG();
+	return ret;
+}
+
+
+
+
+struct dlm_node_iter
+{
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int curnode;
+};
+
+
+enum {
+	DLM_MASTER_REQUEST_MSG    = 500,
+	DLM_UNUSED_MSG1,         /* 501 */
+	DLM_ASSERT_MASTER_MSG,	 /* 502 */
+	DLM_CREATE_LOCK_MSG,	 /* 503 */
+	DLM_CONVERT_LOCK_MSG,	 /* 504 */
+	DLM_PROXY_AST_MSG,	 /* 505 */
+	DLM_UNLOCK_LOCK_MSG,	 /* 506 */
+	DLM_UNUSED_MSG2,	 /* 507 */
+	DLM_MIGRATE_REQUEST_MSG, /* 508 */
+	DLM_MIG_LOCKRES_MSG, 	 /* 509 */
+	DLM_QUERY_JOIN_MSG,	 /* 510 */
+	DLM_ASSERT_JOINED_MSG,	 /* 511 */
+	DLM_CANCEL_JOIN_MSG,	 /* 512 */
+	DLM_EXIT_DOMAIN_MSG,	 /* 513 */
+	DLM_MASTER_REQUERY_MSG,	 /* 514 */
+	DLM_LOCK_REQUEST_MSG,	 /* 515 */
+	DLM_RECO_DATA_DONE_MSG,	 /* 516 */
+	DLM_BEGIN_RECO_MSG,	 /* 517 */
+	DLM_FINALIZE_RECO_MSG	 /* 518 */
+};
+
+struct dlm_reco_node_data
+{
+	int state;
+	u8 node_num;
+	struct list_head list;
+};
+
+enum {
+	DLM_RECO_NODE_DATA_DEAD = -1,
+	DLM_RECO_NODE_DATA_INIT = 0,
+	DLM_RECO_NODE_DATA_REQUESTING,
+	DLM_RECO_NODE_DATA_REQUESTED,
+	DLM_RECO_NODE_DATA_RECEIVING,
+	DLM_RECO_NODE_DATA_DONE,
+	DLM_RECO_NODE_DATA_FINALIZE_SENT,
+};
+
+
+enum {
+	DLM_MASTER_RESP_NO = 0,
+	DLM_MASTER_RESP_YES,
+	DLM_MASTER_RESP_MAYBE,
+	DLM_MASTER_RESP_ERROR
+};
+
+
+struct dlm_master_request
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_ASSERT_MASTER_MLE_CLEANUP      0x00000001
+#define DLM_ASSERT_MASTER_REQUERY          0x00000002
+#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
+struct dlm_assert_master
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_migrate_request
+{
+	u8 master;
+	u8 new_master;
+	u8 namelen;
+	u8 pad1;
+	__be32 pad2;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_master_requery
+{
+	u8 pad1;
+	u8 pad2;
+	u8 node_idx;
+	u8 namelen;
+	__be32 pad3;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MRES_RECOVERY   0x01
+#define DLM_MRES_MIGRATION  0x02
+#define DLM_MRES_ALL_DONE   0x04
+
+/*
+ * We would like to get one whole lockres into a single network
+ * message whenever possible.  Generally speaking, there will be
+ * at most one dlm_lock on a lockres for each node in the cluster,
+ * plus (infrequently) any additional locks coming in from userdlm.
+ *
+ * struct _dlm_lockres_page
+ * {
+ * 	dlm_migratable_lockres mres;
+ * 	dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
+ * 	u8 pad[DLM_MIG_LOCKRES_RESERVED];
+ * };
+ *
+ * from ../cluster/tcp.h
+ *    NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(net_msg))
+ *    (roughly 4080 bytes)
+ * and sizeof(dlm_migratable_lockres) = 112 bytes
+ * and sizeof(dlm_migratable_lock) = 16 bytes
+ *
+ * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
+ * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
+ *
+ *  (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
+ *     sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
+ *        NET_MAX_PAYLOAD_BYTES
+ *  (240 * 16) + 112 + 128 = 4080
+ *
+ * So a lockres would need more than 240 locks before it would
+ * use more than one network packet to recover.  Not too bad.
+ */
+#define DLM_MAX_MIGRATABLE_LOCKS   240
+
+struct dlm_migratable_lockres
+{
+	u8 master;
+	u8 lockname_len;
+	u8 num_locks;    // locks sent in this structure
+	u8 flags;
+	__be32 total_locks; // locks to be sent for this migration cookie
+	__be64 mig_cookie;  // cookie for this lockres migration
+			 // or zero if not needed
+	// 16 bytes
+	u8 lockname[DLM_LOCKID_NAME_MAX];
+	// 48 bytes
+	u8 lvb[DLM_LVB_LEN];
+	// 112 bytes
+	struct dlm_migratable_lock ml[0];  // 16 bytes each, begins at byte 112
+};
+#define DLM_MIG_LOCKRES_MAX_LEN  \
+	(sizeof(struct dlm_migratable_lockres) + \
+	 (sizeof(struct dlm_migratable_lock) * \
+	  DLM_MAX_MIGRATABLE_LOCKS) )
+
+/* from above, 128 bytes
+ * for some undetermined future use */
+#define DLM_MIG_LOCKRES_RESERVED   (NET_MAX_PAYLOAD_BYTES - \
+				    DLM_MIG_LOCKRES_MAX_LEN)
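+
+/* Editor's note -- the sizing argument above can be pinned down at compile
+ * time; a sketch using the stock BUILD_BUG_ON macro from <linux/kernel.h>
+ * (not part of the original source):
+ *
+ *	BUILD_BUG_ON(sizeof(struct dlm_migratable_lock) != 16);
+ *	BUILD_BUG_ON(sizeof(struct dlm_migratable_lockres) != 112);
+ *	BUILD_BUG_ON(DLM_MIG_LOCKRES_MAX_LEN > NET_MAX_PAYLOAD_BYTES);
+ */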
+
+struct dlm_create_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_convert_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_CONVERT_LOCK_MAX_LEN  (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
+
+struct dlm_unlock_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	__be16 pad1;
+	u8 node_idx;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_UNLOCK_LOCK_MAX_LEN  (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
+
+struct dlm_proxy_ast
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_PROXY_AST_MAX_LEN  (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
+
+#define DLM_MOD_KEY (0x666c6172)
+enum dlm_query_join_response {
+	JOIN_DISALLOW = 0,
+	JOIN_OK,
+	JOIN_OK_NO_MAP,
+};
+
+struct dlm_lock_request
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+struct dlm_reco_data_done
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+
+	/* unused for now */
+	/* eventually we can use this to attempt
+	 * lvb recovery based on each node's info */
+	u8 reco_lvb[DLM_LVB_LEN];
+};
+
+struct dlm_begin_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+
+struct dlm_query_join_request
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_assert_joined
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_cancel_join
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_exit_domain
+{
+	u8 node_idx;
+	u8 pad1[3];
+};
+
+struct dlm_finalize_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+static inline enum dlm_status
+__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
+{
+	enum dlm_status status = DLM_NORMAL;
+
+	assert_spin_locked(&res->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		status = DLM_RECOVERING;
+	else if (res->state & DLM_LOCK_RES_MIGRATING)
+		status = DLM_MIGRATING;
+	else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
+		status = DLM_FORWARD;
+
+	return status;
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb);
+void dlm_lock_get(struct dlm_lock *lock);
+void dlm_lock_put(struct dlm_lock *lock);
+
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res);
+
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock);
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock);
+
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+
+int dlm_launch_thread(struct dlm_ctxt *dlm);
+void dlm_complete_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+
+void dlm_put(struct dlm_ctxt *dlm);
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
+
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res);
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res);
+void dlm_purge_lockres(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *lockres);
+void dlm_lockres_get(struct dlm_lock_resource *res);
+void dlm_lockres_put(struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm,
+			  struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+						const char *name,
+						unsigned int len);
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					      const char *name,
+					      unsigned int len);
+
+int dlm_is_host_down(int errno);
+void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res,
+			      u8 owner);
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+						 const char *lockid,
+						 int flags);
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+					  const char *name,
+					  unsigned int namelen);
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_do_local_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+int dlm_do_remote_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+void dlm_do_local_bast(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res,
+		       struct dlm_lock *lock,
+		       int blocked_type);
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
+			   struct dlm_lock_resource *res,
+			   struct dlm_lock *lock,
+			   int msg_type,
+			   int blocked_type, int flags);
+static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock,
+				      int blocked_type)
+{
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
+				      blocked_type, 0);
+}
+
+static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_lock *lock,
+				     int flags)
+{
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
+				      0, flags);
+}
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+
+u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+
+
+int dlm_nm_init(struct dlm_ctxt *dlm);
+int dlm_heartbeat_init(struct dlm_ctxt *dlm);
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+int dlm_migrate_lockres(struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			u8 target);
+int dlm_finish_migration(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 u8 old_master);
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res);
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
+
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher,
+			       u8 request_from,
+			       u32 flags);
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to,
+			 u8 flags);
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res);
+
+/* will exit holding res->spinlock, but may drop in function */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
+void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
+
+/* will exit holding res->spinlock, but may drop in function */
+static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
+{
+	__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
+				    	  DLM_LOCK_RES_RECOVERING|
+					  DLM_LOCK_RES_MIGRATING));
+}
+
+
+int dlm_init_mle_cache(void);
+void dlm_destroy_mle_cache(void);
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+void dlm_clean_master_list(struct dlm_ctxt *dlm,
+			   u8 dead_node);
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+
+static inline const char * dlm_lock_mode_name(int mode)
+{
+	switch (mode) {
+		case LKM_EXMODE:
+			return "EX";
+		case LKM_PRMODE:
+			return "PR";
+		case LKM_NLMODE:
+			return "NL";
+	}
+	return "UNKNOWN";
+}
+
+
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	/* NO_LOCK compatible with all */
+	if (request == LKM_NLMODE ||
+	    existing == LKM_NLMODE)
+		return 1;
+
+	/* EX incompatible with all non-NO_LOCK */
+	if (request == LKM_EXMODE)
+		return 0;
+
+	/* request must be PR, which is compatible with PR */
+	if (existing == LKM_PRMODE)
+		return 1;
+
+	return 0;
+}
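+
+/* Editor's note -- the function above encodes this compatibility matrix
+ * for the three supported modes (1 = compatible):
+ *
+ *	              existing
+ *	request      NL  PR  EX
+ *	   NL         1   1   1
+ *	   PR         1   1   0
+ *	   EX         1   0   0
+ */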
+
+static inline int dlm_lock_on_list(struct list_head *head,
+				   struct dlm_lock *lock)
+{
+	struct list_head *iter;
+	struct dlm_lock *tmplock;
+
+	list_for_each(iter, head) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (tmplock == lock)
+			return 1;
+	}
+	return 0;
+}
+
+
+static inline enum dlm_status dlm_err_to_dlm_status(int err)
+{
+	enum dlm_status ret;
+	if (err == -ENOMEM)
+		ret = DLM_SYSERR;
+	else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
+		ret = DLM_NOLOCKMGR;
+	else if (err == -EINVAL)
+		ret = DLM_BADPARAM;
+	else if (err == -ENAMETOOLONG)
+		ret = DLM_IVBUFLEN;
+	else
+		ret = DLM_BADARGS;
+	return ret;
+}
+
+
+static inline void dlm_node_iter_init(unsigned long *map,
+				      struct dlm_node_iter *iter)
+{
+	memcpy(iter->node_map, map, sizeof(iter->node_map));
+	iter->curnode = -1;
+}
+
+static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
+{
+	int bit;
+	bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
+	if (bit >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+	iter->curnode = bit;
+	return bit;
+}
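+
+/* Editor's note -- typical use of the iterator (a sketch mirroring how the
+ * domain code walks node maps): the bitmap is copied at init time, so the
+ * snapshot stays stable while messages are sent without dlm->spinlock held:
+ *
+ *	struct dlm_node_iter iter;
+ *	int node;
+ *
+ *	spin_lock(&dlm->spinlock);
+ *	dlm_node_iter_init(dlm->domain_map, &iter);
+ *	spin_unlock(&dlm->spinlock);
+ *	while ((node = dlm_node_iter_next(&iter)) >= 0) {
+ *		... send a message to node ...
+ *	}
+ */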
+
+
+
+#endif /* DLMCOMMON_H */

+ 530 - 0
fs/ocfs2/dlm/dlmconvert.c

@@ -0,0 +1,530 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.c
+ *
+ * underlying calls for lock conversion
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+/* NOTE: __dlmconvert_master is the only function in here that
+ * needs a spinlock held on entry (res->spinlock) and it is the
+ * only one that holds a lock on exit (res->spinlock).
+ * All other functions in here need no locks and drop all of
+ * the locks that they acquire. */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread);
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type);
+
+/*
+ * this is only called directly by dlmlock(), and only when the
+ * local node is the owner of the lockres
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: see __dlmconvert_master
+ */
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status;
+
+	spin_lock(&res->spinlock);
+	/* we are not in a network handler, this is fine */
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	status = __dlmconvert_master(dlm, res, lock, flags, type,
+				     &call_ast, &kick_thread);
+
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
+		dlm_error(status);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
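+
+/* Editor's note -- the reserve/release pairing above, condensed (a sketch):
+ * any path that might queue an ast first takes a reservation with
+ * __dlm_lockres_reserve_ast(), and every exit then runs exactly one of
+ * dlm_queue_ast() (dlm_thread releases the reservation after flushing the
+ * ast) or dlm_lockres_release_ast() (returns it unused), so asts_reserved
+ * always drains back to zero and migration can wait on it safely. */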
+
+/* performs lock conversion at the lockres master site
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         takes and drops lock->spinlock
+ *   held on exit:  res->spinlock
+ * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
+ *   call_ast: whether ast should be called for this lock
+ *   kick_thread: whether dlm_kick_thread should be called
+ */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread)
+{
+	enum dlm_status status = DLM_NORMAL;
+	struct list_head *iter;
+	struct dlm_lock *tmplock=NULL;
+
+	assert_spin_locked(&res->spinlock);
+
+	mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
+		   lock->ml.type, lock->ml.convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		mlog(ML_ERROR, "attempted to convert a lock with a lock "
+		     "conversion pending\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		mlog(ML_ERROR, "attempted to convert a lock not on grant "
+		     "queue\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	if (flags & LKM_VALBLK) {
+		switch (lock->ml.type) {
+			case LKM_EXMODE:
+				/* EX + LKM_VALBLK + convert == set lvb */
+				mlog(0, "will set lvb: converting %s->%s\n",
+				     dlm_lock_mode_name(lock->ml.type),
+				     dlm_lock_mode_name(type));
+				lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+				break;
+			case LKM_PRMODE:
+			case LKM_NLMODE:
+				/* refetch if new level is not NL */
+				if (type > LKM_NLMODE) {
+					mlog(0, "will fetch new value into "
+					     "lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					lock->lksb->flags |= DLM_LKSB_GET_LVB;
+				} else {
+					mlog(0, "will NOT fetch new value "
+					     "into lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					flags &= ~(LKM_VALBLK);
+				}
+				break;
+		}
+	}
+
+
+	/* in-place downconvert? */
+	if (type <= lock->ml.type)
+		goto grant;
+
+	/* upconvert from here on */
+	status = DLM_NORMAL;
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
+	     res->lockname.name, dlm_lock_mode_name(type));
+	/* immediately grant the new lock type */
+	lock->lksb->status = DLM_NORMAL;
+	if (lock->ml.node == dlm->node_num)
+		mlog(0, "doing in-place convert for nonlocal lock\n");
+	lock->ml.type = type;
+	status = DLM_NORMAL;
+	*call_ast = 1;
+	goto unlock_exit;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		mlog(0, "failed to convert NOQUEUE lock %.*s from "
+		     "%d to %d...\n", res->lockname.len, res->lockname.name,
+		     lock->ml.type, type);
+		status = DLM_NOTQUEUED;
+		goto unlock_exit;
+	}
+	mlog(0, "res %.*s, queueing...\n", res->lockname.len,
+	     res->lockname.name);
+
+	lock->ml.convert_type = type;
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+
+unlock_exit:
+	spin_unlock(&lock->spinlock);
+	if (status == DLM_DENIED) {
+		__dlm_print_one_lock_resource(res);
+	}
+	if (status == DLM_NORMAL)
+		*kick_thread = 1;
+	return status;
+}
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock)
+{
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+	lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+}
+
+/* messages the master site to do lock conversion
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
+ */
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	enum dlm_status status;
+
+	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
+	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "bailing out early since res is RECOVERING "
+		     "on secondary queue\n");
+		/* __dlm_print_one_lock_resource(res); */
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		__dlm_print_one_lock_resource(res);
+		mlog(ML_ERROR, "converting a remote lock that is already "
+		     "converting! (cookie=%"MLFu64", conv=%d)\n",
+		     lock->ml.cookie, lock->ml.convert_type);
+		status = DLM_DENIED;
+		goto bail;
+	}
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* move lock to local convert queue */
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+	lock->convert_pending = 1;
+	lock->ml.convert_type = type;
+
+	if (flags & LKM_VALBLK) {
+		if (lock->ml.type == LKM_EXMODE) {
+			flags |= LKM_PUT_LVB;
+			lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+		} else {
+			if (lock->ml.convert_type == LKM_NLMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	/* no locks held here.
+	 * need to wait for a reply as to whether it got queued or not. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->convert_pending = 0;
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_convert(res, lock);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+
+	/* TODO: should this be a wake_one? */
+	/* wake up any IN_PROGRESS waiters */
+	wake_up(&res->wq);
+
+	return status;
+}
+
+/* sends DLM_CONVERT_LOCK_MSG to master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, status from remote node
+ */
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type)
+{
+	struct dlm_convert_lock convert;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	memset(&convert, 0, sizeof(struct dlm_convert_lock));
+	convert.node_idx = dlm->node_num;
+	convert.requested_type = type;
+	convert.cookie = lock->ml.cookie;
+	convert.namelen = res->lockname.len;
+	convert.flags = cpu_to_be32(flags);
+	memcpy(convert.name, res->lockname.name, convert.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_convert_lock);
+	vec[0].iov_base = &convert;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
+					vec, veclen, res->owner, &status);
+	if (tmpret >= 0) {
+		/* successfully sent and received */
+		ret = status;  /* this is already a dlm_status */
+		if (ret == DLM_RECOVERING) {
+			mlog(0, "node %u returned DLM_RECOVERING from convert "
+			     "message!\n", res->owner);
+		} else if (ret == DLM_MIGRATING) {
+			mlog(0, "node %u returned DLM_MIGRATING from convert "
+			     "message!\n", res->owner);
+		} else if (ret == DLM_FORWARD) {
+			mlog(0, "node %u returned DLM_FORWARD from convert "
+			     "message!\n", res->owner);
+		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
+			dlm_error(ret);
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			ret = DLM_RECOVERING;
+			mlog(0, "node %u died so returning DLM_RECOVERING "
+			     "from convert message!\n", res->owner);
+		} else {
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+/* handler for DLM_CONVERT_LOCK_MSG on master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
+ *          status from __dlmconvert_master
+ */
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+	struct dlm_lockstatus *lksb;
+	enum dlm_status status = DLM_NORMAL;
+	u32 flags;
+	int call_ast = 0, kick_thread = 0;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
+		status = DLM_IVBUFLEN;
+		dlm_error(status);
+		goto leave;
+	}
+
+	flags = be32_to_cpu(cnv->flags);
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		status = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+	     (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	list_for_each(iter, &res->granted) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.cookie == cnv->cookie &&
+		    lock->ml.node == cnv->node_idx) {
+			dlm_lock_get(lock);
+			break;
+		}
+		lock = NULL;
+	}
+	spin_unlock(&res->spinlock);
+	if (!lock) {
+		status = DLM_IVLOCKID;
+		dlm_error(status);
+		goto leave;
+	}
+
+	/* found the lock */
+	lksb = lock->lksb;
+
+	/* see if caller needed to get/put lvb */
+	if (flags & LKM_PUT_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
+	} else if (flags & LKM_GET_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_GET_LVB;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status == DLM_NORMAL) {
+		__dlm_lockres_reserve_ast(res);
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+		status = __dlmconvert_master(dlm, res, lock, flags,
+					     cnv->requested_type,
+					     &call_ast, &kick_thread);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+	}
+
+leave:
+	if (!lock)
+		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+			       "cookie=%"MLFu64"\n",
+		     cnv->cookie);
+	else
+		dlm_lock_put(lock);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}

+ 35 - 0
fs/ocfs2/dlm/dlmconvert.h

@@ -0,0 +1,35 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMCONVERT_H
+#define DLMCONVERT_H
+
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+
+#endif
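
The two prototypes above split a single convert operation by resource ownership. As a hedged sketch (a hypothetical caller, not code from this commit, assuming only what dlmconvert.c already shows -- that res->owner carries the node number of the mastering node), the dispatch reduces to:

	/* Hypothetical sketch: dispatch a convert request on resource
	 * ownership.  dlmconvert_master() grants or queues locally;
	 * dlmconvert_remote() sends DLM_CONVERT_LOCK_MSG to the master
	 * and waits for its verdict. */
	static enum dlm_status sketch_convert(struct dlm_ctxt *dlm,
					      struct dlm_lock_resource *res,
					      struct dlm_lock *lock,
					      int flags, int type)
	{
		if (res->owner == dlm->node_num)
			return dlmconvert_master(dlm, res, lock, flags, type);
		return dlmconvert_remote(dlm, res, lock, flags, type);
	}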

+ 246 - 0
fs/ocfs2/dlm/dlmdebug.c

@@ -0,0 +1,246 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.c
+ *
+ * debug functionality for the dlm
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdebug.h"
+
+#include "dlmdomain.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+	       res->lockname.len, res->lockname.name,
+	       res->owner, res->state);
+	spin_lock(&res->spinlock);
+	__dlm_print_one_lock_resource(res);
+	spin_unlock(&res->spinlock);
+}
+
+static void dlm_print_lock_queue(const char *name, struct list_head *queue)
+{
+	struct list_head *iter;
+	struct dlm_lock *lock;
+
+	mlog(ML_NOTICE, "  %s queue:\n", name);
+	list_for_each(iter, queue) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		spin_lock(&lock->spinlock);
+		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
+		     "cookie=%"MLFu64", ast=(empty=%c,pend=%c), "
+		     "bast=(empty=%c,pend=%c)\n",
+		     lock->ml.type, lock->ml.convert_type, lock->ml.node,
+		     lock->ml.cookie,
+		     list_empty(&lock->ast_list) ? 'y' : 'n',
+		     lock->ast_pending ? 'y' : 'n',
+		     list_empty(&lock->bast_list) ? 'y' : 'n',
+		     lock->bast_pending ? 'y' : 'n');
+		spin_unlock(&lock->spinlock);
+	}
+}
+
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+
+	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+	     res->lockname.len, res->lockname.name,
+	     res->owner, res->state);
+	mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
+	     res->last_used, list_empty(&res->purge) ? "no" : "yes");
+	/* the same format is used for all three lock queues */
+	dlm_print_lock_queue("granted", &res->granted);
+	dlm_print_lock_queue("converting", &res->converting);
+	dlm_print_lock_queue("blocked", &res->blocked);
+}
+
+void dlm_print_one_lock(struct dlm_lock *lockid)
+{
+	dlm_print_one_lock_resource(lockid->lockres);
+}
+EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *iter;
+	struct list_head *bucket;
+	int i;
+
+	mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
+		  dlm->name, dlm->node_num, dlm->key);
+	if (!dlm || !dlm->name) {
+		mlog(ML_ERROR, "dlm=%p\n", dlm);
+		return;
+	}
+
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			dlm_print_one_lock_resource(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static const char *dlm_errnames[] = {
+	[DLM_NORMAL] =			"DLM_NORMAL",
+	[DLM_GRANTED] =			"DLM_GRANTED",
+	[DLM_DENIED] =			"DLM_DENIED",
+	[DLM_DENIED_NOLOCKS] =		"DLM_DENIED_NOLOCKS",
+	[DLM_WORKING] =			"DLM_WORKING",
+	[DLM_BLOCKED] =			"DLM_BLOCKED",
+	[DLM_BLOCKED_ORPHAN] =		"DLM_BLOCKED_ORPHAN",
+	[DLM_DENIED_GRACE_PERIOD] =	"DLM_DENIED_GRACE_PERIOD",
+	[DLM_SYSERR] =			"DLM_SYSERR",
+	[DLM_NOSUPPORT] =		"DLM_NOSUPPORT",
+	[DLM_CANCELGRANT] =		"DLM_CANCELGRANT",
+	[DLM_IVLOCKID] =		"DLM_IVLOCKID",
+	[DLM_SYNC] =			"DLM_SYNC",
+	[DLM_BADTYPE] =			"DLM_BADTYPE",
+	[DLM_BADRESOURCE] =		"DLM_BADRESOURCE",
+	[DLM_MAXHANDLES] =		"DLM_MAXHANDLES",
+	[DLM_NOCLINFO] =		"DLM_NOCLINFO",
+	[DLM_NOLOCKMGR] =		"DLM_NOLOCKMGR",
+	[DLM_NOPURGED] =		"DLM_NOPURGED",
+	[DLM_BADARGS] =			"DLM_BADARGS",
+	[DLM_VOID] =			"DLM_VOID",
+	[DLM_NOTQUEUED] =		"DLM_NOTQUEUED",
+	[DLM_IVBUFLEN] =		"DLM_IVBUFLEN",
+	[DLM_CVTUNGRANT] =		"DLM_CVTUNGRANT",
+	[DLM_BADPARAM] =		"DLM_BADPARAM",
+	[DLM_VALNOTVALID] =		"DLM_VALNOTVALID",
+	[DLM_REJECTED] =		"DLM_REJECTED",
+	[DLM_ABORT] =			"DLM_ABORT",
+	[DLM_CANCEL] =			"DLM_CANCEL",
+	[DLM_IVRESHANDLE] =		"DLM_IVRESHANDLE",
+	[DLM_DEADLOCK] =		"DLM_DEADLOCK",
+	[DLM_DENIED_NOASTS] =		"DLM_DENIED_NOASTS",
+	[DLM_FORWARD] =			"DLM_FORWARD",
+	[DLM_TIMEOUT] =			"DLM_TIMEOUT",
+	[DLM_IVGROUPID] =		"DLM_IVGROUPID",
+	[DLM_VERS_CONFLICT] =		"DLM_VERS_CONFLICT",
+	[DLM_BAD_DEVICE_PATH] =		"DLM_BAD_DEVICE_PATH",
+	[DLM_NO_DEVICE_PERMISSION] =	"DLM_NO_DEVICE_PERMISSION",
+	[DLM_NO_CONTROL_DEVICE] =	"DLM_NO_CONTROL_DEVICE",
+	[DLM_RECOVERING] =		"DLM_RECOVERING",
+	[DLM_MIGRATING] =		"DLM_MIGRATING",
+	[DLM_MAXSTATS] =		"DLM_MAXSTATS",
+};
+
+static const char *dlm_errmsgs[] = {
+	[DLM_NORMAL] = 			"request in progress",
+	[DLM_GRANTED] = 		"request granted",
+	[DLM_DENIED] = 			"request denied",
+	[DLM_DENIED_NOLOCKS] = 		"request denied, out of system resources",
+	[DLM_WORKING] = 		"async request in progress",
+	[DLM_BLOCKED] = 		"lock request blocked",
+	[DLM_BLOCKED_ORPHAN] = 		"lock request blocked by an orphan lock",
+	[DLM_DENIED_GRACE_PERIOD] = 	"topological change in progress",
+	[DLM_SYSERR] = 			"system error",
+	[DLM_NOSUPPORT] = 		"unsupported",
+	[DLM_CANCELGRANT] = 		"can't cancel convert: already granted",
+	[DLM_IVLOCKID] = 		"bad lockid",
+	[DLM_SYNC] = 			"synchronous request granted",
+	[DLM_BADTYPE] = 		"bad resource type",
+	[DLM_BADRESOURCE] = 		"bad resource handle",
+	[DLM_MAXHANDLES] = 		"no more resource handles",
+	[DLM_NOCLINFO] = 		"can't contact cluster manager",
+	[DLM_NOLOCKMGR] = 		"can't contact lock manager",
+	[DLM_NOPURGED] = 		"can't contact purge daemon",
+	[DLM_BADARGS] = 		"bad api args",
+	[DLM_VOID] = 			"no status",
+	[DLM_NOTQUEUED] = 		"NOQUEUE was specified and request failed",
+	[DLM_IVBUFLEN] = 		"invalid resource name length",
+	[DLM_CVTUNGRANT] = 		"attempted to convert ungranted lock",
+	[DLM_BADPARAM] = 		"invalid lock mode specified",
+	[DLM_VALNOTVALID] = 		"value block has been invalidated",
+	[DLM_REJECTED] = 		"request rejected, unrecognized client",
+	[DLM_ABORT] = 			"blocked lock request cancelled",
+	[DLM_CANCEL] = 			"conversion request cancelled",
+	[DLM_IVRESHANDLE] = 		"invalid resource handle",
+	[DLM_DEADLOCK] = 		"deadlock recovery refused this request",
+	[DLM_DENIED_NOASTS] = 		"failed to allocate AST",
+	[DLM_FORWARD] = 		"request must wait for primary's response",
+	[DLM_TIMEOUT] = 		"timeout value for lock has expired",
+	[DLM_IVGROUPID] = 		"invalid group specification",
+	[DLM_VERS_CONFLICT] = 		"version conflicts prevent request handling",
+	[DLM_BAD_DEVICE_PATH] = 	"Locks device does not exist or path wrong",
+	[DLM_NO_DEVICE_PERMISSION] = 	"Client has insufficient perms for device",
+	[DLM_NO_CONTROL_DEVICE] = 	"Cannot set options on opened device",
+	[DLM_RECOVERING] = 		"lock resource being recovered",
+	[DLM_MIGRATING] = 		"lock resource being migrated",
+	[DLM_MAXSTATS] = 		"invalid error number",
+};
+
+const char *dlm_errmsg(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errmsgs[DLM_MAXSTATS];
+	return dlm_errmsgs[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errmsg);
+
+const char *dlm_errname(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errnames[DLM_MAXSTATS];
+	return dlm_errnames[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errname);
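
Both tables are indexed by enum dlm_status, and the two accessors clamp anything out of range to the DLM_MAXSTATS slot, so a caller can print any status without range checks of its own. A minimal usage sketch (a hypothetical caller, not from this commit):

	/* Hypothetical sketch: turn a dlm_status into readable log output. */
	static void sketch_report(enum dlm_status st)
	{
		if (st != DLM_NORMAL)
			printk(KERN_ERR "dlm status %s: %s\n",
			       dlm_errname(st), dlm_errmsg(st));
	}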

+ 30 - 0
fs/ocfs2/dlm/dlmdebug.h

@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
+
+#endif

+ 1469 - 0
fs/ocfs2/dlm/dlmdomain.c

@@ -0,0 +1,1469 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.c
+ *
+ * defines domain join / leave apis
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmdebug.h"
+#include "dlmdomain.h"
+
+#include "dlmver.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
+#include "cluster/masklog.h"
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, obey this ordering:
+ *    dlm_domain_lock
+ *    struct dlm_ctxt->spinlock
+ *    struct dlm_lock_resource->spinlock
+ *    struct dlm_ctxt->master_lock
+ *    struct dlm_ctxt->ast_lock
+ *    dlm_master_list_entry->spinlock
+ *    dlm_lock->spinlock
+ *
+ */
+
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(dlm_domains);
+static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
+
+#define DLM_DOMAIN_BACKOFF_MS 200
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
+
+void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+{
+	list_del_init(&lockres->list);
+	dlm_lockres_put(lockres);
+}
+
+void __dlm_insert_lockres(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res)
+{
+	struct list_head *bucket;
+	struct qstr *q;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	q = &res->lockname;
+	q->hash = full_name_hash(q->name, q->len);
+	bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+
+	/* get a reference for our hashtable */
+	dlm_lockres_get(res);
+
+	list_add_tail(&res->list, bucket);
+}
+
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					 const char *name,
+					 unsigned int len)
+{
+	unsigned int hash;
+	struct list_head *iter;
+	struct dlm_lock_resource *tmpres=NULL;
+	struct list_head *bucket;
+
+	mlog_entry("%.*s\n", len, name);
+
+	assert_spin_locked(&dlm->spinlock);
+
+	hash = full_name_hash(name, len);
+
+	bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+
+	/* check for pre-existing lock */
+	list_for_each(iter, bucket) {
+		tmpres = list_entry(iter, struct dlm_lock_resource, list);
+		if (tmpres->lockname.len == len &&
+		    memcmp(tmpres->lockname.name, name, len) == 0) {
+			dlm_lockres_get(tmpres);
+			break;
+		}
+
+		tmpres = NULL;
+	}
+	return tmpres;
+}
+
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+				    const char *name,
+				    unsigned int len)
+{
+	struct dlm_lock_resource *res;
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, len);
+	spin_unlock(&dlm->spinlock);
+	return res;
+}
+
+static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
+{
+	struct dlm_ctxt *tmp = NULL;
+	struct list_head *iter;
+
+	assert_spin_locked(&dlm_domain_lock);
+
+	/* tmp->name here is always NULL terminated,
+	 * but domain may not be! */
+	list_for_each(iter, &dlm_domains) {
+		tmp = list_entry (iter, struct dlm_ctxt, list);
+		if (strlen(tmp->name) == len &&
+		    memcmp(tmp->name, domain, len)==0)
+			break;
+		tmp = NULL;
+	}
+
+	return tmp;
+}
+
+/* For null terminated domain strings ONLY */
+static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
+{
+	assert_spin_locked(&dlm_domain_lock);
+
+	return __dlm_lookup_domain_full(domain, strlen(domain));
+}
+
+
+/* returns true on one of two conditions:
+ * 1) the domain does not exist
+ * 2) the domain exists and its state is "joined" */
+static int dlm_wait_on_domain_helper(const char *domain)
+{
+	int ret = 0;
+	struct dlm_ctxt *tmp = NULL;
+
+	spin_lock(&dlm_domain_lock);
+
+	tmp = __dlm_lookup_domain(domain);
+	if (!tmp)
+		ret = 1;
+	else if (tmp->dlm_state == DLM_CTXT_JOINED)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+	return ret;
+}
+
+static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
+{
+	if (dlm->resources)
+		free_page((unsigned long) dlm->resources);
+
+	if (dlm->name)
+		kfree(dlm->name);
+
+	kfree(dlm);
+}
+
+/* A little strange - this function will be called while holding
+ * dlm_domain_lock and is expected to be holding it on the way out. We
+ * will, however, drop and reacquire it multiple times */
+static void dlm_ctxt_release(struct kref *kref)
+{
+	struct dlm_ctxt *dlm;
+
+	dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
+
+	BUG_ON(dlm->num_joins);
+	BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
+
+	/* we may still be in the list if we hit an error during join. */
+	list_del_init(&dlm->list);
+
+	spin_unlock(&dlm_domain_lock);
+
+	mlog(0, "freeing memory from domain %s\n", dlm->name);
+
+	wake_up(&dlm_domain_events);
+
+	dlm_free_ctxt_mem(dlm);
+
+	spin_lock(&dlm_domain_lock);
+}
+
+void dlm_put(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm_domain_lock);
+	kref_put(&dlm->dlm_refs, dlm_ctxt_release);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_get(struct dlm_ctxt *dlm)
+{
+	kref_get(&dlm->dlm_refs);
+}
+
+/* given a questionable reference to a dlm object, gets a reference if
+ * it can find it in the list, otherwise returns NULL in which case
+ * you shouldn't trust your pointer. */
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
+{
+	struct list_head *iter;
+	struct dlm_ctxt *target = NULL;
+
+	spin_lock(&dlm_domain_lock);
+
+	list_for_each(iter, &dlm_domains) {
+		target = list_entry (iter, struct dlm_ctxt, list);
+
+		if (target == dlm) {
+			__dlm_get(target);
+			break;
+		}
+
+		target = NULL;
+	}
+
+	spin_unlock(&dlm_domain_lock);
+
+	return target;
+}
+
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm_domain_lock);
+	ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
+		(dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain_handlers(dlm);
+	dlm_complete_thread(dlm);
+	dlm_complete_recovery_thread(dlm);
+
+	/* We've left the domain. Now we can take ourselves out of the
+	 * list and allow the kref stuff to help us free the
+	 * memory. */
+	spin_lock(&dlm_domain_lock);
+	list_del_init(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+
+	/* Wake up anyone waiting for us to remove this domain */
+	wake_up(&dlm_domain_events);
+}
+
+static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+{
+	int i;
+	struct dlm_lock_resource *res;
+
+	mlog(0, "Migrating locks from domain %s\n", dlm->name);
+restart:
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		while (!list_empty(&dlm->resources[i])) {
+			res = list_entry(dlm->resources[i].next,
+				     struct dlm_lock_resource, list);
+			/* need reference when manually grabbing lockres */
+			dlm_lockres_get(res);
+			/* this should unhash the lockres
+			 * and exit with dlm->spinlock */
+			mlog(0, "purging res=%p\n", res);
+			if (dlm_lockres_is_dirty(dlm, res)) {
+				/* HACK!  this should absolutely go.
+				 * need to figure out why some empty
+				 * lockreses are still marked dirty */
+				mlog(ML_ERROR, "lockres %.*s dirty!\n",
+				     res->lockname.len, res->lockname.name);
+
+				spin_unlock(&dlm->spinlock);
+				dlm_kick_thread(dlm, res);
+				wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+				dlm_lockres_put(res);
+				goto restart;
+			}
+			dlm_purge_lockres(dlm, res);
+			dlm_lockres_put(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+}
+
+static int dlm_no_joining_node(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm->spinlock);
+	ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
+	spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
+{
+	/* Yikes, a double spinlock! I need domain_lock for the dlm
+	 * state and the dlm spinlock for join state... Sorry! */
+again:
+	spin_lock(&dlm_domain_lock);
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "Node %d is joining, we wait on it.\n",
+			  dlm->joining_node);
+		spin_unlock(&dlm->spinlock);
+		spin_unlock(&dlm_domain_lock);
+
+		wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
+		goto again;
+	}
+
+	dlm->dlm_state = DLM_CTXT_LEAVING;
+	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_print_nodes(struct dlm_ctxt *dlm)
+{
+	int node = -1;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
+
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, " node %d\n", node);
+	}
+}
+
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned int node;
+	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+	mlog_entry("%p %u %p", msg, len, data);
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	node = exit_msg->node_idx;
+
+	mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
+
+	spin_lock(&dlm->spinlock);
+	clear_bit(node, dlm->domain_map);
+	__dlm_print_nodes(dlm);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, node, 0);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+
+	return 0;
+}
+
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_exit_domain leave_msg;
+
+	mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
+		  node, dlm->name, dlm->node_num);
+
+	memset(&leave_msg, 0, sizeof(leave_msg));
+	leave_msg.node_idx = dlm->node_num;
+
+	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
+				    &leave_msg, sizeof(leave_msg), node,
+				    NULL);
+
+	mlog(0, "status return %d from o2net_send_message\n", status);
+
+	return status;
+}
+
+
+static void dlm_leave_domain(struct dlm_ctxt *dlm)
+{
+	int node, clear_node, status;
+
+	/* At this point we've migrated away all our locks and won't
+	 * accept mastership of new ones. The dlm is responsible for
+	 * almost nothing now. We make sure not to confuse any joining
+	 * nodes and then commence shutdown procedure. */
+
+	spin_lock(&dlm->spinlock);
+	/* Clear ourselves from the domain map */
+	clear_bit(dlm->node_num, dlm->domain_map);
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     0)) < O2NM_MAX_NODES) {
+		/* Drop the dlm spinlock. This is safe wrt the domain_map.
+		 * -nodes cannot be added now as the
+		 *   query_join handler knows to respond with OK_NO_MAP
+		 * -we catch the right network errors if a node is
+		 *   removed from the map while we're sending him the
+		 *   exit message. */
+		spin_unlock(&dlm->spinlock);
+
+		clear_node = 1;
+
+		status = dlm_send_one_domain_exit(dlm, node);
+		if (status < 0 &&
+		    status != -ENOPROTOOPT &&
+		    status != -ENOTCONN) {
+			mlog(ML_NOTICE, "Error %d sending domain exit message "
+			     "to node %d\n", status, node);
+
+			/* Not sure what to do here but let's sleep for
+			 * a bit in case this was a transient
+			 * error... */
+			msleep(DLM_DOMAIN_BACKOFF_MS);
+			clear_node = 0;
+		}
+
+		spin_lock(&dlm->spinlock);
+		/* If we're not clearing the node bit then we intend
+		 * to loop back around to try again. */
+		if (clear_node)
+			clear_bit(node, dlm->domain_map);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+int dlm_joined(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+
+	if (dlm->dlm_state == DLM_CTXT_JOINED)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+
+	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm)
+{
+	int leave = 0;
+
+	spin_lock(&dlm_domain_lock);
+	BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
+	BUG_ON(!dlm->num_joins);
+
+	dlm->num_joins--;
+	if (!dlm->num_joins) {
+		/* We mark it "in shutdown" now so new register
+		 * requests wait until we've completely left the
+		 * domain. Don't use DLM_CTXT_LEAVING yet as we still
+		 * want new domain joins to communicate with us at
+		 * least until we've completed migration of our
+		 * resources. */
+		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
+		leave = 1;
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	if (leave) {
+		mlog(0, "shutting down domain %s\n", dlm->name);
+
+		/* We changed dlm state, notify the thread */
+		dlm_kick_thread(dlm, NULL);
+
+		dlm_migrate_all_locks(dlm);
+		dlm_mark_domain_leaving(dlm);
+		dlm_leave_domain(dlm);
+		dlm_complete_dlm_shutdown(dlm);
+	}
+	dlm_put(dlm);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_domain);
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_query_join_request *query;
+	enum dlm_query_join_response response;
+	struct dlm_ctxt *dlm = NULL;
+
+	query = (struct dlm_query_join_request *) msg->buf;
+
+	mlog(0, "node %u wants to join domain %s\n", query->node_idx,
+		  query->domain);
+
+	/*
+	 * If heartbeat doesn't consider the node live, tell it
+	 * to back off and try again.  This gives heartbeat a chance
+	 * to catch up.
+	 */
+	if (!o2hb_check_node_heartbeating(query->node_idx)) {
+		mlog(0, "node %u is not in our live map yet\n",
+		     query->node_idx);
+
+		response = JOIN_DISALLOW;
+		goto respond;
+	}
+
+	response = JOIN_OK_NO_MAP;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+	/* Once the dlm ctxt is marked as leaving then we don't want
+	 * to be put in someone's domain map. */
+	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+		spin_lock(&dlm->spinlock);
+
+		if (dlm->dlm_state == DLM_CTXT_NEW &&
+		    dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* If this is a brand new context and we
+			 * haven't started our join process yet, then
+			 * the other node won the race. */
+			response = JOIN_OK_NO_MAP;
+		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* Disallow parallel joins. */
+			response = JOIN_DISALLOW;
+		} else {
+			/* Alright, we're fully a part of this domain,
+			 * so we keep some state as to who's joining
+			 * and indicate to him what needs to be fixed
+			 * up. */
+			response = JOIN_OK;
+			__dlm_set_joining_node(dlm, query->node_idx);
+		}
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+respond:
+	mlog(0, "We respond with %u\n", response);
+
+	return response;
+}
+
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_assert_joined *assert;
+	struct dlm_ctxt *dlm = NULL;
+
+	assert = (struct dlm_assert_joined *) msg->buf;
+
+	mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
+		  assert->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
+	/* XXX should we consider no dlm ctxt an error? */
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Alright, this node has officially joined our
+		 * domain. Set him in the map and clean up our
+		 * leftover join state. */
+		BUG_ON(dlm->joining_node != assert->node_idx);
+		set_bit(assert->node_idx, dlm->domain_map);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		__dlm_print_nodes(dlm);
+
+		/* notify anything attached to the heartbeat events */
+		dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_cancel_join *cancel;
+	struct dlm_ctxt *dlm = NULL;
+
+	cancel = (struct dlm_cancel_join *) msg->buf;
+
+	mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
+		  cancel->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
+
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Yikes, this guy wants to cancel his join. No
+		 * problem, we simply clean up our join state. */
+		BUG_ON(dlm->joining_node != cancel->node_idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_cancel_join cancel_msg;
+
+	memset(&cancel_msg, 0, sizeof(cancel_msg));
+	cancel_msg.node_idx = dlm->node_num;
+	cancel_msg.name_len = strlen(dlm->name);
+	memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
+
+	status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+				    &cancel_msg, sizeof(cancel_msg), node,
+				    NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	return status;
+}
+
+/* map_size should be in bytes. */
+static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
+				 unsigned long *node_map,
+				 unsigned int map_size)
+{
+	int status, tmpstat;
+	unsigned int node;
+
+	if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
+			 sizeof(unsigned long))) {
+		mlog(ML_ERROR,
+		     "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
+		     map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
+		return -EINVAL;
+	}
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		tmpstat = dlm_send_one_join_cancel(dlm, node);
+		if (tmpstat) {
+			mlog(ML_ERROR, "Error return %d cancelling join on "
+			     "node %d\n", tmpstat, node);
+			if (!status)
+				status = tmpstat;
+		}
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int dlm_request_join(struct dlm_ctxt *dlm,
+			    int node,
+			    enum dlm_query_join_response *response)
+{
+	int status, retval;
+	struct dlm_query_join_request join_msg;
+
+	mlog(0, "querying node %d\n", node);
+
+	memset(&join_msg, 0, sizeof(join_msg));
+	join_msg.node_idx = dlm->node_num;
+	join_msg.name_len = strlen(dlm->name);
+	memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+
+	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
+				    sizeof(join_msg), node, &retval);
+	if (status < 0 && status != -ENOPROTOOPT) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* -ENOPROTOOPT from the net code means the other side isn't
+	 * listening for our message type -- that's fine, it means
+	 * its dlm isn't up, so we can consider it a 'yes' but not
+	 * joined into the domain. */
+	if (status == -ENOPROTOOPT) {
+		status = 0;
+		*response = JOIN_OK_NO_MAP;
+	} else if (retval == JOIN_DISALLOW ||
+		   retval == JOIN_OK ||
+		   retval == JOIN_OK_NO_MAP) {
+		*response = retval;
+	} else {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
+		     node);
+	}
+
+	mlog(0, "status %d, node %d response is %d\n", status, node,
+		  *response);
+
+bail:
+	return status;
+}
+
+static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_assert_joined assert_msg;
+
+	mlog(0, "Sending join assert to node %u\n", node);
+
+	memset(&assert_msg, 0, sizeof(assert_msg));
+	assert_msg.node_idx = dlm->node_num;
+	assert_msg.name_len = strlen(dlm->name);
+	memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
+
+	status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+				    &assert_msg, sizeof(assert_msg), node,
+				    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
+				  unsigned long *node_map)
+{
+	int status, node, live;
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		do {
+			/* It is very important that this message be
+			 * received so we spin until either the node
+			 * has died or it gets the message. */
+			status = dlm_send_one_join_assert(dlm, node);
+
+			spin_lock(&dlm->spinlock);
+			live = test_bit(node, dlm->live_nodes_map);
+			spin_unlock(&dlm->spinlock);
+
+			if (status) {
+				mlog(ML_ERROR, "Error return %d asserting "
+				     "join on node %d\n", status, node);
+
+				/* give us some time between errors... */
+				if (live)
+					msleep(DLM_DOMAIN_BACKOFF_MS);
+			}
+		} while (status && live);
+	}
+}
+
+struct domain_join_ctxt {
+	unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+static int dlm_should_restart_join(struct dlm_ctxt *dlm,
+				   struct domain_join_ctxt *ctxt,
+				   enum dlm_query_join_response response)
+{
+	int ret;
+
+	if (response == JOIN_DISALLOW) {
+		mlog(0, "Latest response of disallow -- should restart\n");
+		return 1;
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* For now, we restart the process if the node maps have
+	 * changed at all */
+	ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
+		     sizeof(dlm->live_nodes_map));
+	spin_unlock(&dlm->spinlock);
+
+	if (ret)
+		mlog(0, "Node maps changed -- should restart\n");
+
+	return ret;
+}
+
+static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
+{
+	int status = 0, tmpstat, node;
+	struct domain_join_ctxt *ctxt;
+	enum dlm_query_join_response response;
+
+	mlog_entry("%p", dlm);
+
+	ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
+	if (!ctxt) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* group sem locking should work for us here -- we're already
+	 * registered for heartbeat events so filling this should be
+	 * atomic wrt getting those handlers called. */
+	o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+
+	spin_lock(&dlm->spinlock);
+	memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
+
+	__dlm_set_joining_node(dlm, dlm->node_num);
+
+	spin_unlock(&dlm->spinlock);
+
+	node = -1;
+	while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		status = dlm_request_join(dlm, node, &response);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Ok, either we got a response or the node doesn't have a
+		 * dlm up. */
+		if (response == JOIN_OK)
+			set_bit(node, ctxt->yes_resp_map);
+
+		if (dlm_should_restart_join(dlm, ctxt, response)) {
+			status = -EAGAIN;
+			goto bail;
+		}
+	}
+
+	mlog(0, "Yay, done querying nodes!\n");
+
+	/* Yay, everyone agrees we can join the domain. My domain is
+	 * made up of all nodes that were put in the
+	 * yes_resp_map. Copy that into our domain map and send a join
+	 * assert message to clean up everyone else's state. */
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->domain_map, ctxt->yes_resp_map,
+	       sizeof(ctxt->yes_resp_map));
+	set_bit(dlm->node_num, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
+
+	/* Joined state *must* be set before the joining node
+	 * information, otherwise the query_join handler may read no
+	 * current joiner but a state of NEW and tell joining nodes
+	 * we're not in the domain. */
+	spin_lock(&dlm_domain_lock);
+	dlm->dlm_state = DLM_CTXT_JOINED;
+	dlm->num_joins++;
+	spin_unlock(&dlm_domain_lock);
+
+bail:
+	spin_lock(&dlm->spinlock);
+	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	if (!status)
+		__dlm_print_nodes(dlm);
+	spin_unlock(&dlm->spinlock);
+
+	if (ctxt) {
+		/* Do we need to send a cancel message to any nodes? */
+		if (status < 0) {
+			tmpstat = dlm_send_join_cancels(dlm,
+							ctxt->yes_resp_map,
+							sizeof(ctxt->yes_resp_map));
+			if (tmpstat < 0)
+				mlog_errno(tmpstat);
+		}
+		kfree(ctxt);
+	}
+
+	mlog(0, "returning %d\n", status);
+	return status;
+}
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
+{
+	o2hb_unregister_callback(&dlm->dlm_hb_up);
+	o2hb_unregister_callback(&dlm->dlm_hb_down);
+	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
+}
+
+static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	mlog(0, "registering handlers.\n");
+
+	o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
+			    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	status = o2hb_register_callback(&dlm->dlm_hb_down);
+	if (status)
+		goto bail;
+
+	o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+			    dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	status = o2hb_register_callback(&dlm->dlm_hb_up);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_master_request),
+					dlm_master_request_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
+					sizeof(struct dlm_assert_master),
+					dlm_assert_master_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
+					sizeof(struct dlm_create_lock),
+					dlm_create_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
+					DLM_CONVERT_LOCK_MAX_LEN,
+					dlm_convert_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					DLM_UNLOCK_LOCK_MAX_LEN,
+					dlm_unlock_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
+					DLM_PROXY_AST_MAX_LEN,
+					dlm_proxy_ast_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
+					sizeof(struct dlm_exit_domain),
+					dlm_exit_domain_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_migrate_request),
+					dlm_migrate_request_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
+					DLM_MIG_LOCKRES_MAX_LEN,
+					dlm_mig_lockres_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
+					sizeof(struct dlm_master_requery),
+					dlm_master_requery_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_lock_request),
+					dlm_request_all_locks_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
+					sizeof(struct dlm_reco_data_done),
+					dlm_reco_data_done_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
+					sizeof(struct dlm_begin_reco),
+					dlm_begin_reco_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
+					sizeof(struct dlm_finalize_reco),
+					dlm_finalize_reco_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+bail:
+	if (status)
+		dlm_unregister_domain_handlers(dlm);
+
+	return status;
+}
+
+static int dlm_join_domain(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	BUG_ON(!dlm);
+
+	mlog(0, "Join domain %s\n", dlm->name);
+
+	status = dlm_register_domain_handlers(dlm);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_recovery_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	do {
+		unsigned int backoff;
+		status = dlm_try_to_join_domain(dlm);
+
+		/* If we're racing another node to the join, then we
+		 * need to back off temporarily and let them
+		 * complete. */
+		if (status == -EAGAIN) {
+			if (signal_pending(current)) {
+				status = -ERESTARTSYS;
+				goto bail;
+			}
+
+			/*
+			 * <chip> After you!
+			 * <dale> No, after you!
+			 * <chip> I insist!
+			 * <dale> But you first!
+			 * ...
+			 */
+			backoff = (unsigned int)(jiffies & 0x3);
+			backoff *= DLM_DOMAIN_BACKOFF_MS;
+			mlog(0, "backoff %d\n", backoff);
+			msleep(backoff);
+		}
+	} while (status == -EAGAIN);
+
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	wake_up(&dlm_domain_events);
+
+	if (status) {
+		dlm_unregister_domain_handlers(dlm);
+		dlm_complete_thread(dlm);
+		dlm_complete_recovery_thread(dlm);
+	}
+
+	return status;
+}
+
+static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
+				u32 key)
+{
+	int i;
+	struct dlm_ctxt *dlm = NULL;
+
+	dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
+	if (!dlm) {
+		mlog_errno(-ENOMEM);
+		goto leave;
+	}
+
+	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		mlog_errno(-ENOMEM);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->resources) {
+		mlog_errno(-ENOMEM);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+	memset(dlm->resources, 0, PAGE_SIZE);
+
+	for (i=0; i<DLM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&dlm->resources[i]);
+
+	strcpy(dlm->name, domain);
+	dlm->key = key;
+	dlm->node_num = o2nm_this_node();
+
+	spin_lock_init(&dlm->spinlock);
+	spin_lock_init(&dlm->master_lock);
+	spin_lock_init(&dlm->ast_lock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.received);
+	INIT_LIST_HEAD(&dlm->reco.node_data);
+	INIT_LIST_HEAD(&dlm->purge_list);
+	INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+	dlm->reco.state = 0;
+
+	INIT_LIST_HEAD(&dlm->pending_asts);
+	INIT_LIST_HEAD(&dlm->pending_basts);
+
+	mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
+		  dlm->recovery_map, &(dlm->recovery_map[0]));
+
+	memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
+	memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
+	memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+
+	dlm->dlm_thread_task = NULL;
+	dlm->dlm_reco_thread_task = NULL;
+	init_waitqueue_head(&dlm->dlm_thread_wq);
+	init_waitqueue_head(&dlm->dlm_reco_thread_wq);
+	init_waitqueue_head(&dlm->reco.event);
+	init_waitqueue_head(&dlm->ast_wq);
+	init_waitqueue_head(&dlm->migration_wq);
+	INIT_LIST_HEAD(&dlm->master_list);
+	INIT_LIST_HEAD(&dlm->mle_hb_events);
+
+	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
+	init_waitqueue_head(&dlm->dlm_join_events);
+
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	atomic_set(&dlm->local_resources, 0);
+	atomic_set(&dlm->remote_resources, 0);
+	atomic_set(&dlm->unknown_resources, 0);
+
+	spin_lock_init(&dlm->work_lock);
+	INIT_LIST_HEAD(&dlm->work_list);
+	INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
+
+	kref_init(&dlm->dlm_refs);
+	dlm->dlm_state = DLM_CTXT_NEW;
+
+	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
+
+	mlog(0, "context init: refcount %u\n",
+		  atomic_read(&dlm->dlm_refs.refcount));
+
+leave:
+	return dlm;
+}
+
+/*
+ * dlm_register_domain: one-time setup per "domain"
+ */
+struct dlm_ctxt * dlm_register_domain(const char *domain,
+			       u32 key)
+{
+	int ret;
+	struct dlm_ctxt *dlm = NULL;
+	struct dlm_ctxt *new_ctxt = NULL;
+
+	if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		mlog(ML_ERROR, "domain name length too long\n");
+		goto leave;
+	}
+
+	if (!o2hb_check_local_node_heartbeating()) {
+		mlog(ML_ERROR, "the local node has not been configured, or is "
+		     "not heartbeating\n");
+		ret = -EPROTO;
+		goto leave;
+	}
+
+	mlog(0, "register called for domain \"%s\"\n", domain);
+
+retry:
+	dlm = NULL;
+	if (signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	spin_lock(&dlm_domain_lock);
+
+	dlm = __dlm_lookup_domain(domain);
+	if (dlm) {
+		if (dlm->dlm_state != DLM_CTXT_JOINED) {
+			spin_unlock(&dlm_domain_lock);
+
+			mlog(0, "This ctxt is not joined yet!\n");
+			wait_event_interruptible(dlm_domain_events,
+						 dlm_wait_on_domain_helper(
+							 domain));
+			goto retry;
+		}
+
+		__dlm_get(dlm);
+		dlm->num_joins++;
+
+		spin_unlock(&dlm_domain_lock);
+
+		ret = 0;
+		goto leave;
+	}
+
+	/* doesn't exist */
+	if (!new_ctxt) {
+		spin_unlock(&dlm_domain_lock);
+
+		new_ctxt = dlm_alloc_ctxt(domain, key);
+		if (new_ctxt)
+			goto retry;
+
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	/* a little variable switch-a-roo here... */
+	dlm = new_ctxt;
+	new_ctxt = NULL;
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	ret = dlm_join_domain(dlm);
+	if (ret) {
+		mlog_errno(ret);
+		dlm_put(dlm);
+		goto leave;
+	}
+
+	ret = 0;
+leave:
+	if (new_ctxt)
+		dlm_free_ctxt_mem(new_ctxt);
+
+	if (ret < 0)
+		dlm = ERR_PTR(ret);
+
+	return dlm;
+}
+EXPORT_SYMBOL_GPL(dlm_register_domain);
+
+static LIST_HEAD(dlm_join_handlers);
+
+static void dlm_unregister_net_handlers(void)
+{
+	o2net_unregister_handler_list(&dlm_join_handlers);
+}
+
+static int dlm_register_net_handlers(void)
+{
+	int status = 0;
+
+	status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_query_join_request),
+					dlm_query_join_handler,
+					NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_assert_joined),
+					dlm_assert_joined_handler,
+					NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_cancel_join),
+					dlm_cancel_join_handler,
+					NULL, &dlm_join_handlers);
+
+bail:
+	if (status < 0)
+		dlm_unregister_net_handlers();
+
+	return status;
+}
+
+/* Domain eviction callback handling.
+ *
+ * The file system requires notification of node death *before* the
+ * dlm completes its recovery work, otherwise it may be able to
+ * acquire locks on resources requiring recovery. Since the dlm can
+ * evict a node from its domain *before* heartbeat fires, a similar
+ * mechanism is required. */
+
+/* Eviction is not expected to happen often, so a per-domain lock is
+ * not necessary. Eviction callbacks are allowed to sleep for short
+ * periods of time. */
+static DECLARE_RWSEM(dlm_callback_sem);
+
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num)
+{
+	struct list_head *iter;
+	struct dlm_eviction_cb *cb;
+
+	down_read(&dlm_callback_sem);
+	list_for_each(iter, &dlm->dlm_eviction_callbacks) {
+		cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
+
+		cb->ec_func(node_num, cb->ec_data);
+	}
+	up_read(&dlm_callback_sem);
+}
+
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data)
+{
+	INIT_LIST_HEAD(&cb->ec_item);
+	cb->ec_func = f;
+	cb->ec_data = data;
+}
+EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
+
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
+
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_del_init(&cb->ec_item);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
+
+static int __init dlm_init(void)
+{
+	int status;
+
+	dlm_print_version();
+
+	status = dlm_init_mle_cache();
+	if (status)
+		return -1;
+
+	status = dlm_register_net_handlers();
+	if (status) {
+		dlm_destroy_mle_cache();
+		return -1;
+	}
+
+	return 0;
+}
+
+static void __exit dlm_exit (void)
+{
+	dlm_unregister_net_handlers();
+	dlm_destroy_mle_cache();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(dlm_init);
+module_exit(dlm_exit);
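
dlm_register_domain(), dlm_unregister_domain() and the eviction-callback helpers are the exported surface of this file. The sketch below shows how a consumer such as a file system might wire them together; it is hypothetical and not part of the commit, and MY_KEY and my_handle_eviction() are invented names for illustration.

	/* Hypothetical consumer sketch: join a domain and hear about node
	 * eviction before dlm recovery completes.  MY_KEY and
	 * my_handle_eviction() are invented names. */
	static struct dlm_eviction_cb my_cb;

	static void my_handle_eviction(int node_num, void *data)
	{
		printk(KERN_NOTICE "node %d evicted from our domain\n",
		       node_num);
	}

	static struct dlm_ctxt *sketch_join(const char *domain)
	{
		struct dlm_ctxt *dlm;

		dlm = dlm_register_domain(domain, MY_KEY);
		if (IS_ERR(dlm))	/* ERR_PTR on failure, see above */
			return dlm;

		dlm_setup_eviction_cb(&my_cb, my_handle_eviction, NULL);
		dlm_register_eviction_cb(dlm, &my_cb);
		return dlm;
	}

	static void sketch_leave(struct dlm_ctxt *dlm)
	{
		dlm_unregister_eviction_cb(&my_cb);
		dlm_unregister_domain(dlm);	/* drops our join reference */
	}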

+ 36 - 0
fs/ocfs2/dlm/dlmdomain.h

@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDOMAIN_H
+#define DLMDOMAIN_H
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+
+int dlm_joined(struct dlm_ctxt *dlm);
+int dlm_shutting_down(struct dlm_ctxt *dlm);
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num);
+
+#endif

+ 640 - 0
fs/ocfs2/dlm/dlmfs.c

@@ -0,0 +1,640 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+
+#include <asm/uaccess.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#include "dlmfsver.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static struct super_operations dlmfs_ops;
+static struct file_operations dlmfs_file_operations;
+static struct inode_operations dlmfs_dir_inode_operations;
+static struct inode_operations dlmfs_root_inode_operations;
+static struct inode_operations dlmfs_file_inode_operations;
+static kmem_cache_t *dlmfs_inode_cache;
+
+struct workqueue_struct *user_dlm_worker;
+
+/*
+ * decodes a set of open flags into a valid lock level and a set of flags.
+ * returns < 0 if we have invalid flags
+ * flags which mean something to us:
+ * O_RDONLY -> PRMODE level
+ * O_WRONLY -> EXMODE level
+ *
+ * O_NONBLOCK -> LKM_NOQUEUE
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	if (open_flags & (O_WRONLY|O_RDWR))
+		*level = LKM_EXMODE;
+	else
+		*level = LKM_PRMODE;
+
+	*flags = 0;
+	if (open_flags & O_NONBLOCK)
+		*flags |= LKM_NOQUEUE;
+
+	return 0;
+}
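
Put differently, the open(2) mode *is* the lock request: O_RDONLY takes a
shared (PR) lock, O_WRONLY or O_RDWR takes an exclusive (EX) lock, and
O_NONBLOCK turns the open into a trylock. A hedged userspace sketch,
assuming dlmfs is mounted at /dlm (a conventional mount point, see the
mount example near the end of this file) and that the "mydomain" domain
directory already exists via mkdir(2); note from dlmfs_file_open() below
that a failed trylock surfaces as ETXTBSY:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* O_WRONLY -> LKM_EXMODE, O_NONBLOCK -> LKM_NOQUEUE (trylock);
	 * the lock file is created on first open */
	int fd = open("/dlm/mydomain/mylock",
		      O_CREAT | O_WRONLY | O_NONBLOCK, 0600);

	if (fd < 0) {
		if (errno == ETXTBSY)
			fprintf(stderr, "lock held elsewhere\n");
		else
			perror("open");
		return 1;
	}

	/* exclusive critical section goes here */

	close(fd);	/* dlmfs_file_release() drops the cluster lock */
	return 0;
}
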
+
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int status, level, flags;
+	struct dlmfs_filp_private *fp = NULL;
+	struct dlmfs_inode_private *ip;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
+	if (status < 0)
+		goto bail;
+
+	/* We don't want to honor O_APPEND at read/write time as it
+	 * doesn't make sense for LVB writes. */
+	file->f_flags &= ~O_APPEND;
+
+	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	if (!fp) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	fp->fp_lock_level = level;
+
+	ip = DLMFS_I(inode);
+
+	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
+	if (status < 0) {
+		/* this is a strange error to return here but I want
+		 * userspace to be able to distinguish a valid lock
+		 * request from one that simply couldn't be granted. */
+		if (flags & LKM_NOQUEUE && status == -EAGAIN)
+			status = -ETXTBSY;
+		kfree(fp);
+		goto bail;
+	}
+
+	file->private_data = fp;
+bail:
+	return status;
+}
+
+static int dlmfs_file_release(struct inode *inode,
+			      struct file *file)
+{
+	int level;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+	struct dlmfs_filp_private *fp =
+		(struct dlmfs_filp_private *) file->private_data;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "close called on inode %lu\n", inode->i_ino);
+
+	if (fp) {
+		level = fp->fp_lock_level;
+		if (level != LKM_IVMODE)
+			user_dlm_cluster_unlock(&ip->ip_lockres, level);
+
+		kfree(fp);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+static ssize_t dlmfs_file_read(struct file *filp,
+			       char __user *buf,
+			       size_t count,
+			       loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t readlen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	/* don't read past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		readlen = i_size_read(inode) - *ppos;
+	else
+		readlen = count;
+
+	lvb_buf = kmalloc(readlen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	user_dlm_read_lvb(inode, lvb_buf, readlen);
+	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+	readlen -= bytes_left;
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + readlen;
+
+	mlog(0, "read %zd bytes\n", readlen);
+	return readlen;
+}
+
+static ssize_t dlmfs_file_write(struct file *filp,
+				const char __user *buf,
+				size_t count,
+				loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t writelen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return -ENOSPC;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* don't write past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		writelen = i_size_read(inode) - *ppos;
+	else
+		writelen = count;
+
+	lvb_buf = kmalloc(writelen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	bytes_left = copy_from_user(lvb_buf, buf, writelen);
+	writelen -= bytes_left;
+	if (writelen)
+		user_dlm_write_lvb(inode, lvb_buf, writelen);
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + writelen;
+	mlog(0, "wrote %zd bytes\n", writelen);
+	return writelen;
+}
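
Reads and writes on an open lock file move the lock value block (LVB)
rather than ordinary file data: read() copies the current LVB out under
whatever lock level the open acquired, and write() stores new contents.
A short continuation of the sketch above, assuming fd came from an O_RDWR
open (EX mode, so both directions are permitted); the buffer size is an
assumption, since the file size is fixed at DLM_LVB_LEN by
dlmfs_get_inode() below but the constant's value is not spelled out in
this file:

char lvb[64];	/* assumed >= DLM_LVB_LEN; a shorter buffer just reads less */
ssize_t n;

n = read(fd, lvb, sizeof(lvb));		/* copy the current LVB out */
if (n > 0) {
	lvb[0]++;			/* mutate it while holding EX */
	lseek(fd, 0, SEEK_SET);		/* writes at EOF return -ENOSPC */
	n = write(fd, lvb, (size_t)n);	/* publish the new LVB */
}
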
+
+static void dlmfs_init_once(void *foo,
+			    kmem_cache_t *cachep,
+			    unsigned long flags)
+{
+	struct dlmfs_inode_private *ip =
+		(struct dlmfs_inode_private *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		ip->ip_dlm = NULL;
+		ip->ip_parent = NULL;
+
+		inode_init_once(&ip->ip_vfs_inode);
+	}
+}
+
+static struct inode *dlmfs_alloc_inode(struct super_block *sb)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
+	if (!ip)
+		return NULL;
+
+	return &ip->ip_vfs_inode;
+}
+
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
+}
+
+static void dlmfs_clear_inode(struct inode *inode)
+{
+	int status;
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return;
+
+	mlog(0, "inode %lu\n", inode->i_ino);
+
+	ip = DLMFS_I(inode);
+
+	if (S_ISREG(inode->i_mode)) {
+		status = user_dlm_destroy_lock(&ip->ip_lockres);
+		if (status < 0)
+			mlog_errno(status);
+		iput(ip->ip_parent);
+		goto clear_fields;
+	}
+
+	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+	/* we must be a directory. If required, let's unregister the
+	 * dlm context now. */
+	if (ip->ip_dlm)
+		user_dlm_unregister_context(ip->ip_dlm);
+clear_fields:
+	ip->ip_parent = NULL;
+	ip->ip_dlm = NULL;
+}
+
+static struct backing_dev_info dlmfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *dlmfs_get_root_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	int mode = S_IFDIR | 0755;
+	struct dlmfs_inode_private *ip;
+
+	if (inode) {
+		ip = DLMFS_I(inode);
+
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_nlink++;
+
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &dlmfs_root_inode_operations;
+	}
+
+	return inode;
+}
+
+static struct inode *dlmfs_get_inode(struct inode *parent,
+				     struct dentry *dentry,
+				     int mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode * inode = new_inode(sb);
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ip = DLMFS_I(inode);
+	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+
+	switch (mode & S_IFMT) {
+	default:
+		/* for now we don't support anything other than
+		 * directories and regular files. */
+		BUG();
+		break;
+	case S_IFREG:
+		inode->i_op = &dlmfs_file_inode_operations;
+		inode->i_fop = &dlmfs_file_operations;
+
+		i_size_write(inode,  DLM_LVB_LEN);
+
+		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
+
+		/* released at clear_inode time, this ensures that we
+		 * get to drop the dlm reference on each lock *before*
+		 * we call the unregister code for releasing parent
+		 * directories. */
+		ip->ip_parent = igrab(parent);
+		BUG_ON(!ip->ip_parent);
+		break;
+	case S_IFDIR:
+		inode->i_op = &dlmfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink ==
+		 * 2 (for "." entry) */
+		inode->i_nlink++;
+		break;
+	}
+
+	if (parent->i_mode & S_ISGID) {
+		inode->i_gid = parent->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done.
+ */
+/* SMP-safe */
+static int dlmfs_mkdir(struct inode * dir,
+		       struct dentry * dentry,
+		       int mode)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct qstr *domain = &dentry->d_name;
+	struct dlmfs_inode_private *ip;
+	struct dlm_ctxt *dlm;
+
+	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
+
+	/* verify that we have a proper domain */
+	if (domain->len >= O2NM_MAX_NAME_LEN) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid domain name for directory.\n");
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ip = DLMFS_I(inode);
+
+	dlm = user_dlm_register_context(domain);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
+		     status, domain->len, domain->name);
+		goto bail;
+	}
+	ip->ip_dlm = dlm;
+
+	dir->i_nlink++;
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	status = 0;
+bail:
+	if (status < 0)
+		iput(inode);
+	return status;
+}
+
+static int dlmfs_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int status = 0;
+	struct inode *inode;
+	struct qstr *name = &dentry->d_name;
+
+	mlog(0, "create %.*s\n", name->len, name->name);
+
+	/* verify name is valid and doesn't contain any dlm reserved
+	 * characters */
+	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
+	    name->name[0] == '$') {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
+		     name->name);
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+bail:
+	return status;
+}
+
+static int dlmfs_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	struct inode *inode = dentry->d_inode;
+
+	mlog(0, "unlink inode %lu\n", inode->i_ino);
+
+	/* if there are no current holders, or none that are waiting
+	 * to acquire a lock, this basically destroys our lockres. */
+	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
+	if (status < 0) {
+		mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
+		     dentry->d_name.len, dentry->d_name.name, status);
+		goto bail;
+	}
+	status = simple_unlink(dir, dentry);
+bail:
+	return status;
+}
+
+static int dlmfs_fill_super(struct super_block * sb,
+			    void * data,
+			    int silent)
+{
+	struct inode * inode;
+	struct dentry * root;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = DLMFS_MAGIC;
+	sb->s_op = &dlmfs_ops;
+	inode = dlmfs_get_root_inode(sb);
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static struct file_operations dlmfs_file_operations = {
+	.open		= dlmfs_file_open,
+	.release	= dlmfs_file_release,
+	.read		= dlmfs_file_read,
+	.write		= dlmfs_file_write,
+};
+
+static struct inode_operations dlmfs_dir_inode_operations = {
+	.create		= dlmfs_create,
+	.lookup		= simple_lookup,
+	.unlink		= dlmfs_unlink,
+};
+
+/* this way we can restrict mkdir to only the toplevel of the fs. */
+static struct inode_operations dlmfs_root_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= dlmfs_mkdir,
+	.rmdir		= simple_rmdir,
+};
+
+static struct super_operations dlmfs_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= dlmfs_alloc_inode,
+	.destroy_inode	= dlmfs_destroy_inode,
+	.clear_inode	= dlmfs_clear_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct inode_operations dlmfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
+
+static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+}
+
+static struct file_system_type dlmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ocfs2_dlmfs",
+	.get_sb		= dlmfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
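
With the type registered as "ocfs2_dlmfs" (a nodev filesystem whose
dentries are torn down by kill_litter_super()), reaching the mkdir/open
usage sketched earlier only requires a mount. A hedged C equivalent of the
usual one-liner; the /dlm mount point is a convention, nothing in this
file requires it:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of: mount -t ocfs2_dlmfs none /dlm */
	if (mount("none", "/dlm", "ocfs2_dlmfs", 0, NULL) < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
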
+
+static int __init init_dlmfs_fs(void)
+{
+	int status;
+	int cleanup_inode = 0, cleanup_worker = 0;
+
+	dlmfs_print_version();
+
+	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
+				sizeof(struct dlmfs_inode_private),
+				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+				dlmfs_init_once, NULL);
+	if (!dlmfs_inode_cache)
+		return -ENOMEM;
+	cleanup_inode = 1;
+
+	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	if (!user_dlm_worker) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_worker = 1;
+
+	status = register_filesystem(&dlmfs_fs_type);
+bail:
+	if (status) {
+		if (cleanup_inode)
+			kmem_cache_destroy(dlmfs_inode_cache);
+		if (cleanup_worker)
+			destroy_workqueue(user_dlm_worker);
+	} else
+		printk(KERN_INFO "OCFS2 User DLM kernel interface loaded\n");
+	return status;
+}
+
+static void __exit exit_dlmfs_fs(void)
+{
+	unregister_filesystem(&dlmfs_fs_type);
+
+	flush_workqueue(user_dlm_worker);
+	destroy_workqueue(user_dlm_worker);
+
+	if (kmem_cache_destroy(dlmfs_inode_cache))
+		printk(KERN_INFO "dlmfs_inode_cache: not all structures "
+		       "were freed\n");
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_dlmfs_fs);
+module_exit(exit_dlmfs_fs);

+ 42 - 0
fs/ocfs2/dlm/dlmfsver.c

@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmfsver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
+
+void dlmfs_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);

+ 31 - 0
fs/ocfs2/dlm/dlmfsver.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLMFS_VER_H
+#define DLMFS_VER_H
+
+void dlmfs_print_version(void);
+
+#endif /* DLMFS_VER_H */

+ 676 - 0
fs/ocfs2/dlm/dlmlock.c

@@ -0,0 +1,676 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmlock.c
+ *
+ * underlying calls for lock creation
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags);
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie);
+static void dlm_lock_release(struct kref *kref);
+static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+
+/* Tell us whether we can grant a new lock request.
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         none
+ *   held on exit:  none
+ * returns: 1 if the lock can be granted, 0 otherwise.
+ */
+static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
+				  struct dlm_lock *lock)
+{
+	struct list_head *iter;
+	struct dlm_lock *tmplock;
+
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	return 1;
+}
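
dlm_lock_compatible() itself is defined elsewhere in this patch
(dlmcommon.h), but for the three modes dlmlock() accepts further down it
reduces to the classic shared/exclusive table: NL is compatible with
everything, PR only with PR and NL, and EX only with NL. An equivalent
sketch, as an illustration rather than the in-tree body:

/* nonzero if a new lock of mode `request' can coexist with an
 * already-granted or converting lock of mode `existing' */
static int mode_compatible(int existing, int request)
{
	if (existing == LKM_NLMODE || request == LKM_NLMODE)
		return 1;		/* NL conflicts with nothing */
	if (existing == LKM_PRMODE && request == LKM_PRMODE)
		return 1;		/* readers share */
	return 0;			/* anything involving EX blocks */
}
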
+
+/* performs lock creation at the lockres master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOTQUEUED
+ */
+static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status = DLM_NORMAL;
+
+	mlog_entry("type=%d\n", lock->ml.type);
+
+	spin_lock(&res->spinlock);
+	/* if called from dlm_create_lock_handler, need to
+	 * ensure it will not sleep in dlm_wait_on_lockres */
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL &&
+	    lock->ml.node != dlm->node_num) {
+		/* erf.  state changed after lock was dropped. */
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		return status;
+	}
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+
+	if (dlm_can_grant_new_lock(res, lock)) {
+		mlog(0, "I can grant this lock right away\n");
+		/* got it right away */
+		lock->lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+
+		/* for the recovery lock, we can't allow the ast
+		 * to be queued since the dlmthread is already
+		 * frozen.  but the recovery lock is always locked
+		 * with LKM_NOQUEUE so we do not need the ast in
+		 * this special case */
+		if (!dlm_is_recovery_lock(res->lockname.name,
+					  res->lockname.len)) {
+			kick_thread = 1;
+			call_ast = 1;
+		}
+	} else {
+		/* for NOQUEUE request, unless we get the
+		 * lock right away, return DLM_NOTQUEUED */
+		if (flags & LKM_NOQUEUE)
+			status = DLM_NOTQUEUED;
+		else {
+			dlm_lock_get(lock);
+			list_add_tail(&lock->list, &res->blocked);
+			kick_thread = 1;
+		}
+	}
+
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	dlm_lockres_calc_usage(dlm, res);
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
+
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock)
+{
+	/* remove from local queue if it failed */
+	list_del_init(&lock->list);
+	lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+}
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_DENIED, DLM_RECOVERING, or net status
+ */
+static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	enum dlm_status status = DLM_DENIED;
+
+	mlog_entry("type=%d\n", lock->ml.type);
+	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+	     res->lockname.name, flags);
+
+	spin_lock(&res->spinlock);
+
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* add lock to local (secondary) queue */
+	dlm_lock_get(lock);
+	list_add_tail(&lock->list, &res->blocked);
+	lock->lock_pending = 1;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->lock_pending = 0;
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_lock(res, lock);
+		dlm_lock_put(lock);
+	}
+	spin_unlock(&res->spinlock);
+
+	dlm_lockres_calc_usage(dlm, res);
+
+	wake_up(&res->wq);
+	return status;
+}
+
+
+/* for remote lock creation.
+ * locking:
+ *   caller needs:  none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, or net status
+ */
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags)
+{
+	struct dlm_create_lock create;
+	int tmpret, status = 0;
+	enum dlm_status ret;
+
+	mlog_entry_void();
+
+	memset(&create, 0, sizeof(create));
+	create.node_idx = dlm->node_num;
+	create.requested_type = lock->ml.type;
+	create.cookie = lock->ml.cookie;
+	create.namelen = res->lockname.len;
+	create.flags = cpu_to_be32(flags);
+	memcpy(create.name, res->lockname.name, create.namelen);
+
+	tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
+				    sizeof(create), res->owner, &status);
+	if (tmpret >= 0) {
+		// successfully sent and received
+		ret = status;  // this is already a dlm_status
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			ret = DLM_RECOVERING;
+			mlog(0, "node %u died so returning DLM_RECOVERING "
+			     "from lock message!\n", res->owner);
+		} else {
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+void dlm_lock_get(struct dlm_lock *lock)
+{
+	kref_get(&lock->lock_refs);
+}
+
+void dlm_lock_put(struct dlm_lock *lock)
+{
+	kref_put(&lock->lock_refs, dlm_lock_release);
+}
+
+static void dlm_lock_release(struct kref *kref)
+{
+	struct dlm_lock *lock;
+
+	lock = container_of(kref, struct dlm_lock, lock_refs);
+
+	BUG_ON(!list_empty(&lock->list));
+	BUG_ON(!list_empty(&lock->ast_list));
+	BUG_ON(!list_empty(&lock->bast_list));
+	BUG_ON(lock->ast_pending);
+	BUG_ON(lock->bast_pending);
+
+	dlm_lock_detach_lockres(lock);
+
+	if (lock->lksb_kernel_allocated) {
+		mlog(0, "freeing kernel-allocated lksb\n");
+		kfree(lock->lksb);
+	}
+	kfree(lock);
+}
+
+/* associate a lock with its lockres, getting a ref on the lockres */
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res)
+{
+	dlm_lockres_get(res);
+	lock->lockres = res;
+}
+
+/* drop ref on lockres, if there is still one associated with lock */
+static void dlm_lock_detach_lockres(struct dlm_lock *lock)
+{
+	struct dlm_lock_resource *res;
+
+	res = lock->lockres;
+	if (res) {
+		lock->lockres = NULL;
+		mlog(0, "removing lock's lockres reference\n");
+		dlm_lockres_put(res);
+	}
+}
+
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie)
+{
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	INIT_LIST_HEAD(&newlock->bast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->ml.type = type;
+	newlock->ml.convert_type = LKM_IVMODE;
+	newlock->ml.highest_blocked = LKM_IVMODE;
+	newlock->ml.node = node;
+	newlock->ml.pad1 = 0;
+	newlock->ml.list = 0;
+	newlock->ml.flags = 0;
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->ml.cookie = cpu_to_be64(cookie);
+	newlock->ast_pending = 0;
+	newlock->bast_pending = 0;
+	newlock->convert_pending = 0;
+	newlock->lock_pending = 0;
+	newlock->unlock_pending = 0;
+	newlock->cancel_pending = 0;
+	newlock->lksb_kernel_allocated = 0;
+
+	kref_init(&newlock->lock_refs);
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb)
+{
+	struct dlm_lock *lock;
+	int kernel_allocated = 0;
+
+	lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+	if (!lock)
+		return NULL;
+
+	if (!lksb) {
+		/* zero memory only if kernel-allocated */
+		lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+		if (!lksb) {
+			kfree(lock);
+			return NULL;
+		}
+		kernel_allocated = 1;
+	}
+
+	dlm_init_lock(lock, type, node, cookie);
+	if (kernel_allocated)
+		lock->lksb_kernel_allocated = 1;
+	lock->lksb = lksb;
+	lksb->lockid = lock;
+	return lock;
+}
+
+/* handler for lock creation net message
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
+ */
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	char *name;
+	unsigned int namelen;
+
+	BUG_ON(!dlm);
+
+	mlog_entry_void();
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = create->name;
+	namelen = create->namelen;
+
+	status = DLM_IVBUFLEN;
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	status = DLM_SYSERR;
+	newlock = dlm_new_lock(create->requested_type,
+			       create->node_idx,
+			       be64_to_cpu(create->cookie), NULL);
+	if (!newlock) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	lksb = newlock->lksb;
+
+	if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
+		lksb->flags |= DLM_LKSB_GET_LVB;
+		mlog(0, "set DLM_LKSB_GET_LVB flag\n");
+	}
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, name, namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		mlog(0, "lockres recovering/migrating/in-progress\n");
+		goto leave;
+	}
+
+	dlm_lock_attach_lockres(newlock, res);
+
+	status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
+leave:
+	if (status != DLM_NORMAL)
+		if (newlock)
+			dlm_lock_put(newlock);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
+static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
+{
+	u64 tmpnode = node_num;
+
+	/* shift single byte of node num into top 8 bits */
+	tmpnode <<= 56;
+
+	spin_lock(&dlm_cookie_lock);
+	*cookie = (dlm_next_cookie | tmpnode);
+	if (++dlm_next_cookie & 0xff00000000000000ull) {
+		mlog(0, "This node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
+	spin_unlock(&dlm_cookie_lock);
+}
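
A cookie, then, is just an 8-bit node number packed above a 56-bit
per-node sequence counter, which is why the wrap check fires as soon as
the counter spills into the top byte. A worked example of the layout
(dlm_init_lock() byteswaps the result with cpu_to_be64() before it goes
on the wire):

/* node 3 issuing its first cookie:
 * dlm_get_next_cookie(3, &cookie) yields 0x0300000000000001 */
u64 cookie = ((u64)3 << 56) | 1;

u8  node = (u8)(cookie >> 56);		/* == 3 */
u64 seq  = cookie & ~(0xffull << 56);	/* == 1 */
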
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
+			struct dlm_lockstatus *lksb, int flags,
+			const char *name, dlm_astlockfunc_t *ast, void *data,
+			dlm_bastlockfunc_t *bast)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	int convert = 0, recovery = 0;
+
+	/* yes this function is a mess.
+	 * TODO: clean this up.  lots of common code in the
+	 *       lock and convert paths, especially in the retry blocks */
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
+		dlm_error(status);
+		goto error;
+	}
+
+	if (flags & ~LKM_VALID_FLAGS) {
+		dlm_error(status);
+		goto error;
+	}
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	if (recovery &&
+	    (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) {
+		dlm_error(status);
+		goto error;
+	}
+	if (convert && (flags & LKM_LOCAL)) {
+		mlog(ML_ERROR, "strange LOCAL convert request!\n");
+		goto error;
+	}
+
+	if (convert) {
+		/* CONVERT request */
+
+		/* if converting, must pass in a valid dlm_lock */
+		lock = lksb->lockid;
+		if (!lock) {
+			mlog(ML_ERROR, "NULL lock pointer in convert "
+			     "request\n");
+			goto error;
+		}
+
+		res = lock->lockres;
+		if (!res) {
+			mlog(ML_ERROR, "NULL lockres pointer in convert "
+			     "request\n");
+			goto error;
+		}
+		dlm_lockres_get(res);
+
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
+		 * static after the original lock call.  convert requests will
+		 * ensure that everything is the same, or return DLM_BADARGS.
+		 * this means that DLM_DENIED_NOASTS will never be returned.
+		 */
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			mlog(ML_ERROR, "new args:  lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lksb, ast, bast, data);
+			mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lock->lksb, lock->ast,
+			     lock->bast, lock->astdata);
+			goto error;
+		}
+retry_convert:
+		dlm_wait_for_recovery(dlm);
+
+		if (res->owner == dlm->node_num)
+			status = dlmconvert_master(dlm, res, lock, flags, mode);
+		else
+			status = dlmconvert_remote(dlm, res, lock, flags, mode);
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			/* for now, see how this works without sleeping
+			 * and just retry right away.  I suspect the reco
+			 * or migration will complete fast enough that
+			 * no waiting will be necessary */
+			mlog(0, "retrying convert with migration/recovery/"
+			     "in-progress\n");
+			msleep(100);
+			goto retry_convert;
+		}
+	} else {
+		u64 tmpcookie;
+
+		/* LOCK request */
+		status = DLM_BADARGS;
+		if (!name) {
+			dlm_error(status);
+			goto error;
+		}
+
+		status = DLM_IVBUFLEN;
+		if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
+			dlm_error(status);
+			goto error;
+		}
+
+		dlm_get_next_cookie(dlm->node_num, &tmpcookie);
+		lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
+		if (!lock) {
+			dlm_error(status);
+			goto error;
+		}
+
+		if (!recovery)
+			dlm_wait_for_recovery(dlm);
+
+		/* find or create the lock resource */
+		res = dlm_get_lock_resource(dlm, name, flags);
+		if (!res) {
+			status = DLM_IVLOCKID;
+			dlm_error(status);
+			goto error;
+		}
+
+		mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
+		mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
+
+		dlm_lock_attach_lockres(lock, res);
+		lock->ast = ast;
+		lock->bast = bast;
+		lock->astdata = data;
+
+retry_lock:
+		if (flags & LKM_VALBLK) {
+			mlog(0, "LKM_VALBLK passed by caller\n");
+
+			/* LVB requests for non PR, PW or EX locks are
+			 * ignored. */
+			if (mode < LKM_PRMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+
+		if (res->owner == dlm->node_num)
+			status = dlmlock_master(dlm, res, lock, flags);
+		else
+			status = dlmlock_remote(dlm, res, lock, flags);
+
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			mlog(0, "retrying lock with migration/"
+			     "recovery/in progress\n");
+			msleep(100);
+			dlm_wait_for_recovery(dlm);
+			goto retry_lock;
+		}
+
+		if (status != DLM_NORMAL) {
+			lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+			if (status != DLM_NOTQUEUED)
+				dlm_error(status);
+			goto error;
+		}
+	}
+
+error:
+	if (status != DLM_NORMAL) {
+		if (lock && !convert)
+			dlm_lock_put(lock);
+		// this is kind of unnecessary
+		lksb->status = status;
+	}
+
+	/* put lockres ref from the convert path
+	 * or from dlm_get_lock_resource */
+	if (res)
+		dlm_lockres_put(res);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmlock);
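
For reference, a minimal in-kernel caller of the export above. The ast and
bast signatures follow how this patch invokes them (one void * of astdata;
the bast additionally receives the blocked mode), but the exact typedefs
live in dlmapi.h rather than here, so treat them as an assumption:

static void my_ast(void *astdata)
{
	/* the request whose lksb is in *astdata has been granted;
	 * lksb->status carries the dlm_status */
}

static void my_bast(void *astdata, int blocked_type)
{
	/* another node wants a conflicting (blocked_type) lock;
	 * downconvert or unlock when convenient */
}

static enum dlm_status my_take_lock(struct dlm_ctxt *dlm,
				    struct dlm_lockstatus *lksb)
{
	memset(lksb, 0, sizeof(*lksb));

	/* the name must be 1..DLM_LOCKID_NAME_MAX bytes; adding
	 * LKM_NOQUEUE would make this a trylock that can return
	 * DLM_NOTQUEUED instead of blocking */
	return dlmlock(dlm, LKM_EXMODE, lksb, 0, "my_lock_name",
		       my_ast, lksb, my_bast);
}
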

+ 2664 - 0
fs/ocfs2/dlm/dlmmaster.c

@@ -0,0 +1,2664 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdebug.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
+#include "cluster/masklog.h"
+
+enum dlm_mle_type {
+	DLM_MLE_BLOCK,
+	DLM_MLE_MASTER,
+	DLM_MLE_MIGRATION
+};
+
+struct dlm_lock_name
+{
+	u8 len;
+	u8 name[DLM_LOCKID_NAME_MAX];
+};
+
+struct dlm_master_list_entry
+{
+	struct list_head list;
+	struct list_head hb_events;
+	struct dlm_ctxt *dlm;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	atomic_t woken;
+	struct kref mle_refs;
+	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	u8 master;
+	u8 new_master;
+	enum dlm_mle_type type;
+	struct o2hb_callback_func mle_hb_up;
+	struct o2hb_callback_func mle_hb_down;
+	union {
+		struct dlm_lock_resource *res;
+		struct dlm_lock_name name;
+	} u;
+};
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node,
+			      int idx);
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node,
+			    int idx);
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
+static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+				unsigned int namelen, void *nodemap,
+				u32 flags);
+
+static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle,
+				const char *name,
+				unsigned int namelen)
+{
+	struct dlm_lock_resource *res;
+
+	if (dlm != mle->dlm)
+		return 0;
+
+	if (mle->type == DLM_MLE_BLOCK ||
+	    mle->type == DLM_MLE_MIGRATION) {
+		if (namelen != mle->u.name.len ||
+		    memcmp(name, mle->u.name.name, namelen) != 0)
+			return 0;
+	} else {
+		res = mle->u.res;
+		if (namelen != res->lockname.len ||
+		    memcmp(res->lockname.name, name, namelen) != 0)
+			return 0;
+	}
+	return 1;
+}
+
+#if 0
+/* Code here is included but defined out as it aids debugging */
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+	int i = 0, refs;
+	char *type;
+	char attached;
+	u8 master;
+	unsigned int namelen;
+	const char *name;
+	struct kref *k;
+
+	k = &mle->mle_refs;
+	if (mle->type == DLM_MLE_BLOCK)
+		type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		type = "MAS";
+	else
+		type = "MIG";
+	refs = atomic_read(&k->refcount);
+	master = mle->master;
+	attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
+
+	if (mle->type != DLM_MLE_MASTER) {
+		namelen = mle->u.name.len;
+		name = mle->u.name.name;
+	} else {
+		namelen = mle->u.res->lockname.len;
+		name = mle->u.res->lockname.name;
+	}
+
+	mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u   %3u %c    (%d)%.*s\n",
+		  i, type, refs, master, mle->new_master, attached,
+		  namelen, namelen, name);
+}
+
+static void dlm_dump_mles(struct dlm_ctxt *dlm)
+{
+	struct dlm_master_list_entry *mle;
+	struct list_head *iter;
+	
+	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
+	mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
+	spin_lock(&dlm->master_lock);
+	list_for_each(iter, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+		dlm_print_one_mle(mle);
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+int dlm_dump_all_mles(const char __user *data, unsigned int len)
+{
+	struct list_head *iter;
+	struct dlm_ctxt *dlm;
+
+	spin_lock(&dlm_domain_lock);
+	list_for_each(iter, &dlm_domains) {
+		dlm = list_entry (iter, struct dlm_ctxt, list);
+		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
+		dlm_dump_mles(dlm);
+	}
+	spin_unlock(&dlm_domain_lock);
+	return len;
+}
+EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
+
+#endif  /*  0  */
+
+
+static kmem_cache_t *dlm_mle_cache = NULL;
+
+
+static void dlm_mle_release(struct kref *kref);
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen);
+static void dlm_put_mle(struct dlm_master_list_entry *mle);
+static void __dlm_put_mle(struct dlm_master_list_entry *mle);
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen);
+
+static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
+
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked);
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked);
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master);
+
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res);
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res);
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target);
+
+
+int dlm_is_host_down(int errno)
+{
+	switch (errno) {
+		case -EBADF:
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ETIMEDOUT:
+		case -ECONNABORTED:
+		case -ENETDOWN:
+		case -ENETUNREACH:
+		case -ENETRESET:
+		case -ESHUTDOWN:
+		case -ENOPROTOOPT:
+		case -EINVAL:   /* if returned from our tcp code,
+				   this means there is no socket */
+			return 1;
+	}
+	return 0;
+}
+
+
+/*
+ * MASTER LIST FUNCTIONS
+ */
+
+
+/*
+ * regarding master list entries and heartbeat callbacks:
+ *
+ * in order to avoid sleeping and allocation that occurs in
+ * heartbeat, master list entries are simply attached to the
+ * dlm's established heartbeat callbacks.  the mle is attached
+ * when it is created, and since the dlm->spinlock is held at
+ * that time, any heartbeat event will be properly discovered
+ * by the mle.  the mle needs to be detached from the
+ * dlm->mle_hb_events list as soon as heartbeat events are no
+ * longer useful to the mle, and before the mle is freed.
+ *
+ * as a general rule, heartbeat events are no longer needed by
+ * the mle once an "answer" regarding the lock master has been
+ * received.
+ */
+static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
+}
+
+
+static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	if (!list_empty(&mle->hb_events))
+		list_del_init(&mle->hb_events);
+}
+
+
+static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					    struct dlm_master_list_entry *mle)
+{
+	spin_lock(&dlm->spinlock);
+	__dlm_mle_detach_hb_events(dlm, mle);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* remove from list and free */
+static void __dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+	BUG_ON(!atomic_read(&mle->mle_refs.refcount));
+
+	kref_put(&mle->mle_refs, dlm_mle_release);
+}
+
+
+/* must not have any spinlocks coming in */
+static void dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
+{
+	kref_get(&mle->mle_refs);
+}
+
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	mle->dlm = dlm;
+	mle->type = type;
+	INIT_LIST_HEAD(&mle->list);
+	INIT_LIST_HEAD(&mle->hb_events);
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	atomic_set(&mle->woken, 0);
+	kref_init(&mle->mle_refs);
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	mle->master = O2NM_MAX_NODES;
+	mle->new_master = O2NM_MAX_NODES;
+
+	if (mle->type == DLM_MLE_MASTER) {
+		BUG_ON(!res);
+		mle->u.res = res;
+	} else if (mle->type == DLM_MLE_BLOCK) {
+		BUG_ON(!name);
+		memcpy(mle->u.name.name, name, namelen);
+		mle->u.name.len = namelen;
+	} else /* DLM_MLE_MIGRATION */ {
+		BUG_ON(!name);
+		memcpy(mle->u.name.name, name, namelen);
+		mle->u.name.len = namelen;
+	}
+
+	/* copy off the node_map and register hb callbacks on our copy */
+	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+	clear_bit(dlm->node_num, mle->vote_map);
+	clear_bit(dlm->node_num, mle->node_map);
+
+	/* attach the mle to the domain node up/down events */
+	__dlm_mle_attach_hb_events(dlm, mle);
+}
+
+
+/* returns 1 if found, 0 if not */
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen)
+{
+	struct dlm_master_list_entry *tmpmle;
+	struct list_head *iter;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	list_for_each(iter, &dlm->master_list) {
+		tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
+			continue;
+		dlm_get_mle(tmpmle);
+		*mle = tmpmle;
+		return 1;
+	}
+	return 0;
+}
+
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
+{
+	struct dlm_master_list_entry *mle;
+	struct list_head *iter;
+
+	assert_spin_locked(&dlm->spinlock);
+	
+	list_for_each(iter, &dlm->mle_hb_events) {
+		mle = list_entry(iter, struct dlm_master_list_entry, 
+				 hb_events);
+		if (node_up)
+			dlm_mle_node_up(dlm, mle, NULL, idx);
+		else
+			dlm_mle_node_down(dlm, mle, NULL, idx);
+	}
+}
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node, int idx)
+{
+	spin_lock(&mle->spinlock);
+
+	if (!test_bit(idx, mle->node_map))
+		mlog(0, "node %u already removed from nodemap!\n", idx);
+	else
+		clear_bit(idx, mle->node_map);
+
+	spin_unlock(&mle->spinlock);
+}
+
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node, int idx)
+{
+	spin_lock(&mle->spinlock);
+
+	if (test_bit(idx, mle->node_map))
+		mlog(0, "node %u already in node map!\n", idx);
+	else
+		set_bit(idx, mle->node_map);
+
+	spin_unlock(&mle->spinlock);
+}
+
+
+int dlm_init_mle_cache(void)
+{
+	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+					  sizeof(struct dlm_master_list_entry),
+					  0, SLAB_HWCACHE_ALIGN,
+					  NULL, NULL);
+	if (dlm_mle_cache == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void dlm_destroy_mle_cache(void)
+{
+	if (dlm_mle_cache)
+		kmem_cache_destroy(dlm_mle_cache);
+}
+
+static void dlm_mle_release(struct kref *kref)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
+	dlm = mle->dlm;
+
+	if (mle->type != DLM_MLE_MASTER) {
+		mlog(0, "calling mle_release for %.*s, type %d\n",
+		     mle->u.name.len, mle->u.name.name, mle->type);
+	} else {
+		mlog(0, "calling mle_release for %.*s, type %d\n",
+		     mle->u.res->lockname.len,
+		     mle->u.res->lockname.name, mle->type);
+	}
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* remove from list if not already */
+	if (!list_empty(&mle->list))
+		list_del_init(&mle->list);
+
+	/* detach the mle from the domain node up/down events */
+	__dlm_mle_detach_hb_events(dlm, mle);
+
+	/* NOTE: kfree under spinlock here.
+	 * if this is bad, we can move this to a freelist. */
+	kmem_cache_free(dlm_mle_cache, mle);
+}
+
+
+/*
+ * LOCK RESOURCE FUNCTIONS
+ */
+
+static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
+
+	if (owner == dlm->node_num)
+		atomic_inc(&dlm->local_resources);
+	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
+		atomic_inc(&dlm->unknown_resources);
+	else
+		atomic_inc(&dlm->remote_resources);
+
+	res->owner = owner;
+}
+
+void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res, u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner == res->owner)
+		return;
+
+	if (res->owner == dlm->node_num)
+		atomic_dec(&dlm->local_resources);
+	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
+		atomic_dec(&dlm->unknown_resources);
+	else
+		atomic_dec(&dlm->remote_resources);
+
+	dlm_set_lockres_owner(dlm, res, owner);
+}
+
+
+static void dlm_lockres_release(struct kref *kref)
+{
+	struct dlm_lock_resource *res;
+
+	res = container_of(kref, struct dlm_lock_resource, refs);
+
+	/* This should not happen -- all lockres' have a name
+	 * associated with them at init time. */
+	BUG_ON(!res->lockname.name);
+
+	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
+	     res->lockname.name);
+
+	/* By the time we're ready to blow this guy away, we shouldn't
+	 * be on any lists. */
+	BUG_ON(!list_empty(&res->list));
+	BUG_ON(!list_empty(&res->granted));
+	BUG_ON(!list_empty(&res->converting));
+	BUG_ON(!list_empty(&res->blocked));
+	BUG_ON(!list_empty(&res->dirty));
+	BUG_ON(!list_empty(&res->recovering));
+	BUG_ON(!list_empty(&res->purge));
+
+	kfree(res->lockname.name);
+
+	kfree(res);
+}
+
+void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+	kref_get(&res->refs);
+}
+
+void dlm_lockres_put(struct dlm_lock_resource *res)
+{
+	kref_put(&res->refs, dlm_lockres_release);
+}
+
+static void dlm_init_lockres(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res,
+			     const char *name, unsigned int namelen)
+{
+	char *qname;
+
+	/* If we memset here, we lose our reference to the kmalloc'd
+	 * res->lockname.name, so be sure to init every field
+	 * correctly! */
+
+	qname = (char *) res->lockname.name;
+	memcpy(qname, name, namelen);
+
+	res->lockname.len = namelen;
+	res->lockname.hash = full_name_hash(name, namelen);
+
+	init_waitqueue_head(&res->wq);
+	spin_lock_init(&res->spinlock);
+	INIT_LIST_HEAD(&res->list);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+	INIT_LIST_HEAD(&res->purge);
+	atomic_set(&res->asts_reserved, 0);
+	res->migration_pending = 0;
+
+	kref_init(&res->refs);
+
+	/* just for consistency */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+	spin_unlock(&res->spinlock);
+
+	res->state = DLM_LOCK_RES_IN_PROGRESS;
+
+	res->last_used = 0;
+
+	memset(res->lvb, 0, DLM_LVB_LEN);
+}
+
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+				   const char *name,
+				   unsigned int namelen)
+{
+	struct dlm_lock_resource *res;
+
+	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+	if (!res)
+		return NULL;
+
+	res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+	if (!res->lockname.name) {
+		kfree(res);
+		return NULL;
+	}
+
+	dlm_init_lockres(dlm, res, name, namelen);
+	return res;
+}
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ * lockid is null terminated
+ *
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ *
+ * also, do a lookup in the dlm->master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here.   need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+					  const char *lockid,
+					  int flags)
+{
+	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *alloc_mle = NULL;
+	int blocked = 0;
+	int ret, nodenum;
+	struct dlm_node_iter iter;
+	unsigned int namelen;
+	int tries = 0;
+
+	BUG_ON(!lockid);
+
+	namelen = strlen(lockid);
+
+	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
+
+lookup:
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "found in hash!\n");
+		if (res)
+			dlm_lockres_put(res);
+		res = tmpres;
+		goto leave;
+	}
+
+	if (!res) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "allocating a new resource\n");
+		/* nothing found and we need to allocate one. */
+		alloc_mle = (struct dlm_master_list_entry *)
+			kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+		if (!alloc_mle)
+			goto leave;
+		res = dlm_new_lockres(dlm, lockid, namelen);
+		if (!res)
+			goto leave;
+		goto lookup;
+	}
+
+	mlog(0, "no lockres found, allocated our own: %p\n", res);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE!  return right away */
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+		/* lockres still marked IN_PROGRESS */
+		goto wake_waiters;
+	}
+
+	/* check master list to see if another node has started mastering it */
+	spin_lock(&dlm->master_lock);
+
+	/* if we found a block, wait for lock to be mastered by another node */
+	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
+	if (blocked) {
+		if (mle->type == DLM_MLE_MASTER) {
+			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
+			BUG();
+		} else if (mle->type == DLM_MLE_MIGRATION) {
+			/* migration is in progress! */
+			/* the good news is that we now know the
+			 * "current" master (mle->master). */
+
+			spin_unlock(&dlm->master_lock);
+			assert_spin_locked(&dlm->spinlock);
+
+			/* set the lockres owner and hash it */
+			spin_lock(&res->spinlock);
+			dlm_set_lockres_owner(dlm, res, mle->master);
+			__dlm_insert_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+
+			/* master is known, detach */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			mle = NULL;
+			goto wake_waiters;
+		}
+	} else {
+		/* go ahead and try to master lock on this node */
+		mle = alloc_mle;
+		/* make sure this does not get freed below */
+		alloc_mle = NULL;
+		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
+		set_bit(dlm->node_num, mle->maybe_map);
+		list_add(&mle->list, &dlm->master_list);
+	}
+
+	/* at this point there is either a DLM_MLE_BLOCK or a
+	 * DLM_MLE_MASTER on the master list, so it's safe to add the
+	 * lockres to the hashtable.  anyone who finds the lock will
+	 * still have to wait on the IN_PROGRESS. */
+
+	/* finally add the lockres to its hash bucket */
+	__dlm_insert_lockres(dlm, res);
+	/* get an extra ref on the mle in case this is a BLOCK
+	 * if so, the creator of the BLOCK may try to put the last
+	 * ref at this time in the assert master handler, so we
+	 * need an extra one to keep from a bad ptr deref. */
+	dlm_get_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* must wait for lock to be mastered elsewhere */
+	if (blocked)
+		goto wait;
+
+redo_request:
+	ret = -EINVAL;
+	dlm_node_iter_init(mle->vote_map, &iter);
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = dlm_do_master_request(mle, nodenum);
+		if (ret < 0)
+			mlog_errno(ret);
+		if (mle->master != O2NM_MAX_NODES) {
+			/* found a master ! */
+			break;
+		}
+	}
+
+wait:
+	/* keep going until the response map includes all nodes */
+	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
+	if (ret < 0) {
+		mlog(0, "%s:%.*s: node map changed, redo the "
+		     "master request now, blocked=%d\n",
+		     dlm->name, res->lockname.len,
+		     res->lockname.name, blocked);
+		if (++tries > 20) {
+			mlog(ML_ERROR, "%s:%.*s: spinning on "
+			     "dlm_wait_for_lock_mastery, blocked=%d\n", 
+			     dlm->name, res->lockname.len, 
+			     res->lockname.name, blocked);
+			dlm_print_one_lock_resource(res);
+			/* dlm_print_one_mle(mle); */
+			tries = 0;
+		}
+		goto redo_request;
+	}
+
+	mlog(0, "lockres mastered by %u\n", res->owner);
+	/* make sure we never continue without this */
+	BUG_ON(res->owner == O2NM_MAX_NODES);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	/* put the extra ref */
+	dlm_put_mle(mle);
+
+wake_waiters:
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+leave:
+	/* need to free the unused mle */
+	if (alloc_mle)
+		kmem_cache_free(dlm_mle_cache, alloc_mle);
+
+	return res;
+}
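+
+/* Summary (sketch): the overall shape of dlm_get_lock_resource() above,
+ * in terms of its labels:
+ *
+ *	lookup:       find the lockres in the hash, else allocate + retry
+ *	redo_request: send a master request to every node in the vote map
+ *	wait:         sleep until a master asserts or the node map changes
+ *	wake_waiters: clear DLM_LOCK_RES_IN_PROGRESS and wake res->wq
+ *	leave:        free the preallocated mle if it went unused
+ */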
+
+
+#define DLM_MASTERY_TIMEOUT_MS   5000
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked)
+{
+	u8 m;
+	int ret, bit;
+	int map_changed, voting_done;
+	int assert, sleep;
+
+recheck:
+	ret = 0;
+	assert = 0;
+
+	/* check if another node has already become the owner */
+	spin_lock(&res->spinlock);
+	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	spin_unlock(&res->spinlock);
+
+	spin_lock(&mle->spinlock);
+	m = mle->master;
+	map_changed = (memcmp(mle->vote_map, mle->node_map,
+			      sizeof(mle->vote_map)) != 0);
+	voting_done = (memcmp(mle->vote_map, mle->response_map,
+			     sizeof(mle->vote_map)) == 0);
+
+	/* restart if we hit any errors */
+	if (map_changed) {
+		int b;
+		mlog(0, "%s: %.*s: node map changed, restarting\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
+		b = (mle->type == DLM_MLE_BLOCK);
+		if ((*blocked && !b) || (!*blocked && b)) {
+			mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     *blocked, b);
+			*blocked = b;
+		}
+		spin_unlock(&mle->spinlock);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto leave;
+		}
+		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
+		     "rechecking now\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		goto recheck;
+	}
+
+	if (m != O2NM_MAX_NODES) {
+		/* another node has done an assert!
+		 * all done! */
+		sleep = 0;
+	} else {
+		sleep = 1;
+		/* have all nodes responded? */
+		if (voting_done && !*blocked) {
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (dlm->node_num <= bit) {
+				/* my node number is lowest.
+				 * now tell other nodes that I am
+				 * mastering this. */
+				mle->master = dlm->node_num;
+				assert = 1;
+				sleep = 0;
+			}
+			/* if voting is done, but we have not received
+			 * an assert master yet, we must sleep */
+		}
+	}
+
+	spin_unlock(&mle->spinlock);
+
+	/* sleep if we haven't finished voting yet */
+	if (sleep) {
+		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
+
+		/*
+		if (atomic_read(&mle->mle_refs.refcount) < 2)
+			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
+			atomic_read(&mle->mle_refs.refcount),
+			res->lockname.len, res->lockname.name);
+		*/
+		atomic_set(&mle->woken, 0);
+		(void)wait_event_timeout(mle->wq,
+					 (atomic_read(&mle->woken) == 1),
+					 timeo);
+		if (res->owner == O2NM_MAX_NODES) {
+			mlog(0, "waiting again\n");
+			goto recheck;
+		}
+		mlog(0, "done waiting, master is %u\n", res->owner);
+		ret = 0;
+		goto leave;
+	}
+
+	ret = 0;   /* done */
+	if (assert) {
+		m = dlm->node_num;
+		mlog(0, "about to master %.*s here, this=%u\n",
+		     res->lockname.len, res->lockname.name, m);
+		ret = dlm_do_assert_master(dlm, res->lockname.name,
+					   res->lockname.len, mle->vote_map, 0);
+		if (ret) {
+			/* This is a failure in the network path,
+			 * not in the response to the assert_master
+			 * (any nonzero response is a BUG on this node).
+			 * Most likely a socket just got disconnected
+			 * due to node death. */
+			mlog_errno(ret);
+		}
+		/* no longer need to restart lock mastery.
+		 * all living nodes have been contacted. */
+		ret = 0;
+	}
+
+	/* set the lockres owner */
+	spin_lock(&res->spinlock);
+	dlm_change_lockres_owner(dlm, res, m);
+	spin_unlock(&res->spinlock);
+
+leave:
+	return ret;
+}
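+
+/* Sketch of the election rule used above: once every node in vote_map
+ * has responded and none has asserted, the lowest node number present
+ * in maybe_map may assert mastery.  A minimal model, as a hypothetical
+ * helper mirroring the check in dlm_wait_for_lock_mastery():
+ *
+ *	static int dlm_would_assert(struct dlm_master_list_entry *mle,
+ *				    u8 this_node)
+ *	{
+ *		int low = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ *		return this_node <= low;
+ *	}
+ *
+ * If maybe_map is empty, find_next_bit returns O2NM_MAX_NODES and the
+ * local node wins by default. */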
+
+struct dlm_bitmap_diff_iter
+{
+	int curnode;
+	unsigned long *orig_bm;
+	unsigned long *cur_bm;
+	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+enum dlm_node_state_change
+{
+	NODE_DOWN = -1,
+	NODE_NO_CHANGE = 0,
+	NODE_UP
+};
+
+static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
+				      unsigned long *orig_bm,
+				      unsigned long *cur_bm)
+{
+	unsigned long p1, p2;
+	int i;
+
+	iter->curnode = -1;
+	iter->orig_bm = orig_bm;
+	iter->cur_bm = cur_bm;
+
+	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
+		p1 = *(iter->orig_bm + i);
+		p2 = *(iter->cur_bm + i);
+		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
+	}
+}
+
+static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
+				     enum dlm_node_state_change *state)
+{
+	int bit;
+
+	if (iter->curnode >= O2NM_MAX_NODES)
+		return -ENOENT;
+
+	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
+			    iter->curnode+1);
+	if (bit >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+
+	/* if it was there in the original then this node died */
+	if (test_bit(bit, iter->orig_bm))
+		*state = NODE_DOWN;
+	else
+		*state = NODE_UP;
+
+	iter->curnode = bit;
+	return bit;
+}
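+
+/* Sketch of typical use of the diff iterator, as in
+ * dlm_restart_lock_mastery() below (old_map/new_map are placeholders
+ * for mle->vote_map and mle->node_map):
+ *
+ *	struct dlm_bitmap_diff_iter bdi;
+ *	enum dlm_node_state_change sc;
+ *	int node;
+ *
+ *	dlm_bitmap_diff_iter_init(&bdi, old_map, new_map);
+ *	while ((node = dlm_bitmap_diff_iter_next(&bdi, &sc)) >= 0) {
+ *		if (sc == NODE_DOWN)
+ *			handle_death(node);	(set in old_map only)
+ *		else
+ *			handle_join(node);	(set in new_map only)
+ *	}
+ *
+ * handle_death/handle_join are hypothetical stand-ins for the NODE_DOWN
+ * and NODE_UP branches. */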
+
+
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked)
+{
+	struct dlm_bitmap_diff_iter bdi;
+	enum dlm_node_state_change sc;
+	int node;
+	int ret = 0;
+
+	mlog(0, "something happened such that the "
+	     "master process may need to be restarted!\n");
+
+	assert_spin_locked(&mle->spinlock);
+
+	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
+	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	while (node >= 0) {
+		if (sc == NODE_UP) {
+			/* a node came up.  easy.  might not even need
+			 * to talk to it if its node number is higher
+			 * or if we are already blocked. */
+			mlog(0, "node up! %d\n", node);
+			if (blocked)
+				goto next;
+
+			if (node > dlm->node_num) {
+				mlog(0, "node > this node. skipping.\n");
+				goto next;
+			}
+
+			/* redo the master request, but only for the new node */
+			mlog(0, "sending request to new node\n");
+			clear_bit(node, mle->response_map);
+			set_bit(node, mle->vote_map);
+		} else {
+			mlog(ML_ERROR, "node down! %d\n", node);
+
+			/* if the node wasn't involved in mastery skip it,
+			 * but clear it out from the maps so that it will
+			 * not affect mastery of this lockres */
+			clear_bit(node, mle->response_map);
+			clear_bit(node, mle->vote_map);
+			if (!test_bit(node, mle->maybe_map))
+				goto next;
+
+			/* if we're already blocked on lock mastery, and the
+			 * dead node wasn't the expected master, or there is
+			 * another node in the maybe_map, keep waiting */
+			if (blocked) {
+				int lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES, 0);
+
+				/* act like it was never there */
+				clear_bit(node, mle->maybe_map);
+
+				if (node != lowest)
+					goto next;
+
+				mlog(ML_ERROR, "expected master %u died while "
+				     "this node was blocked waiting on it!\n",
+				     node);
+				lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES,
+						       lowest+1);
+				if (lowest < O2NM_MAX_NODES) {
+					mlog(0, "still blocked. waiting "
+					     "on %u now\n", lowest);
+					goto next;
+				}
+
+				/* mle is an MLE_BLOCK, but there is now
+				 * nothing left to block on.  we need to return
+				 * all the way back out and try again with
+				 * an MLE_MASTER. dlm_do_local_recovery_cleanup
+				 * has already run, so the mle refcount is ok */
+				mlog(0, "no longer blocking. we can "
+				     "try to master this here\n");
+				mle->type = DLM_MLE_MASTER;
+				memset(mle->maybe_map, 0,
+				       sizeof(mle->maybe_map));
+				memset(mle->response_map, 0,
+				       sizeof(mle->response_map));
+				memcpy(mle->vote_map, mle->node_map,
+				       sizeof(mle->node_map));
+				mle->u.res = res;
+				set_bit(dlm->node_num, mle->maybe_map);
+
+				ret = -EAGAIN;
+				goto next;
+			}
+
+			clear_bit(node, mle->maybe_map);
+			if (node > dlm->node_num)
+				goto next;
+
+			mlog(0, "dead node in map!\n");
+			/* yuck. go back and re-contact all nodes
+			 * in the vote_map, removing this node. */
+			memset(mle->response_map, 0,
+			       sizeof(mle->response_map));
+		}
+		ret = -EAGAIN;
+next:
+		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	}
+	return ret;
+}
+
+
+/*
+ * DLM_MASTER_REQUEST_MSG
+ *
+ * returns: 0 on success,
+ *          -errno on a network error
+ *
+ * on error, the caller should assume the target node is "dead"
+ *
+ */
+
+static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+	struct dlm_master_request request;
+	int ret, response = 0, resend;
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->node_num;
+
+	BUG_ON(mle->type == DLM_MLE_MIGRATION);
+
+	if (mle->type != DLM_MLE_MASTER) {
+		request.namelen = mle->u.name.len;
+		memcpy(request.name, mle->u.name.name, request.namelen);
+	} else {
+		request.namelen = mle->u.res->lockname.len;
+		memcpy(request.name, mle->u.res->lockname.name,
+			request.namelen);
+	}
+
+again:
+	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
+				 sizeof(request), to, &response);
+	if (ret < 0)  {
+		if (ret == -ESRCH) {
+			/* should never happen */
+			mlog(ML_ERROR, "TCP stack not ready!\n");
+			BUG();
+		} else if (ret == -EINVAL) {
+			mlog(ML_ERROR, "bad args passed to o2net!\n");
+			BUG();
+		} else if (ret == -ENOMEM) {
+			mlog(ML_ERROR, "out of memory while trying to send "
+			     "network message!  retrying\n");
+			/* this is totally crude */
+			msleep(50);
+			goto again;
+		} else if (!dlm_is_host_down(ret)) {
+			/* not a network error. bad. */
+			mlog_errno(ret);
+			mlog(ML_ERROR, "unhandled error!\n");
+			BUG();
+		}
+		/* all other errors should be network errors,
+		 * and likely indicate node death */
+		mlog(ML_ERROR, "link to %d went down!\n", to);
+		goto out;
+	}
+
+	ret = 0;
+	resend = 0;
+	spin_lock(&mle->spinlock);
+	switch (response) {
+	case DLM_MASTER_RESP_YES:
+		set_bit(to, mle->response_map);
+		mlog(0, "node %u is the master, response=YES\n", to);
+		mle->master = to;
+		break;
+	case DLM_MASTER_RESP_NO:
+		mlog(0, "node %u not master, response=NO\n", to);
+		set_bit(to, mle->response_map);
+		break;
+	case DLM_MASTER_RESP_MAYBE:
+		mlog(0, "node %u not master, response=MAYBE\n", to);
+		set_bit(to, mle->response_map);
+		set_bit(to, mle->maybe_map);
+		break;
+	case DLM_MASTER_RESP_ERROR:
+		mlog(0, "node %u hit an error, resending\n", to);
+		resend = 1;
+		response = 0;
+		break;
+	default:
+		mlog(ML_ERROR, "bad response! %u\n", response);
+		BUG();
+	}
+	spin_unlock(&mle->spinlock);
+	if (resend) {
+		/* this is also totally crude */
+		msleep(50);
+		goto again;
+	}
+
+out:
+	return ret;
+}
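+
+/* Sketch of the caller-side contract stated above: any negative return
+ * means the target should be treated as dead, so callers log it and
+ * move on, exactly as the redo_request loop in dlm_get_lock_resource()
+ * does:
+ *
+ *	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+ *		ret = dlm_do_master_request(mle, nodenum);
+ *		if (ret < 0)
+ *			mlog_errno(ret);	(node presumed dead; continue)
+ *		if (mle->master != O2NM_MAX_NODES)
+ *			break;			(someone already asserted)
+ *	}
+ */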
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm->master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res;
+	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	char *name;
+	unsigned int namelen;
+	int found, ret;
+	int set_maybe;
+
+	if (!dlm_grab(dlm))
+		return DLM_MASTER_RESP_NO;
+
+	if (!dlm_domain_fully_joined(dlm)) {
+		response = DLM_MASTER_RESP_NO;
+		goto send_response;
+	}
+
+	name = request->name;
+	namelen = request->namelen;
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		response = DLM_IVBUFLEN;
+		goto send_response;
+	}
+
+way_up_top:
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			spin_unlock(&res->spinlock);
+			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
+			     "being recovered\n");
+			response = DLM_MASTER_RESP_ERROR;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		if (res->owner == dlm->node_num) {
+			u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
+			spin_unlock(&res->spinlock);
+			// mlog(0, "this node is the master\n");
+			response = DLM_MASTER_RESP_YES;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+
+			/* this node is the owner.
+			 * there is some extra work that needs to
+			 * happen now.  the requesting node has
+			 * caused all nodes up to this one to
+			 * create mles.  this node now needs to
+			 * go back and clean those up. */
+			mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
+			     dlm->node_num, res->lockname.len, res->lockname.name);
+			ret = dlm_dispatch_assert_master(dlm, res, 1,
+							 request->node_idx,
+							 flags);
+			if (ret < 0) {
+				mlog(ML_ERROR, "failed to dispatch assert "
+				     "master work\n");
+				response = DLM_MASTER_RESP_ERROR;
+			}
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			// mlog(0, "node %u is the master\n", res->owner);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		/* ok, there is no owner.  either this node is
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			mlog(ML_ERROR, "lock with no owner should be "
+			     "in-progress!\n");
+			BUG();
+		}
+
+		// mlog(0, "lockres is in progress...\n");
+		spin_lock(&dlm->master_lock);
+		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+		if (!found) {
+			mlog(ML_ERROR, "no mle found for this lock!\n");
+			BUG();
+		}
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK) {
+			// mlog(0, "this node is waiting for "
+			// "lockres to be mastered\n");
+			response = DLM_MASTER_RESP_NO;
+		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "node %u is master, but trying to migrate to "
+			     "node %u.\n", tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				mlog(ML_ERROR, "no owner on lockres, but this "
+				     "node is trying to migrate it to %u?!\n",
+				     tmpmle->new_master);
+				BUG();
+			} else {
+				/* the real master can respond on its own */
+				response = DLM_MASTER_RESP_NO;
+			}
+		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			set_maybe = 0;
+			if (tmpmle->master == dlm->node_num)
+				response = DLM_MASTER_RESP_YES;
+			else
+				response = DLM_MASTER_RESP_NO;
+		} else {
+			// mlog(0, "this node is attempting to "
+			// "master lockres\n");
+			response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+
+		spin_unlock(&dlm->master_lock);
+		spin_unlock(&res->spinlock);
+
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+		if (mle)
+			kmem_cache_free(dlm_mle_cache, mle);
+		goto send_response;
+	}
+
+	/*
+	 * lockres doesn't exist on this node
+	 * if there is an MLE_BLOCK, return NO
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO
+	 */
+	spin_lock(&dlm->master_lock);
+	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// mlog(0, "no mle found\n");
+		if (!mle) {
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
+
+			mle = (struct dlm_master_list_entry *)
+				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+			if (!mle) {
+				// bad bad bad... this sucks.
+				response = DLM_MASTER_RESP_ERROR;
+				goto send_response;
+			}
+			spin_lock(&dlm->spinlock);
+			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
+					 name, namelen);
+			spin_unlock(&dlm->spinlock);
+			goto way_up_top;
+		}
+
+		// mlog(0, "this is second time thru, already allocated, "
+		// "add the block.\n");
+		set_bit(request->node_idx, mle->maybe_map);
+		list_add(&mle->list, &dlm->master_list);
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// mlog(0, "mle was found\n");
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "migration mle was found (%u->%u)\n",
+			     tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				mlog(ML_ERROR, "no lockres, but migration mle "
+				     "says that this node is master!\n");
+				BUG();
+			}
+			/* real master can respond on its own */
+			response = DLM_MASTER_RESP_NO;
+		} else {
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				set_maybe = 0;
+			} else
+				response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (found) {
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+	}
+send_response:
+	dlm_put(dlm);
+	return response;
+}
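+
+/* Sketch: the "lockres doesn't exist" half of the handler above reduces
+ * to the table in its comment.  As a hypothetical pure function (the
+ * real code also sets maybe_map bits and manages mle refcounts):
+ *
+ *	static u8 dlm_no_lockres_response(struct dlm_master_list_entry *mle,
+ *					  u8 this_node)
+ *	{
+ *		if (!mle)
+ *			return DLM_MASTER_RESP_NO;	(a BLOCK mle is added)
+ *		if (mle->type == DLM_MLE_BLOCK ||
+ *		    mle->type == DLM_MLE_MIGRATION)
+ *			return DLM_MASTER_RESP_NO;
+ *		return (mle->master == this_node) ? DLM_MASTER_RESP_YES
+ *						  : DLM_MASTER_RESP_MAYBE;
+ *	}
+ */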
+
+/*
+ * DLM_ASSERT_MASTER_MSG
+ */
+
+
+/*
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+				unsigned int namelen, void *nodemap,
+				u32 flags)
+{
+	struct dlm_assert_master assert;
+	int to, tmpret;
+	struct dlm_node_iter iter;
+	int ret = 0;
+
+	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+	/* note that if this nodemap is empty, it returns 0 */
+	dlm_node_iter_init(nodemap, &iter);
+	while ((to = dlm_node_iter_next(&iter)) >= 0) {
+		int r = 0;
+		mlog(0, "sending assert master to %d (%.*s)\n", to,
+		     namelen, lockname);
+		memset(&assert, 0, sizeof(assert));
+		assert.node_idx = dlm->node_num;
+		assert.namelen = namelen;
+		memcpy(assert.name, lockname, namelen);
+		assert.flags = cpu_to_be32(flags);
+
+		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
+					    &assert, sizeof(assert), to, &r);
+		if (tmpret < 0) {
+			mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+			if (!dlm_is_host_down(tmpret)) {
+				mlog(ML_ERROR, "unhandled error!\n");
+				BUG();
+			}
+			/* a node died.  finish out the rest of the nodes. */
+			mlog(ML_ERROR, "link to %d went down!\n", to);
+			/* any nonzero status return will do */
+			ret = tmpret;
+		} else if (r < 0) {
+			/* ok, something is horribly messed up.  kill thyself. */
+			mlog(ML_ERROR, "during assert master of %.*s to %u, "
+			     "got %d.\n", namelen, lockname, to, r);
+			dlm_dump_lock_resources(dlm);
+			BUG();
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm->master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	char *name;
+	unsigned int namelen;
+	u32 flags;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	name = assert->name;
+	namelen = assert->namelen;
+	flags = be32_to_cpu(assert->flags);
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length!\n");
+		goto done;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+	if (flags)
+		mlog(0, "assert_master with flags: %u\n", flags);
+
+	/* find the MLE */
+	spin_lock(&dlm->master_lock);
+	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
+		/* not an error, could be master just re-asserting */
+		mlog(0, "just got an assert_master from %u, but no "
+		     "MLE for it! (%.*s)\n", assert->node_idx,
+		     namelen, name);
+	} else {
+		int bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES) {
+			/* not necessarily an error, though less likely.
+			 * could be master just re-asserting. */
+			mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
+			     "is asserting! (%.*s)\n", assert->node_idx,
+			     namelen, name);
+		} else if (bit != assert->node_idx) {
+			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+				mlog(0, "master %u was found, %u should "
+				     "back off\n", assert->node_idx, bit);
+			} else {
+				/* with the fix for bug 569, a higher node
+				 * number winning the mastery will respond
+				 * YES to mastery requests, but this node
+				 * had no way of knowing.  let it pass. */
+				mlog(ML_ERROR, "%u is the lowest node, "
+				     "%u is asserting. (%.*s)  %u must "
+				     "have begun after %u won.\n", bit,
+				     assert->node_idx, namelen, name, bit,
+				     assert->node_idx);
+			}
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING)  {
+			mlog(ML_ERROR, "%u asserting but %.*s is "
+			     "RECOVERING!\n", assert->node_idx, namelen, name);
+			goto kill;
+		}
+		if (!mle) {
+			if (res->owner != assert->node_idx) {
+				mlog(ML_ERROR, "assert_master from "
+				     "%u, but current owner is "
+				     "%u! (%.*s)\n",
+				     assert->node_idx, res->owner,
+				     namelen, name);
+				goto kill;
+			}
+		} else if (mle->type != DLM_MLE_MIGRATION) {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				/* owner is just re-asserting */
+				if (res->owner == assert->node_idx) {
+					mlog(0, "owner %u re-asserting on "
+					     "lock %.*s\n", assert->node_idx,
+					     namelen, name);
+					goto ok;
+				}
+				mlog(ML_ERROR, "got assert_master from "
+				     "node %u, but %u is the owner! "
+				     "(%.*s)\n", assert->node_idx,
+				     res->owner, namelen, name);
+				goto kill;
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				mlog(ML_ERROR, "got assert from %u, but lock "
+				     "with no owner should be "
+				     "in-progress! (%.*s)\n",
+				     assert->node_idx,
+				     namelen, name);
+				goto kill;
+			}
+		} else /* mle->type == DLM_MLE_MIGRATION */ {
+			/* should only be getting an assert from new master */
+			if (assert->node_idx != mle->new_master) {
+				mlog(ML_ERROR, "got assert from %u, but "
+				     "new master is %u, and old master "
+				     "was %u (%.*s)\n",
+				     assert->node_idx, mle->new_master,
+				     mle->master, namelen, name);
+				goto kill;
+			}
+
+		}
+ok:
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	// mlog(0, "woo!  got an assert_master from node %u!\n",
+	// 	     assert->node_idx);
+	if (mle) {
+		int extra_ref;
+
+		spin_lock(&mle->spinlock);
+		extra_ref = !!(mle->type == DLM_MLE_BLOCK
+			       || mle->type == DLM_MLE_MIGRATION);
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+
+		if (mle->type == DLM_MLE_MIGRATION && res) {
+			mlog(0, "finishing off migration of lockres %.*s, "
+			     "from %u to %u\n",
+			       res->lockname.len, res->lockname.name,
+			       dlm->node_num, mle->new_master);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_MIGRATING;
+			dlm_change_lockres_owner(dlm, res, mle->new_master);
+			BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+			spin_unlock(&res->spinlock);
+		}
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+
+		if (extra_ref) {
+			/* the assert master message now balances the extra
+			 * ref given by the master / migration request message.
+			 * if this is the last put, it will be removed
+			 * from the list. */
+			dlm_put_mle(mle);
+		}
+	}
+
+done:
+	if (res)
+		dlm_lockres_put(res);
+	dlm_put(dlm);
+	return 0;
+
+kill:
+	/* kill the caller! */
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+	dlm_lockres_put(res);
+	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+	     "and killing the other node now!  This node is OK and can continue.\n");
+	dlm_dump_lock_resources(dlm);
+	dlm_put(dlm);
+	return -EINVAL;
+}
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher, u8 request_from, u32 flags)
+{
+	struct dlm_work_item *item;
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	/* queue up work for dlm_assert_master_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
+	item->u.am.lockres = res; /* already have a ref */
+	/* can optionally ignore node numbers higher than this node */
+	item->u.am.ignore_higher = ignore_higher;
+	item->u.am.request_from = request_from;
+	item->u.am.flags = flags;
+
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+
+	schedule_work(&dlm->dispatched_work);
+	return 0;
+}
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int ignore_higher;
+	int bit;
+	u8 request_from;
+	u32 flags;
+
+	dlm = item->dlm;
+	res = item->u.am.lockres;
+	ignore_higher = item->u.am.ignore_higher;
+	request_from = item->u.am.request_from;
+	flags = item->u.am.flags;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dlm->node_num, nodemap);
+	if (ignore_higher) {
+		/* if this is just to clear up mles for nodes below
+		 * this node, do not send the message to the original
+		 * caller or any node number higher than this */
+		clear_bit(request_from, nodemap);
+		bit = dlm->node_num;
+		while (1) {
+			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
+					    bit+1);
+			if (bit >= O2NM_MAX_NODES)
+				break;
+			clear_bit(bit, nodemap);
+		}
+	}
+
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	mlog(0, "worker about to master %.*s here, this=%u\n",
+	     res->lockname.len, res->lockname.name, dlm->node_num);
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len,
+				   nodemap, flags);
+	if (ret < 0) {
+		/* no need to restart, we are done */
+		mlog_errno(ret);
+	}
+
+	dlm_lockres_put(res);
+
+	mlog(0, "finished with dlm_assert_master_worker\n");
+}
+
+
+/*
+ * DLM_MIGRATE_LOCKRES
+ */
+
+
+int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			u8 target)
+{
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *oldmle = NULL;
+	struct dlm_migratable_lockres *mres = NULL;
+	int ret = -EINVAL;
+	const char *name;
+	unsigned int namelen;
+	int mle_added = 0;
+	struct list_head *queue, *iter;
+	int i;
+	struct dlm_lock *lock;
+	int empty = 1;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = res->lockname.name;
+	namelen = res->lockname.len;
+
+	mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+
+	/*
+	 * ensure this lockres is a proper candidate for migration
+	 */
+	spin_lock(&res->spinlock);
+	if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "cannot migrate lockres with unknown owner!\n");
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	if (res->owner != dlm->node_num) {
+		mlog(0, "cannot migrate lockres this node doesn't own!\n");
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	mlog(0, "checking queues...\n");
+	queue = &res->granted;
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			empty = 0;
+			if (lock->ml.node == dlm->node_num) {
+				mlog(0, "found a lock owned by this node "
+				     "still on the %s queue!  will not "
+				     "migrate this lockres\n",
+				     i==0 ? "granted" :
+				     (i==1 ? "converting" : "blocked"));
+				spin_unlock(&res->spinlock);
+				ret = -ENOTEMPTY;
+				goto leave;
+			}
+		}
+		queue++;
+	}
+	mlog(0, "all locks on this lockres are nonlocal.  continuing\n");
+	spin_unlock(&res->spinlock);
+
+	/* no work to do */
+	if (empty) {
+		mlog(0, "no locks were found on this lockres! done!\n");
+		ret = 0;
+		goto leave;
+	}
+
+	/*
+	 * preallocate up front
+	 * if this fails, abort
+	 */
+
+	ret = -ENOMEM;
+	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+	if (!mres) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
+								GFP_KERNEL);
+	if (!mle) {
+		mlog_errno(ret);
+		goto leave;
+	}
+	ret = 0;
+
+	/*
+	 * find a node to migrate the lockres to
+	 */
+
+	mlog(0, "picking a migration node\n");
+	spin_lock(&dlm->spinlock);
+	/* pick a new node */
+	if (target >= O2NM_MAX_NODES ||
+	    !test_bit(target, dlm->domain_map)) {
+		target = dlm_pick_migration_target(dlm, res);
+	}
+	mlog(0, "node %u chosen for migration\n", target);
+
+	if (target >= O2NM_MAX_NODES ||
+	    !test_bit(target, dlm->domain_map)) {
+		/* target chosen is not alive */
+		ret = -EINVAL;
+	}
+
+	if (ret) {
+		spin_unlock(&dlm->spinlock);
+		goto fail;
+	}
+
+	mlog(0, "continuing with target = %u\n", target);
+
+	/*
+	 * clear any existing master requests and
+	 * add the migration mle to the list
+	 */
+	spin_lock(&dlm->master_lock);
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+				    namelen, target, dlm->node_num);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (ret == -EEXIST) {
+		mlog(0, "another process is already migrating it\n");
+		goto fail;
+	}
+	mle_added = 1;
+
+	/*
+	 * set the MIGRATING flag and flush asts
+	 * if we fail after this we need to re-dirty the lockres
+	 */
+	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
+		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
+		     "the target went down.\n", res->lockname.len,
+		     res->lockname.name, target);
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+		ret = -EINVAL;
+	}
+
+fail:
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (ret < 0) {
+		if (mle_added) {
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+		} else if (mle) {
+			kmem_cache_free(dlm_mle_cache, mle);
+		}
+		goto leave;
+	}
+
+	/*
+	 * at this point, we have a migration target, an mle
+	 * in the master list, and the MIGRATING flag set on
+	 * the lockres
+	 */
+
+
+	/* get an extra reference on the mle.
+	 * otherwise the assert_master from the new
+	 * master will destroy this.
+	 * also, make sure that all callers of dlm_get_mle
+	 * take both dlm->spinlock and dlm->master_lock */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	dlm_get_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* notify new node and send all lock state */
+	/* call send_one_lockres with migration flag.
+	 * this serves as notice to the target node that a
+	 * migration is starting. */
+	ret = dlm_send_one_lockres(dlm, res, mres, target,
+				   DLM_MRES_MIGRATION);
+
+	if (ret < 0) {
+		mlog(0, "migration to node %u failed with %d\n",
+		     target, ret);
+		/* migration failed, detach and clean up mle */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+		dlm_put_mle(mle);	/* second put drops the extra ref taken above */
+		goto leave;
+	}
+
+	/* at this point, the target sends a message to all nodes,
+	 * (using dlm_do_migrate_request).  this node is skipped since
+	 * we had to put an mle in the list to begin the process.  this
+	 * node now waits for target to do an assert master.  this node
+	 * will be the last one notified, ensuring that the migration
+	 * is complete everywhere.  if the target dies while this is
+	 * going on, some nodes could potentially see the target as the
+	 * master, so it is important that my recovery finds the migration
+	 * mle and sets the master to UNKNOWN. */
+
+
+	/* wait for new node to assert master */
+	while (1) {
+		ret = wait_event_interruptible_timeout(mle->wq,
+					(atomic_read(&mle->woken) == 1),
+					msecs_to_jiffies(5000));
+
+		if (ret >= 0) {
+			if (atomic_read(&mle->woken) == 1 ||
+			    res->owner == target)
+				break;
+
+			mlog(0, "timed out during migration\n");
+		}
+		if (ret == -ERESTARTSYS) {
+			/* migration failed, detach and clean up mle */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			dlm_put_mle(mle);	/* second put drops the extra ref taken above */
+			goto leave;
+		}
+		/* TODO: if node died: stop, clean up, return error */
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, target);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	dlm_remove_nonlocal_locks(dlm, res);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	ret = 0;
+
+	dlm_lockres_calc_usage(dlm, res);
+
+leave:
+	/* re-dirty the lockres if we failed */
+	if (ret < 0)
+		dlm_kick_thread(dlm, res);
+
+	/* TODO: cleanup */
+	if (mres)
+		free_page((unsigned long)mres);
+
+	dlm_put(dlm);
+
+	mlog(0, "returning %d\n", ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
+
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	int ret;
+	spin_lock(&dlm->ast_lock);
+	spin_lock(&lock->spinlock);
+	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&dlm->ast_lock);
+	return ret;
+}
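+
+/* Sketch: this predicate is shaped for wait_event() callers; a path
+ * that must drain basts before proceeding might do
+ *
+ *	wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock));
+ *
+ * mirroring how dlm_lockres_is_dirty() is waited on in
+ * dlm_mark_lockres_migrating() below. */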
+
+static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     u8 mig_target)
+{
+	int can_proceed;
+	spin_lock(&res->spinlock);
+	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
+	spin_unlock(&res->spinlock);
+
+	/* target has died, so make the caller break out of the 
+	 * wait_event, but caller must recheck the domain_map */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(mig_target, dlm->domain_map))
+		can_proceed = 1;
+	spin_unlock(&dlm->spinlock);
+	return can_proceed;
+}
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	int ret;
+	spin_lock(&res->spinlock);
+	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
+	spin_unlock(&res->spinlock);
+	return ret;
+}
+
+
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target)
+{
+	int ret = 0;
+
+	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
+	       res->lockname.len, res->lockname.name, dlm->node_num,
+	       target);
+	/* need to set MIGRATING flag on lockres.  this is done by
+	 * ensuring that all asts have been flushed for this lockres. */
+	spin_lock(&res->spinlock);
+	BUG_ON(res->migration_pending);
+	res->migration_pending = 1;
+	/* strategy is to reserve an extra ast then release
+	 * it below, letting the release do all of the work */
+	__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
+	/* now flush all the pending asts... hang out for a bit */
+	dlm_kick_thread(dlm, res);
+	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+	dlm_lockres_release_ast(dlm, res);
+
+	mlog(0, "about to wait on migration_wq, dirty=%s\n",
+	       res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+	/* if the extra ref we just put was the final one, this
+	 * will pass thru immediately.  otherwise, we need to wait
+	 * for the last ast to finish. */
+again:
+	ret = wait_event_interruptible_timeout(dlm->migration_wq,
+		   dlm_migration_can_proceed(dlm, res, target),
+		   msecs_to_jiffies(1000));
+	if (ret < 0) {
+		mlog(0, "woken again: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	} else {
+		mlog(0, "all is well: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	}
+	if (!dlm_migration_can_proceed(dlm, res, target)) {
+		mlog(0, "trying again...\n");
+		goto again;
+	}
+
+	/* did the target go down or die? */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(target, dlm->domain_map)) {
+		mlog(ML_ERROR, "aha. migration target %u just went down\n",
+		     target);
+		ret = -EHOSTDOWN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	/*
+	 * at this point:
+	 *
+	 *   o the DLM_LOCK_RES_MIGRATING flag is set
+	 *   o there are no pending asts on this lockres
+	 *   o all processes trying to reserve an ast on this
+	 *     lockres must wait for the MIGRATING flag to clear
+	 */
+	return ret;
+}
+
+/* last step in the migration process.
+ * original master calls this to free all of the dlm_lock
+ * structures that used to be for other nodes. */
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res)
+{
+	struct list_head *iter, *iter2;
+	struct list_head *queue = &res->granted;
+	int i;
+	struct dlm_lock *lock;
+
+	assert_spin_locked(&res->spinlock);
+
+	BUG_ON(res->owner == dlm->node_num);
+
+	for (i = 0; i < 3; i++) {
+		list_for_each_safe(iter, iter2, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node != dlm->node_num) {
+				mlog(0, "putting lock for node %u\n",
+				     lock->ml.node);
+				/* be extra careful */
+				BUG_ON(!list_empty(&lock->ast_list));
+				BUG_ON(!list_empty(&lock->bast_list));
+				BUG_ON(lock->ast_pending);
+				BUG_ON(lock->bast_pending);
+				list_del_init(&lock->list);
+				dlm_lock_put(lock);
+			}
+		}
+		queue++;
+	}
+}
+
+/* for now this is not too intelligent.  we will
+ * need stats to make this do the right thing.
+ * this just finds the first lock on one of the
+ * queues and uses that node as the target. */
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue = &res->granted;
+	struct list_head *iter;
+	struct dlm_lock *lock;
+	int nodenum;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	spin_lock(&res->spinlock);
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue) {
+			/* up to the caller to make sure this node
+			 * is alive */
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node != dlm->node_num) {
+				spin_unlock(&res->spinlock);
+				return lock->ml.node;
+			}
+		}
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	mlog(0, "have not found a suitable target yet! checking domain map\n");
+
+	/* ok now we're getting desperate.  pick anyone alive. */
+	nodenum = -1;
+	while (1) {
+		nodenum = find_next_bit(dlm->domain_map,
+					O2NM_MAX_NODES, nodenum+1);
+		mlog(0, "found %d in domain map\n", nodenum);
+		if (nodenum >= O2NM_MAX_NODES)
+			break;
+		if (nodenum != dlm->node_num) {
+			mlog(0, "picking %d\n", nodenum);
+			return nodenum;
+		}
+	}
+
+	mlog(0, "giving up.  no master to migrate to\n");
+	return DLM_LOCK_RES_OWNER_UNKNOWN;
+}
+
+
+
+/* this is called by the new master once all lockres
+ * data has been received */
+static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 master, u8 new_master,
+				  struct dlm_node_iter *iter)
+{
+	struct dlm_migrate_request migrate;
+	int ret, status = 0;
+	int nodenum;
+
+	memset(&migrate, 0, sizeof(migrate));
+	migrate.namelen = res->lockname.len;
+	memcpy(migrate.name, res->lockname.name, migrate.namelen);
+	migrate.new_master = new_master;
+	migrate.master = master;
+
+	ret = 0;
+
+	/* send message to all nodes, except the master and myself */
+	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
+		if (nodenum == master ||
+		    nodenum == new_master)
+			continue;
+
+		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					 &migrate, sizeof(migrate), nodenum,
+					 &status);
+		if (ret < 0)
+			mlog_errno(ret);
+		else if (status < 0) {
+			mlog(0, "migrate request (node %u) returned %d!\n",
+			     nodenum, status);
+			ret = status;
+		}
+	}
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog(0, "returning ret=%d\n", ret);
+	return ret;
+}
+
+
+/* if there is an existing mle for this lockres, we now know who the master is.
+ * (the one who sent us *this* message) we can clear it up right away.
+ * since the process that put the mle on the list still has a reference to it,
+ * we can unhash it now, set the master and wake the process.  as a result,
+ * we will have no mle in the list to start with.  now we can add an mle for
+ * the migration and this should be the only one found for those scanning the
+ * list.  */
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
+	const char *name;
+	unsigned int namelen;
+	int ret = 0;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = migrate->name;
+	namelen = migrate->namelen;
+
+	/* preallocate.. if this fails, abort */
+	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
+							 GFP_KERNEL);
+
+	if (!mle) {
+		ret = -ENOMEM;
+		goto leave;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	spin_lock(&dlm->master_lock);
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* if all is working ok, this can only mean that we got
+			 * a migrate request from a node that we now see as
+			 * dead.  what can we do here?  drop it to the floor? */
+			spin_unlock(&res->spinlock);
+			mlog(ML_ERROR, "Got a migrate request, but the "
+			     "lockres is marked as recovering!\n");
+			kmem_cache_free(dlm_mle_cache, mle);
+			ret = -EINVAL; /* need a better solution */
+			goto unlock;
+		}
+		res->state |= DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+	}
+
+	/* ignore status.  only nonzero status would BUG. */
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
+				    name, namelen,
+				    migrate->new_master,
+				    migrate->master);
+
+unlock:
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (res)
+		dlm_lockres_put(res);
+leave:
+	dlm_put(dlm);
+	return ret;
+}
+
+/* must be holding dlm->spinlock and dlm->master_lock
+ * when adding a migration mle, we can clear any other mles
+ * in the master list because we know with certainty that
+ * the master is "master".  so we remove any old mle from
+ * the list after setting its master field, and then add
+ * the new migration mle.  this way we can hold with the rule
+ * of having only one mle for a given lock name at all times. */
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master)
+{
+	int found;
+	int ret = 0;
+
+	*oldmle = NULL;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* caller is responsible for any ref taken here on oldmle */
+	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
+	if (found) {
+		struct dlm_master_list_entry *tmp = *oldmle;
+		spin_lock(&tmp->spinlock);
+		if (tmp->type == DLM_MLE_MIGRATION) {
+			if (master == dlm->node_num) {
+				/* ah another process raced me to it */
+				mlog(0, "tried to migrate %.*s, but some "
+				     "process beat me to it\n",
+				     namelen, name);
+				ret = -EEXIST;
+			} else {
+				/* bad.  2 NODES are trying to migrate! */
+				mlog(ML_ERROR, "migration error mle: "
+				     "master=%u new_master=%u // request: "
+				     "master=%u new_master=%u // "
+				     "lockres=%.*s\n",
+				     tmp->master, tmp->new_master,
+				     master, new_master,
+				     namelen, name);
+				BUG();
+			}
+		} else {
+			/* this is essentially what assert_master does */
+			tmp->master = master;
+			atomic_set(&tmp->woken, 1);
+			wake_up(&tmp->wq);
+			/* remove it from the list so that only one
+			 * mle will be found */
+			list_del_init(&tmp->list);
+		}
+		spin_unlock(&tmp->spinlock);
+	}
+
+	/* now add a migration mle to the tail of the list */
+	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
+	mle->new_master = new_master;
+	mle->master = master;
+	/* do this for consistency with other mle types */
+	set_bit(new_master, mle->maybe_map);
+	list_add(&mle->list, &dlm->master_list);
+
+	return ret;
+}
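+
+/* Sketch of the locking contract above: both call sites take the same
+ * two locks, in the same order, around the call:
+ *
+ *	spin_lock(&dlm->spinlock);
+ *	spin_lock(&dlm->master_lock);
+ *	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+ *				    namelen, target, dlm->node_num);
+ *	spin_unlock(&dlm->master_lock);
+ *	spin_unlock(&dlm->spinlock);
+ *
+ * -EEXIST here means another local process already has a migration in
+ * flight for this name. */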
+
+
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	list_for_each_safe(iter, iter2, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+
+		BUG_ON(mle->type != DLM_MLE_BLOCK &&
+		       mle->type != DLM_MLE_MASTER &&
+		       mle->type != DLM_MLE_MIGRATION);
+
+		/* MASTER mles are initiated locally.  the waiting
+		 * process will notice the node map change
+		 * shortly.  let that happen as normal. */
+		if (mle->type == DLM_MLE_MASTER)
+			continue;
+
+
+		/* BLOCK mles are initiated by other nodes.
+		 * need to clean up if the dead node would have
+		 * been the master. */
+		if (mle->type == DLM_MLE_BLOCK) {
+			int bit;
+
+			spin_lock(&mle->spinlock);
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (bit != dead_node) {
+				mlog(0, "mle found, but dead node %u would "
+				     "not have been master\n", dead_node);
+				spin_unlock(&mle->spinlock);
+			} else {
+				/* must drop the refcount by one since the
+				 * assert_master will never arrive.  this
+				 * may result in the mle being unlinked and
+				 * freed, but there may still be a process
+				 * waiting in the dlmlock path which is fine. */
+				mlog(ML_ERROR, "node %u was expected master\n",
+				     dead_node);
+				atomic_set(&mle->woken, 1);
+				spin_unlock(&mle->spinlock);
+				wake_up(&mle->wq);
+				/* final put will take care of list removal */
+				__dlm_put_mle(mle);
+			}
+			continue;
+		}
+
+		/* everything else is a MIGRATION mle */
+
+		/* the rule for MIGRATION mles is that the master
+		 * becomes UNKNOWN if *either* the original or
+		 * the new master dies.  all UNKNOWN lockreses
+		 * are sent to whichever node becomes the recovery
+		 * master.  the new master is responsible for
+		 * determining if there is still a master for
+		 * this lockres, or if he needs to take over
+		 * mastery.  either way, this node should expect
+		 * another message to resolve this. */
+		if (mle->master != dead_node &&
+		    mle->new_master != dead_node)
+			continue;
+
+		/* if we have reached this point, this mle needs to
+		 * be removed from the list and freed. */
+
+		/* remove from the list early.  NOTE: unlinking
+		 * list_head while in list_for_each_safe */
+		spin_lock(&mle->spinlock);
+		list_del_init(&mle->list);
+		atomic_set(&mle->woken, 1);
+		spin_unlock(&mle->spinlock);
+		wake_up(&mle->wq);
+
+		mlog(0, "node %u died during migration from "
+		     "%u to %u!\n", dead_node,
+		     mle->master, mle->new_master);
+		/* if there is a lockres associated with this
+		 * mle, find it and set its owner to UNKNOWN */
+		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
+					   mle->u.name.len);
+		if (res) {
+			/* unfortunately if we hit this rare case, our
+			 * lock ordering is messed up.  we need to drop
+			 * the master lock so that we can take the
+			 * lockres lock, meaning that we will have to
+			 * restart from the head of the list. */
+			spin_unlock(&dlm->master_lock);
+
+			/* move lockres onto recovery list */
+			spin_lock(&res->spinlock);
+			dlm_set_lockres_owner(dlm, res,
+					      DLM_LOCK_RES_OWNER_UNKNOWN);
+			dlm_move_lockres_to_recovery_list(dlm, res);
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+
+			/* dump the mle */
+			spin_lock(&dlm->master_lock);
+			__dlm_put_mle(mle);
+			spin_unlock(&dlm->master_lock);
+
+			/* restart */
+			goto top;
+		}
+
+		/* this may be the last reference */
+		__dlm_put_mle(mle);
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+
+int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 u8 old_master)
+{
+	struct dlm_node_iter iter;
+	int ret = 0;
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	clear_bit(old_master, iter.node_map);
+	clear_bit(dlm->node_num, iter.node_map);
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "now time to do a migrate request to other nodes\n");
+	ret = dlm_do_migrate_request(dlm, res, old_master,
+				     dlm->node_num, &iter);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mlog(0, "doing assert master of %.*s to all except the original node\n",
+	     res->lockname.len, res->lockname.name);
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len, iter.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (ret < 0) {
+		/* no longer need to retry.  all living nodes contacted. */
+		mlog_errno(ret);
+		ret = 0;
+	}
+
+	memset(iter.node_map, 0, sizeof(iter.node_map));
+	set_bit(old_master, iter.node_map);
+	mlog(0, "doing assert master of %.*s back to %u\n",
+	     res->lockname.len, res->lockname.name, old_master);
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len, iter.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (ret < 0) {
+		mlog(0, "assert master to original master failed "
+		     "with %d.\n", ret);
+		/* the only nonzero status here would be because of
+		 * a dead original node.  we're done. */
+		ret = 0;
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, dlm->node_num);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	/* re-dirty it on the new master */
+	dlm_kick_thread(dlm, res);
+	wake_up(&res->wq);
+leave:
+	return ret;
+}
+
+/*
+ * LOCKRES AST REFCOUNT
+ * this is integral to migration
+ */
+
+/* for future intent to call an ast, reserve one ahead of time.
+ * this should be called only after waiting on the lockres
+ * with dlm_wait_on_lockres, and while still holding the
+ * spinlock after the call. */
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		__dlm_print_one_lock_resource(res);
+	}
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+
+	atomic_inc(&res->asts_reserved);
+}
+
+/*
+ * used to drop the reserved ast, either because it went unused,
+ * or because the ast/bast was actually called.
+ *
+ * also, if there is a pending migration on this lockres,
+ * and this was the last pending ast on the lockres,
+ * atomically set the MIGRATING flag before we drop the lock.
+ * this is how we ensure that migration can proceed with no
+ * asts in progress.  note that it is ok if the state of the
+ * queues is such that a lock should be granted in the future
+ * or that a bast should be fired, because the new master will
+ * shuffle the lists on this lockres as soon as it is migrated.
+ */
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
+{
+	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
+		return;
+
+	if (!res->migration_pending) {
+		spin_unlock(&res->spinlock);
+		return;
+	}
+
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+	res->migration_pending = 0;
+	res->state |= DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	wake_up(&dlm->migration_wq);
+}
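+
+/* Sketch of the reserve/release pairing described above, as the
+ * migration path uses it:
+ *
+ *	spin_lock(&res->spinlock);
+ *	__dlm_lockres_reserve_ast(res);		(count the future ast)
+ *	spin_unlock(&res->spinlock);
+ *	...queue and fire the ast...
+ *	dlm_lockres_release_ast(dlm, res);	(final put may atomically
+ *						 set DLM_LOCK_RES_MIGRATING)
+ */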

+ 2132 - 0
fs/ocfs2/dlm/dlmrecovery.c

@@ -0,0 +1,2132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
+#include "cluster/masklog.h"
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
+
+static int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+static int dlm_do_recovery(struct dlm_ctxt *dlm);
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_request_all_locks(struct dlm_ctxt *dlm,
+				 u8 request_from, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master);
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lockres *mres,
+				    u8 send_to,
+				    struct dlm_lock_resource *res,
+				    int total_locks);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres);
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master);
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
+				 u8 dead_node, u8 send_to);
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list, u8 dead_node);
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master);
+static void dlm_reco_ast(void *astdata);
+static void dlm_reco_bast(void *astdata, int blocked_type);
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
+static void dlm_request_all_locks_worker(struct dlm_work_item *item,
+					 void *data);
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+
+static u64 dlm_get_next_mig_cookie(void);
+
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
+static u64 dlm_mig_cookie = 1;
+
+static u64 dlm_get_next_mig_cookie(void)
+{
+	u64 c;
+	spin_lock(&dlm_mig_cookie_lock);
+	c = dlm_mig_cookie;
+	if (dlm_mig_cookie == (~0ULL))
+		dlm_mig_cookie = 1;
+	else
+		dlm_mig_cookie++;
+	spin_unlock(&dlm_mig_cookie_lock);
+	return c;
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Worker function used during recovery: drains dlm->work_list and runs
+ * each queued dlm_work_item outside of the message handlers, where it
+ * is safe to sleep and do network i/o. */
+void dlm_dispatch_work(void *data)
+{
+	struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
+	LIST_HEAD(tmp_list);
+	struct list_head *iter, *iter2;
+	struct dlm_work_item *item;
+	dlm_workfunc_t *workfunc;
+
+	spin_lock(&dlm->work_lock);
+	list_splice_init(&dlm->work_list, &tmp_list);
+	spin_unlock(&dlm->work_lock);
+
+	list_for_each_safe(iter, iter2, &tmp_list) {
+		item = list_entry(iter, struct dlm_work_item, list);
+		workfunc = item->func;
+		list_del_init(&item->list);
+
+		/* already have ref on dlm to avoid having
+		 * it disappear.  just double-check. */
+		BUG_ON(item->dlm != dlm);
+
+		/* this is allowed to sleep and
+		 * call network stuff */
+		workfunc(item, item->data);
+
+		dlm_put(dlm);
+		kfree(item);
+	}
+}
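+
+/* A minimal usage sketch (mirroring the real handlers later in this
+ * file, not a new API): a message handler queues deferred work via
+ *
+ *	dlm_grab(dlm);   (extra ref, dropped above after the item runs)
+ *	dlm_init_work_item(dlm, item, some_worker, data);
+ *	spin_lock(&dlm->work_lock);
+ *	list_add_tail(&item->list, &dlm->work_list);
+ *	spin_unlock(&dlm->work_lock);
+ *	schedule_work(&dlm->dispatched_work);
+ *
+ * where some_worker is a placeholder name for any dlm_workfunc_t. */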
+
+/*
+ * RECOVERY THREAD
+ */
+
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+{
+	/* wake the recovery thread
+	 * this will wake the reco thread in one of three places
+	 * 1) sleeping with no recovery happening
+	 * 2) sleeping with recovery mastered elsewhere
+	 * 3) recovery mastered here, waiting on reco data */
+
+	wake_up(&dlm->dlm_reco_thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm recovery thread...\n");
+
+	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
+						"dlm_reco_thread");
+	if (IS_ERR(dlm->dlm_reco_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
+		dlm->dlm_reco_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_reco_thread_task) {
+		mlog(0, "waiting for dlm recovery thread to exit\n");
+		kthread_stop(dlm->dlm_reco_thread_task);
+		dlm->dlm_reco_thread_task = NULL;
+	}
+}
+
+
+
+/*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ *    ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    that's right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ *    and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of the secondary lock queue info
+ *    one lock at a time, forcing each node to communicate back
+ *    before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends an ALLDONE!
+ *    message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ *    and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, he unlocks
+ *    everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
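+/* A compact sketch of the per-node state walk that the new master
+ * drives (a simplification of dlm_remaster_locks() below, using the
+ * DLM_RECO_NODE_DATA_* states from dlmcommon.h):
+ *
+ *	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+ *		request_all_locks(ndata);       steps 5-6: INIT -> REQUESTED
+ *	}
+ *	wait until every node reports DATA DONE   (REQUESTED -> DONE)
+ *	send_finalize();                          steps 7-9
+ *
+ * request_all_locks/send_finalize are shorthand here for the real
+ * helpers dlm_request_all_locks() and
+ * dlm_send_finalize_reco_message(). */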
+
+#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
+
+static int dlm_recovery_thread(void *data)
+{
+	int status;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		if (dlm_joined(dlm)) {
+			status = dlm_do_recovery(dlm);
+			if (status == -EAGAIN) {
+				/* do not sleep, recheck immediately. */
+				continue;
+			}
+			if (status < 0)
+				mlog_errno(status);
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM recovery thread\n");
+	return 0;
+}
+
+/* callers of the top-level api calls (dlmlock/dlmunlock) should
+ * block on the dlm->reco.event when recovery is in progress.
+ * the dlm recovery thread will set this state when it begins
+ * recovering a dead node (as the new master or not) and clear
+ * the state and wake as soon as all affected lock resources have
+ * been marked with the RECOVERY flag */
+static int dlm_in_recovery(struct dlm_ctxt *dlm)
+{
+	int in_recovery;
+	spin_lock(&dlm->spinlock);
+	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	spin_unlock(&dlm->spinlock);
+	return in_recovery;
+}
+
+
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
+{
+	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
+}
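+
+/* Illustrative only (not code from this file): a dlmlock() caller is
+ * expected to serialize against recovery roughly like
+ *
+ *	dlm_wait_for_recovery(dlm);
+ *	res = dlm_get_lock_resource(dlm, name, flags);
+ *
+ * so that no new requests race with lock resources that are still
+ * being marked RECOVERING. */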
+
+static void dlm_begin_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_end_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
+	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+	wake_up(&dlm->reco.event);
+}
+
+static int dlm_do_recovery(struct dlm_ctxt *dlm)
+{
+	int status = 0;
+
+	spin_lock(&dlm->spinlock);
+
+	/* check to see if the new master has died */
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
+	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+		mlog(0, "new master %u died while recovering %u!\n",
+		     dlm->reco.new_master, dlm->reco.dead_node);
+		/* unset the new_master, leave dead_node */
+		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	}
+
+	/* select a target to recover */
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		int bit;
+
+		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES)
+			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		else
+			dlm->reco.dead_node = bit;
+	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+		/* BUG? */
+		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
+		     dlm->reco.dead_node);
+		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		// mlog(0, "nothing to recover!  sleeping now!\n");
+		spin_unlock(&dlm->spinlock);
+		/* return to main thread loop and sleep. */
+		return 0;
+	}
+	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	     dlm->reco.dead_node);
+	spin_unlock(&dlm->spinlock);
+
+	/* take write barrier */
+	/* (stops the list reshuffling thread, proxy ast handling) */
+	dlm_begin_recovery(dlm);
+
+	if (dlm->reco.new_master == dlm->node_num)
+		goto master_here;
+
+	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+		/* choose a new master */
+		if (!dlm_pick_recovery_master(dlm)) {
+			/* already notified everyone.  go. */
+			dlm->reco.new_master = dlm->node_num;
+			goto master_here;
+		}
+		mlog(0, "another node will master this recovery session.\n");
+	}
+	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->reco.new_master,
+	     dlm->node_num, dlm->reco.dead_node);
+
+	/* it is safe to start everything back up here
+	 * because all of the dead node's lock resources
+	 * have been marked as in-recovery */
+	dlm_end_recovery(dlm);
+
+	/* sleep out in main dlm_recovery_thread loop. */
+	return 0;
+
+master_here:
+	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->name, dlm->reco.dead_node, dlm->node_num);
+
+	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
+	if (status < 0) {
+		mlog(ML_ERROR, "error %d remastering locks for node %u, "
+		     "retrying.\n", status, dlm->reco.dead_node);
+	} else {
+		/* success!  see if any other nodes need recovery */
+		dlm_reset_recovery(dlm);
+	}
+	dlm_end_recovery(dlm);
+
+	/* continue and look for another dead node */
+	return -EAGAIN;
+}
+
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int status = 0;
+	struct dlm_reco_node_data *ndata;
+	struct list_head *iter;
+	int all_nodes_done;
+	int destroy = 0;
+	int pass = 0;
+
+	status = dlm_init_recovery_area(dlm, dead_node);
+	if (status < 0)
+		goto leave;
+
+	/* safe to access the node data list without a lock, since this
+	 * process is the only one to change the list */
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry(iter, struct dlm_reco_node_data, list);
+		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
+		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
+
+		mlog(0, "requesting lock info from node %u\n",
+		     ndata->node_num);
+
+		if (ndata->node_num == dlm->node_num) {
+			ndata->state = DLM_RECO_NODE_DATA_DONE;
+			continue;
+		}
+
+		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
+		if (status < 0) {
+			mlog_errno(status);
+			if (dlm_is_host_down(status))
+				ndata->state = DLM_RECO_NODE_DATA_DEAD;
+			else {
+				destroy = 1;
+				goto leave;
+			}
+		}
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				mlog(0, "node %u died after requesting "
+				     "recovery info for node %u\n",
+				     ndata->node_num, dead_node);
+				// start all over
+				destroy = 1;
+				status = -EAGAIN;
+				goto leave;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
+				mlog(0, "now receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				mlog(0, "already receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "already DONE receiving recovery data "
+				     "from node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+		}
+	}
+
+	mlog(0, "done requesting all lock info\n");
+
+	/* nodes should be sending reco data now
+	 * just need to wait */
+
+	while (1) {
+		/* check all the nodes now to see if we are
+		 * done, or if anyone died */
+		all_nodes_done = 1;
+		spin_lock(&dlm_reco_state_lock);
+		list_for_each(iter, &dlm->reco.node_data) {
+			ndata = list_entry(iter, struct dlm_reco_node_data, list);
+
+			mlog(0, "checking recovery state of node %u\n",
+			     ndata->node_num);
+			switch (ndata->state) {
+				case DLM_RECO_NODE_DATA_INIT:
+				case DLM_RECO_NODE_DATA_REQUESTING:
+					mlog(ML_ERROR, "bad ndata state for "
+					     "node %u: state=%d\n",
+					     ndata->node_num, ndata->state);
+					BUG();
+					break;
+				case DLM_RECO_NODE_DATA_DEAD:
+					mlog(0, "node %u died after "
+					     "requesting recovery info for "
+					     "node %u\n", ndata->node_num,
+					     dead_node);
+					spin_unlock(&dlm_reco_state_lock);
+					// start all over
+					destroy = 1;
+					status = -EAGAIN;
+					goto leave;
+				case DLM_RECO_NODE_DATA_RECEIVING:
+				case DLM_RECO_NODE_DATA_REQUESTED:
+					all_nodes_done = 0;
+					break;
+				case DLM_RECO_NODE_DATA_DONE:
+					break;
+				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+					break;
+			}
+		}
+		spin_unlock(&dlm_reco_state_lock);
+
+		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
+		     all_nodes_done ? "yes" : "no");
+		if (all_nodes_done) {
+			int ret;
+
+			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state;
+			 * just send a finalize message to everyone and
+			 * clean up */
+			mlog(0, "all nodes are done! send finalize\n");
+			ret = dlm_send_finalize_reco_message(dlm);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			spin_lock(&dlm->spinlock);
+			dlm_finish_local_lockres_recovery(dlm, dead_node,
+							  dlm->node_num);
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "should be done with recovery!\n");
+
+			mlog(0, "finishing recovery of %s at %lu, "
+			     "dead=%u, this=%u, new=%u\n", dlm->name,
+			     jiffies, dlm->reco.dead_node,
+			     dlm->node_num, dlm->reco.new_master);
+			destroy = 1;
+			status = ret;
+			/* rescan everything marked dirty along the way */
+			dlm_kick_thread(dlm, NULL);
+			break;
+		}
+		/* wait to be signalled, with periodic timeout
+		 * to check for node death */
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+					 kthread_should_stop(),
+					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
+
+	}
+
+leave:
+	if (destroy)
+		dlm_destroy_recovery_area(dlm, dead_node);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int num = 0;
+	struct dlm_reco_node_data *ndata;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+	/* nodes can only be removed (by dying) after dropping
+	 * this lock, and death will be trapped later, so this should do */
+	spin_unlock(&dlm->spinlock);
+
+	while (1) {
+		num = find_next_bit(dlm->reco.node_map, O2NM_MAX_NODES, num);
+		if (num >= O2NM_MAX_NODES)
+			break;
+		BUG_ON(num == dead_node);
+
+		ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+		if (!ndata) {
+			dlm_destroy_recovery_area(dlm, dead_node);
+			return -ENOMEM;
+		}
+		ndata->node_num = num;
+		ndata->state = DLM_RECO_NODE_DATA_INIT;
+		spin_lock(&dlm_reco_state_lock);
+		list_add_tail(&ndata->list, &dlm->reco.node_data);
+		spin_unlock(&dlm_reco_state_lock);
+		num++;
+	}
+
+	return 0;
+}
+
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_reco_node_data *ndata;
+	LIST_HEAD(tmplist);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_splice_init(&dlm->reco.node_data, &tmplist);
+	spin_unlock(&dlm_reco_state_lock);
+
+	list_for_each_safe(iter, iter2, &tmplist) {
+		ndata = list_entry(iter, struct dlm_reco_node_data, list);
+		list_del_init(&ndata->list);
+		kfree(ndata);
+	}
+}
+
+static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
+				 u8 dead_node)
+{
+	struct dlm_lock_request lr;
+	int ret;
+
+	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
+		  "to %u\n", dead_node, request_from);
+
+	memset(&lr, 0, sizeof(lr));
+	lr.node_idx = dlm->node_num;
+	lr.dead_node = dead_node;
+
+	/* send message */
+	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
+				 &lr, sizeof(lr), request_from, NULL);
+
+	/* negative status is handled by caller */
+	if (ret < 0)
+		mlog_errno(ret);
+
+	/* return from here, then sleep until all received or error */
+	return ret;
+}
+
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(lr->dead_node != dlm->reco.dead_node);
+
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!item) {
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* this will get freed by dlm_request_all_locks_worker */
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!buf) {
+		kfree(item);
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* queue up work for dlm_request_all_locks_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
+	item->u.ral.reco_master = lr->node_idx;
+	item->u.ral.dead_node = lr->dead_node;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+
+	dlm_put(dlm);
+	return 0;
+}
+
+static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_migratable_lockres *mres;
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
+	LIST_HEAD(resources);
+	struct list_head *iter;
+	int ret;
+	u8 dead_node, reco_master;
+
+	dlm = item->dlm;
+	dead_node = item->u.ral.dead_node;
+	reco_master = item->u.ral.reco_master;
+	BUG_ON(dead_node != dlm->reco.dead_node);
+	BUG_ON(reco_master != dlm->reco.new_master);
+
+	mres = (struct dlm_migratable_lockres *)data;
+
+	/* lock resources should have already been moved to the
+ 	 * dlm->reco.resources list.  now move items from that list
+ 	 * to a temp list if the dead owner matches.  note that the
+	 * whole cluster recovers only one node at a time, so we
+	 * can safely move UNKNOWN lock resources for each recovery
+	 * session. */
+	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
+
+	/* now we can begin blasting lockreses without the dlm lock */
+	list_for_each(iter, &resources) {
+		res = list_entry(iter, struct dlm_lock_resource, recovering);
+		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
+				   	DLM_MRES_RECOVERY);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/* move the resources back to the list */
+	spin_lock(&dlm->spinlock);
+	list_splice_init(&resources, &dlm->reco.resources);
+	spin_unlock(&dlm->spinlock);
+
+	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	free_page((unsigned long)data);
+}
+
+
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
+{
+	int ret, tmpret;
+	struct dlm_reco_data_done done_msg;
+
+	memset(&done_msg, 0, sizeof(done_msg));
+	done_msg.node_idx = dlm->node_num;
+	done_msg.dead_node = dead_node;
+	mlog(0, "sending DATA DONE message to %u, "
+	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
+	     done_msg.dead_node);
+
+	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
+				 sizeof(done_msg), send_to, &tmpret);
+	/* negative status is ignored by the caller */
+	if (ret >= 0)
+		ret = tmpret;
+	return ret;
+}
+
+
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
+	struct list_head *iter;
+	struct dlm_reco_node_data *ndata = NULL;
+	int ret = -EINVAL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+	     "node_idx=%u, this node=%u\n", done->dead_node,
+	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
+	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry(iter, struct dlm_reco_node_data, list);
+		if (ndata->node_num != done->node_idx)
+			continue;
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_DEAD:
+			case DLM_RECO_NODE_DATA_DONE:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(ML_ERROR, "bad ndata state for node %u:"
+				     " state=%d\n", ndata->node_num,
+				     ndata->state);
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				mlog(0, "node %u is DONE sending "
+					  "recovery data!\n",
+					  ndata->node_num);
+
+				ndata->state = DLM_RECO_NODE_DATA_DONE;
+				ret = 0;
+				break;
+		}
+	}
+	spin_unlock(&dlm_reco_state_lock);
+
+	/* wake the recovery thread, some node is done */
+	if (!ret)
+		dlm_kick_recovery_thread(dlm);
+
+	if (ret < 0)
+		mlog(ML_ERROR, "failed to find recovery node data for node "
+		     "%u\n", done->node_idx);
+	dlm_put(dlm);
+
+	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
+	return ret;
+}
+
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list,
+				       	u8 dead_node)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *iter, *iter2;
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
+		res = list_entry(iter, struct dlm_lock_resource, recovering);
+		if (dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len))
+			continue;
+		if (res->owner == dead_node) {
+			mlog(0, "found lockres owned by dead node while "
+				  "doing recovery for node %u. sending it.\n",
+				  dead_node);
+			list_del_init(&res->recovering);
+			list_add_tail(&res->recovering, list);
+		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "found UNKNOWN owner while doing recovery "
+				  "for node %u. sending it.\n", dead_node);
+			list_del_init(&res->recovering);
+			list_add_tail(&res->recovering, list);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
+{
+	int total_locks = 0;
+	struct list_head *iter, *queue = &res->granted;
+	int i;
+
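+	/* granted, converting and blocked are laid out back-to-back in
+	 * struct dlm_lock_resource, so advancing the queue pointer three
+	 * times walks all three lists (the same layout assumption as
+	 * dlm_list_num_to_pointer() below) */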
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue)
+			total_locks++;
+		queue++;
+	}
+	return total_locks;
+}
+
+
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				      struct dlm_migratable_lockres *mres,
+				      u8 send_to,
+				      struct dlm_lock_resource *res,
+				      int total_locks)
+{
+	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
+	int mres_total_locks = be32_to_cpu(mres->total_locks);
+	int sz, ret = 0, status = 0;
+	u8 orig_flags = mres->flags,
+	   orig_master = mres->master;
+
+	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
+	if (!mres->num_locks)
+		return 0;
+
+	sz = sizeof(struct dlm_migratable_lockres) +
+		(mres->num_locks * sizeof(struct dlm_migratable_lock));
+
+	/* add an all-done flag if we reached the last lock */
+	orig_flags = mres->flags;
+	BUG_ON(total_locks > mres_total_locks);
+	if (total_locks == mres_total_locks)
+		mres->flags |= DLM_MRES_ALL_DONE;
+
+	/* send it */
+	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
+				 sz, send_to, &status);
+	if (ret < 0) {
+		/* XXX: negative status is not handled.
+		 * this will end up killing this node. */
+		mlog_errno(ret);
+	} else {
+		/* might get an -ENOMEM back here */
+		ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
+
+			if (ret == -EFAULT) {
+				mlog(ML_ERROR, "node %u told me to kill "
+				     "myself!\n", send_to);
+				BUG();
+			}
+		}
+	}
+
+	/* zero and reinit the message buffer */
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, mres_total_locks,
+				    mig_cookie, orig_flags, orig_master);
+	return ret;
+}
+
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master)
+{
+	/* mres here is one full page */
+	memset(mres, 0, PAGE_SIZE);
+	mres->lockname_len = namelen;
+	memcpy(mres->lockname, lockname, namelen);
+	mres->num_locks = 0;
+	mres->total_locks = cpu_to_be32(total_locks);
+	mres->mig_cookie = cpu_to_be64(cookie);
+	mres->flags = flags;
+	mres->master = master;
+}
+
+
+/* returns 1 if this lock fills the network structure,
+ * 0 otherwise */
+static int dlm_add_lock_to_array(struct dlm_lock *lock,
+				 struct dlm_migratable_lockres *mres, int queue)
+{
+	struct dlm_migratable_lock *ml;
+	int lock_num = mres->num_locks;
+
+	ml = &(mres->ml[lock_num]);
+	ml->cookie = lock->ml.cookie;
+	ml->type = lock->ml.type;
+	ml->convert_type = lock->ml.convert_type;
+	ml->highest_blocked = lock->ml.highest_blocked;
+	ml->list = queue;
+	if (lock->lksb) {
+		ml->flags = lock->lksb->flags;
+		/* send our current lvb */
+		if (ml->type == LKM_EXMODE ||
+		    ml->type == LKM_PRMODE) {
+			/* if it is already set, this had better be a PR
+			 * and it has to match */
+			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
+			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+				mlog(ML_ERROR, "mismatched lvbs!\n");
+				__dlm_print_one_lock_resource(lock->lockres);
+				BUG();
+			}
+			memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+		}
+	}
+	ml->node = lock->ml.node;
+	mres->num_locks++;
+	/* we reached the max, send this network message */
+	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
+		return 1;
+	return 0;
+}
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to, u8 flags)
+{
+	struct list_head *queue, *iter;
+	int total_locks, i;
+	u64 mig_cookie = 0;
+	struct dlm_lock *lock;
+	int ret = 0;
+
+	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	mlog(0, "sending to %u\n", send_to);
+
+	total_locks = dlm_num_locks_in_lockres(res);
+	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
+		/* rare, but possible */
+		mlog(0, "argh.  lockres has %d locks.  this will "
+			  "require more than one network packet to "
+			  "migrate\n", total_locks);
+		mig_cookie = dlm_get_next_mig_cookie();
+	}
+
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, total_locks,
+				    mig_cookie, flags, res->owner);
+
+	total_locks = 0;
+	for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+
+			/* add another lock. */
+			total_locks++;
+			if (!dlm_add_lock_to_array(lock, mres, i))
+				continue;
+
+			/* this filled the lock message,
+			 * we must send it immediately. */
+			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
+						       res, total_locks);
+			if (ret < 0) {
+				// TODO
+				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
+				     "returned %d, TODO\n", ret);
+				BUG();
+			}
+		}
+	}
+	/* flush any remaining locks */
+	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
+	if (ret < 0) {
+		// TODO
+		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
+		     "TODO\n", ret);
+		BUG();
+	}
+	return ret;
+}
+
+
+
+/*
+ * this message will contain no more than one page worth of
+ * recovery data, and it will work on only one lockres.
+ * there may be many locks in this page, and we may need to wait
+ * for additional packets to complete all the locks (rare, but
+ * possible).
+ */
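+/* Worked sizing sketch (the authoritative constants live in
+ * dlmcommon.h): one page holds the dlm_migratable_lockres header plus
+ * an array of dlm_migratable_lock entries, so the per-message cap is
+ * roughly
+ *
+ *	(PAGE_SIZE - sizeof(struct dlm_migratable_lockres)) /
+ *		sizeof(struct dlm_migratable_lock)
+ *
+ * which is what DLM_MAX_MIGRATABLE_LOCKS reflects; anything beyond
+ * that spills into additional messages tied together by mig_cookie. */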
+/*
+ * NOTE: the allocation error cases here are scary.
+ * We really cannot afford to fail an alloc in recovery;
+ * do we spin?  Returning an error only delays the problem, really.
+ */
+
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_migratable_lockres *mres =
+		(struct dlm_migratable_lockres *)msg->buf;
+	int ret = 0;
+	u8 real_master;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+	struct dlm_lock_resource *res = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	real_master = mres->master;
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* cannot migrate a lockres with no master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+	}
+
+	mlog(0, "%s message received from node %u\n",
+		  (mres->flags & DLM_MRES_RECOVERY) ?
+		  "recovery" : "migration", mres->master);
+	if (mres->flags & DLM_MRES_ALL_DONE)
+		mlog(0, "all done flag.  all lockres data received!\n");
+
+	ret = -ENOMEM;
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!buf || !item)
+		goto leave;
+
+	/* lookup the lock to see if we have a secondary queue for this
+	 * already...  just add the locks in and this will have its owner
+	 * and RECOVERY flag changed when it completes. */
+	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	if (res) {
+		/* dlm_lookup_lockres has taken a ref on res for us */
+		/* mark it as recovering/migrating */
+		spin_lock(&res->spinlock);
+		if (mres->flags & DLM_MRES_RECOVERY) {
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		} else {
+			if (res->state & DLM_LOCK_RES_MIGRATING) {
+				/* this is at least the second
+				 * lockres message */
+				mlog(0, "lock %.*s is already migrating\n",
+					  mres->lockname_len,
+					  mres->lockname);
+			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
+				/* caller should BUG */
+				mlog(ML_ERROR, "node is attempting to migrate "
+				     "lock %.*s, but marked as recovering!\n",
+				     mres->lockname_len, mres->lockname);
+				ret = -EFAULT;
+				spin_unlock(&res->spinlock);
+				goto leave;
+			}
+			res->state |= DLM_LOCK_RES_MIGRATING;
+		}
+		spin_unlock(&res->spinlock);
+	} else {
+		/* need to allocate, just like if it was
+		 * mastered here normally  */
+		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
+		if (!res)
+			goto leave;
+
+		/* to match the ref that we would have gotten if
+		 * dlm_lookup_lockres had succeeded */
+		dlm_lockres_get(res);
+
+		/* mark it as recovering/migrating and hash it */
+		if (mres->flags & DLM_MRES_RECOVERY)
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		else
+			res->state |= DLM_LOCK_RES_MIGRATING;
+
+		spin_lock(&dlm->spinlock);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&dlm->spinlock);
+
+		/* now that the new lockres is inserted,
+		 * make it usable by other processes */
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		spin_unlock(&res->spinlock);
+
+		/* add an extra ref for just-allocated lockres 
+		 * otherwise the lockres will be purged immediately */
+		dlm_lockres_get(res);
+
+	}
+
+	/* at this point we have allocated everything we need,
+	 * and we have a hashed lockres with an extra ref and
+	 * the proper res->state flags. */
+	ret = 0;
+	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* migration cannot have an unknown master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+		mlog(0, "recovery has passed me a lockres with an "
+			  "unknown owner.. will need to requery: "
+			  "%.*s\n", mres->lockname_len, mres->lockname);
+	} else {
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		spin_unlock(&res->spinlock);
+	}
+
+	/* queue up work for dlm_mig_lockres_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */
+	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
+	item->u.ml.lockres = res; /* already have a ref */
+	item->u.ml.real_master = real_master;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+
+leave:
+	dlm_put(dlm);
+	if (ret < 0) {
+		kfree(buf);
+		kfree(item);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm;
+	struct dlm_migratable_lockres *mres;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	u8 real_master;
+
+	dlm = item->dlm;
+	mres = (struct dlm_migratable_lockres *)data;
+
+	res = item->u.ml.lockres;
+	real_master = item->u.ml.real_master;
+
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* this case is super-rare. only occurs if
+		 * node death happens during migration. */
+again:
+		ret = dlm_lockres_master_requery(dlm, res, &real_master);
+		if (ret < 0) {
+			mlog(0, "dlm_lockres_master_requery failure: %d\n",
+				  ret);
+			goto again;
+		}
+		if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lockres %.*s not claimed.  "
+				   "this node will take it.\n",
+				   res->lockname.len, res->lockname.name);
+		} else {
+			mlog(0, "master needs to respond to sender "
+				  "that node %u still owns %.*s\n",
+				  real_master, res->lockname.len,
+				  res->lockname.name);
+			/* cannot touch this lockres */
+			goto leave;
+		}
+	}
+
+	ret = dlm_process_recovery_data(dlm, res, mres);
+	if (ret < 0)
+		mlog(0, "dlm_process_recovery_data returned %d\n", ret);
+	else
+		mlog(0, "dlm_process_recovery_data succeeded\n");
+
+	if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
+	                   (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
+		ret = dlm_finish_migration(dlm, res, mres->master);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+leave:
+	kfree(data);
+	mlog_exit(ret);
+}
+
+
+
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master)
+{
+	struct dlm_node_iter iter;
+	int nodenum;
+	int ret = 0;
+
+	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	/* we only reach here if one of the two nodes in a
+	 * migration died while the migration was in progress.
+	 * at this point we need to requery the master.  we
+	 * know that the new_master got as far as creating
+	 * an mle on at least one node, but we do not know
+	 * if any nodes had actually cleared the mle and set
+	 * the master to the new_master.  the old master
+	 * is supposed to set the owner to UNKNOWN in the
+	 * event of a new_master death, so the only possible
+	 * responses that we can get from nodes here are
+	 * that the master is new_master, or that the master
+	 * is UNKNOWN.
+	 * if all nodes come back with UNKNOWN then we know
+	 * the lock needs remastering here.
+	 * if any node comes back with a valid master, check
+	 * to see if that master is the one that we are
+	 * recovering.  if so, then the new_master died and
+	 * we need to remaster this lock.  if not, then the
+	 * new_master survived and that node will respond to
+	 * other nodes about the owner.
+	 * if there is an owner, this node needs to dump this
+	 * lockres and alert the sender that this lockres
+	 * was rejected. */
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		/* do not send to self */
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
+		if (ret < 0) {
+			mlog_errno(ret);
+			BUG();
+			/* TODO: need to figure a way to restart this */
+		}
+		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lock master is %u\n", *real_master);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master)
+{
+	int ret = -EINVAL;
+	struct dlm_master_requery req;
+	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	memset(&req, 0, sizeof(req));
+	req.node_idx = dlm->node_num;
+	req.namelen = res->lockname.len;
+	memcpy(req.name, res->lockname.name, res->lockname.len);
+
+	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
+				 &req, sizeof(req), nodenum, &status);
+	/* XXX: negative status not handled properly here. */
+	if (ret < 0)
+		mlog_errno(ret);
+	else {
+		BUG_ON(status < 0);
+		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
+		*real_master = (u8) (status & 0xff);
+		mlog(0, "node %u responded to master requery with %u\n",
+			  nodenum, *real_master);
+		ret = 0;
+	}
+	return ret;
+}
+
+
+/* this function cannot error, so unless the sending
+ * or receiving of the message failed, the owner can
+ * be trusted */
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
+	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+
+	if (!dlm_grab(dlm)) {
+		/* since the domain has gone away on this
+		 * node, the proper response is UNKNOWN */
+		return master;
+	}
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		master = res->owner;
+		if (master == dlm->node_num) {
+			int ret = dlm_dispatch_assert_master(dlm, res,
+							     0, 0, flags);
+			if (ret < 0) {
+				mlog_errno(-ENOMEM);
+				/* retry!? */
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+	return master;
+}
+
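+/* maps a queue index from the wire (ml->list) back to the matching
+ * list_head; like dlm_num_locks_in_lockres() above, this relies on
+ * granted/converting/blocked being consecutive members of
+ * struct dlm_lock_resource */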
+static inline struct list_head *
+dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
+{
+	struct list_head *ret;
+	BUG_ON(list_num < 0);
+	BUG_ON(list_num > 2);
+	ret = &(res->granted);
+	ret += list_num;
+	return ret;
+}
+
+/* TODO: do ast flush business
+ * TODO: do MIGRATING and RECOVERING spinning
+ */
+
+/*
+ * NOTE about in-flight requests during migration:
+ *
+ * Before attempting the migrate, the master has marked the lockres as
+ * MIGRATING and then flushed all of its pending ASTS.  So any in-flight
+ * requests either got queued before the MIGRATING flag got set, in which
+ * case the lock data will reflect the change and a return message is on
+ * the way, or the request failed to get in before MIGRATING got set.  In
+ * this case, the caller will be told to spin and wait for the MIGRATING
+ * flag to be dropped, then recheck the master.
+ * This holds true for the convert, cancel and unlock cases, and since lvb
+ * updates are tied to these same messages, it applies to lvb updates as
+ * well.  For the lock case, there is no way a lock can be on the master
+ * queue and not be on the secondary queue since the lock is always added
+ * locally first.  This means that the new target node will never be sent
+ * a lock that he doesn't already have on the list.
+ * In total, this means that the local lock is correct and should not be
+ * updated to match the one sent by the master.  Any messages sent back
+ * from the master before the MIGRATING flag will bring the lock properly
+ * up-to-date, and the change will be ordered properly for the waiter.
+ * We will *not* attempt to modify the lock underneath the waiter.
+ */
+
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres)
+{
+	struct dlm_migratable_lock *ml;
+	struct list_head *queue;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	int ret = 0;
+	int i;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+
+	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
+	for (i = 0; i < mres->num_locks; i++) {
+		ml = &(mres->ml[i]);
+		BUG_ON(ml->highest_blocked != LKM_IVMODE);
+		newlock = NULL;
+		lksb = NULL;
+
+		queue = dlm_list_num_to_pointer(res, ml->list);
+
+		/* if the lock is for the local node it needs to
+		 * be moved to the proper location within the queue.
+		 * do not allocate a new lock structure. */
+		if (ml->node == dlm->node_num) {
+			/* MIGRATION ONLY! */
+			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+
+			spin_lock(&res->spinlock);
+			list_for_each(iter, queue) {
+				lock = list_entry(iter, struct dlm_lock, list);
+				if (lock->ml.cookie != ml->cookie)
+					lock = NULL;
+				else
+					break;
+			}
+
+			/* lock is always created locally first, and
+			 * destroyed locally last.  it must be on the list */
+			if (!lock) {
+				mlog(ML_ERROR, "could not find local lock "
+					       "with cookie %"MLFu64"!\n",
+				     ml->cookie);
+				BUG();
+			}
+			BUG_ON(lock->ml.node != ml->node);
+
+			/* see NOTE above about why we do not update
+			 * to match the master here */
+
+			/* move the lock to its proper place */
+			/* do not alter lock refcount.  switching lists. */
+			list_del_init(&lock->list);
+			list_add_tail(&lock->list, queue);
+			spin_unlock(&res->spinlock);
+
+			mlog(0, "just reordered a local lock!\n");
+			continue;
+		}
+
+		/* lock is for another node. */
+		newlock = dlm_new_lock(ml->type, ml->node,
+				       be64_to_cpu(ml->cookie), NULL);
+		if (!newlock) {
+			ret = -ENOMEM;
+			goto leave;
+		}
+		lksb = newlock->lksb;
+		dlm_lock_attach_lockres(newlock, res);
+
+		if (ml->convert_type != LKM_IVMODE) {
+			BUG_ON(queue != &res->converting);
+			newlock->ml.convert_type = ml->convert_type;
+		}
+		lksb->flags |= (ml->flags &
+				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+
+		if (mres->lvb[0]) {
+			if (lksb->flags & DLM_LKSB_PUT_LVB) {
+				/* other node was trying to update
+				 * lvb when node died.  recreate the
+				 * lksb with the updated lvb. */
+				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+			} else {
+				/* otherwise, the node is sending its 
+				 * most recent valid lvb info */
+				BUG_ON(ml->type != LKM_EXMODE &&
+				       ml->type != LKM_PRMODE);
+				if (res->lvb[0] && (ml->type == LKM_EXMODE ||
+				    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+					mlog(ML_ERROR, "received bad lvb!\n");
+					__dlm_print_one_lock_resource(res);
+					BUG();
+				}
+				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
+			}
+		}
+
+
+		/* NOTE:
+		 * wrt lock queue ordering and recovery:
+		 *    1. order of locks on granted queue is
+		 *       meaningless.
+		 *    2. order of locks on converting queue is
+		 *       LOST with the node death.  sorry charlie.
+		 *    3. order of locks on the blocked queue is
+		 *       also LOST.
+		 * order of locks does not affect integrity, it
+		 * just means that a lock request may get pushed
+		 * back in line as a result of the node death.
+		 * also note that for a given node the lock order
+		 * for its secondary queue locks is preserved
+		 * relative to each other, but clearly *not*
+		 * preserved relative to locks from other nodes.
+		 */
+		spin_lock(&res->spinlock);
+		dlm_lock_get(newlock);
+		list_add_tail(&newlock->list, queue);
+		spin_unlock(&res->spinlock);
+	}
+	mlog(0, "done running all the locks\n");
+
+leave:
+	if (ret < 0) {
+		mlog_errno(ret);
+		if (newlock)
+			dlm_lock_put(newlock);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue, *iter, *iter2;
+	struct dlm_lock *lock;
+
+	res->state |= DLM_LOCK_RES_RECOVERING;
+	if (!list_empty(&res->recovering))
+		list_del_init(&res->recovering);
+	list_add_tail(&res->recovering, &dlm->reco.resources);
+
+	/* find any pending locks and put them back on proper list */
+	for (i = DLM_BLOCKED_LIST; i >= DLM_GRANTED_LIST; i--) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each_safe(iter, iter2, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			dlm_lock_get(lock);
+			if (lock->convert_pending) {
+				/* move converting lock back to granted */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with convert pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_revert_pending_convert(res, lock);
+				lock->convert_pending = 0;
+			} else if (lock->lock_pending) {
+				/* remove pending lock requests completely */
+				BUG_ON(i != DLM_BLOCKED_LIST);
+				mlog(0, "node died with lock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				/* lock will be floating until ref in
+				 * dlmlock_remote is freed after the network
+				 * call returns.  ok for it to not be on any
+				 * list since no ast can be called
+				 * (the master is dead). */
+				dlm_revert_pending_lock(res, lock);
+				lock->lock_pending = 0;
+			} else if (lock->unlock_pending) {
+				/* if an unlock was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master.  note that the dlm_unlock
+				 * call is still responsible for calling
+				 * the unlockast.  that will happen after
+				 * the network call times out.  for now,
+				 * just move lists to prepare the new
+				 * recovery master.  */
+				BUG_ON(i != DLM_GRANTED_LIST);
+				mlog(0, "node died with unlock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_unlock(res, lock);
+				lock->unlock_pending = 0;
+			} else if (lock->cancel_pending) {
+				/* if a cancel was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with cancel pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_cancel(res, lock);
+				lock->cancel_pending = 0;
+			}
+			dlm_lock_put(lock);
+		}
+	}
+}
+
+
+
+/* removes all recovered locks from the recovery list.
+ * sets the res->owner to the new master.
+ * unsets the RECOVERY flag and wakes waiters. */
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master)
+{
+	int i;
+	struct list_head *iter, *iter2, *bucket;
+	struct dlm_lock_resource *res;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+
+	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
+		res = list_entry(iter, struct dlm_lock_resource, recovering);
+		if (res->owner == dead_node) {
+			list_del_init(&res->recovering);
+			spin_lock(&res->spinlock);
+			dlm_change_lockres_owner(dlm, res, new_master);
+			res->state &= ~DLM_LOCK_RES_RECOVERING;
+			__dlm_dirty_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			wake_up(&res->wq);
+		}
+	}
+
+	/* this will become unnecessary eventually, but
+	 * for now we need to run the whole hash, clear
+	 * the RECOVERING state and set the owner
+	 * if necessary */
+	for (i = 0; i < DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			if (res->state & DLM_LOCK_RES_RECOVERING) {
+				if (res->owner == dead_node) {
+					mlog(0, "(this=%u) res %.*s was not "
+					     "on recovering list, but "
+					     "clearing state anyway "
+					     "(new master %u)\n",
+					     dlm->node_num, res->lockname.len,
+					     res->lockname.name, new_master);
+				} else if (res->owner == dlm->node_num) {
+					mlog(0, "(this=%u) res %.*s was not "
+					     "on recovering list, owner is "
+					     "THIS node, clearing "
+					     "(new master %u)\n",
+					     dlm->node_num, res->lockname.len,
+					     res->lockname.name, new_master);
+				} else
+					continue;
+
+				spin_lock(&res->spinlock);
+				dlm_change_lockres_owner(dlm, res, new_master);
+				res->state &= ~DLM_LOCK_RES_RECOVERING;
+				__dlm_dirty_lockres(dlm, res);
+				spin_unlock(&res->spinlock);
+				wake_up(&res->wq);
+			}
+		}
+	}
+}
+
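+/* Invalidation rule, summarized: when this node masters the lockres,
+ * blank the lvb if the *dead* node held EX (its last write may be
+ * lost); on a secondary lockres, blank it unless *this* node holds EX
+ * or PR, since only those modes guarantee the cached lvb is current. */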
+static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
+{
+	if (local) {
+		if (lock->ml.type != LKM_EXMODE &&
+		    lock->ml.type != LKM_PRMODE)
+			return 1;
+	} else if (lock->ml.type == LKM_EXMODE)
+		return 1;
+	return 0;
+}
+
+static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *iter, *queue;
+	struct dlm_lock *lock;
+	int blank_lvb = 0, local = 0;
+	int i;
+	u8 search_node;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (res->owner == dlm->node_num) {
+		/* if this node owned the lockres, and if the dead node
+		 * had an EX when he died, blank out the lvb */
+		search_node = dead_node;
+	} else {
+		/* if this is a secondary lockres, and we had no EX or PR
+		 * locks granted, we can no longer trust the lvb */
+		search_node = dlm->node_num;
+		local = 1;  /* check local state for valid lvb */
+	}
+
+	for (i = DLM_GRANTED_LIST; i <= DLM_CONVERTING_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node == search_node) {
+				if (dlm_lvb_needs_invalidation(lock, local)) {
+					/* zero the lksb lvb and lockres lvb */
+					blank_lvb = 1;
+					memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
+				}
+			}
+		}
+	}
+
+	if (blank_lvb) {
+		mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
+		     res->lockname.len, res->lockname.name, dead_node);
+		memset(res->lvb, 0, DLM_LVB_LEN);
+	}
+}
+
+static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *iter, *tmpiter;
+	struct dlm_lock *lock;
+
+	/* this node is the lockres master:
+	 * 1) remove any stale locks for the dead node
+	 * 2) if the dead node had an EX when he died, blank out the lvb 
+	 */
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* TODO: check pending_asts, pending_basts here */
+	list_for_each_safe(iter, tmpiter, &res->granted) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+	list_for_each_safe(iter, tmpiter, &res->converting) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+	list_for_each_safe(iter, tmpiter, &res->blocked) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+
+	/* do not kick thread yet */
+	__dlm_dirty_lockres(dlm, res);
+}
+
+/* if this node is the recovery master, and there are no
+ * locks for a given lockres owned by this node that are in
+ * either PR or EX mode, zero out the lvb before requesting.
+ *
+ */
+
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter;
+	struct dlm_lock_resource *res;
+	int i;
+	struct list_head *bucket;
+
+	/* purge any stale mles */
+	dlm_clean_master_list(dlm, dead_node);
+
+	/*
+	 * now clean up all lock resources.  there are two rules:
+	 *
+	 * 1) if the dead node was the master, move the lockres
+	 *    to the recovering list.  set the RECOVERING flag.
+	 *    this lockres needs to be cleaned up before it can
+	 *    be used further.
+	 *
+	 * 2) if this node was the master, remove all locks from
+	 *    each of the lockres queues that were owned by the
+	 *    dead node.  once recovery finishes, the dlm thread
+	 *    can be kicked again to see if any ASTs or BASTs
+	 *    need to be fired as a result.
+	 */
+	for (i = 0; i < DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			if (dlm_is_recovery_lock(res->lockname.name,
+						 res->lockname.len))
+				continue;
+
+			spin_lock(&res->spinlock);
+			/* zero the lvb if necessary */
+			dlm_revalidate_lvb(dlm, res, dead_node);
+			if (res->owner == dead_node)
+				dlm_move_lockres_to_recovery_list(dlm, res);
+			else if (res->owner == dlm->node_num) {
+				dlm_free_dead_locks(dlm, res, dead_node);
+				__dlm_lockres_calc_usage(dlm, res);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+}
+
+static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	/* check to see if the node is already considered dead */
+	if (!test_bit(idx, dlm->live_nodes_map)) {
+		mlog(0, "for domain %s, node %d is already dead. "
+		     "another node likely did recovery already.\n",
+		     dlm->name, idx);
+		return;
+	}
+
+	/* check to see if we do not care about this node */
+	if (!test_bit(idx, dlm->domain_map)) {
+		/* This also catches the case that we get a node down
+		 * but haven't joined the domain yet. */
+		mlog(0, "node %u already removed from domain!\n", idx);
+		return;
+	}
+
+	clear_bit(idx, dlm->live_nodes_map);
+
+	/* Clean up join state on node death. */
+	if (dlm->joining_node == idx) {
+		mlog(0, "Clearing join state for node %u\n", idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	}
+
+	/* make sure local cleanup occurs before the heartbeat events */
+	if (!test_bit(idx, dlm->recovery_map))
+		dlm_do_local_recovery_cleanup(dlm, idx);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 0);
+
+	mlog(0, "node %u being removed from domain map!\n", idx);
+	clear_bit(idx, dlm->domain_map);
+	/* wake up migration waiters if a node goes down.
+	 * perhaps later we can genericize this for other waiters. */
+	wake_up(&dlm->migration_wq);
+
+	if (test_bit(idx, dlm->recovery_map))
+		mlog(0, "domain %s, node %u already added "
+		     "to recovery map!\n", dlm->name, idx);
+	else
+		set_bit(idx, dlm->recovery_map);
+}
+
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+	__dlm_hb_node_down(dlm, idx);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+
+	set_bit(idx, dlm->live_nodes_map);
+
+	/* notify any mles attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 1);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+static void dlm_reco_ast(void *astdata)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "ast for recovery lock fired! this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_bast(void *astdata, int blocked_type)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "bast for recovery lock fired! this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
+{
+	mlog(0, "unlockast for recovery lock fired!\n");
+}
+
+
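+/* The recovery master is elected by racing for an EX lock on the
+ * special DLM_RECOVERY_LOCK_NAME resource with LKM_NOQUEUE: exactly
+ * one node gets DLM_NORMAL and masters recovery; the rest see
+ * DLM_NOTQUEUED and wait for the winner's begin-reco message. */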
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
+{
+	enum dlm_status ret;
+	struct dlm_lockstatus lksb;
+	int status = -EINVAL;
+
+	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
+	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
+retry:
+	memset(&lksb, 0, sizeof(lksb));
+
+	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
+		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+	if (ret == DLM_NORMAL) {
+		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* I am master, send message to all nodes saying
+		 * that I am beginning a recovery session */
+		status = dlm_send_begin_reco_message(dlm,
+					      dlm->reco.dead_node);
+
+		/* recovery lock is a special case.  ast will not get fired,
+		 * so just go ahead and unlock it. */
+		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
+		if (ret != DLM_NORMAL) {
+			/* this would really suck. this could only happen
+			 * if there was a network error during the unlock
+			 * because of node death.  this means the unlock
+			 * is actually "done" and the lock structure is
+			 * even freed.  we can continue, but only
+			 * because this specific lock name is special. */
+			mlog(0, "dlmunlock returned %d\n", ret);
+		}
+
+		if (status < 0) {
+			mlog(0, "failed to send recovery message. "
+				   "must retry with new node map.\n");
+			goto retry;
+		}
+	} else if (ret == DLM_NOTQUEUED) {
+		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* another node is master. wait on
+		 * reco.new_master != O2NM_INVALID_NODE_NUM */
+		status = -EEXIST;
+	}
+
+	return status;
+}
+
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_begin_reco br;
+	int ret = 0;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog_entry("%u\n", dead_node);
+
+	mlog(0, "dead node is %u\n", dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dead_node, iter.node_map);
+
+	memset(&br, 0, sizeof(br));
+	br.node_idx = dlm->node_num;
+	br.dead_node = dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = 0;
+		if (nodenum == dead_node) {
+			mlog(0, "not sending begin reco to dead node "
+				  "%u\n", dead_node);
+			continue;
+		}
+		if (nodenum == dlm->node_num) {
+			mlog(0, "not sending begin reco to self\n");
+			continue;
+		}
+
+		mlog(0, "attempting to send begin reco msg to %d\n",
+			  nodenum);
+		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
+					 &br, sizeof(br), nodenum, &status);
+		/* negative status is handled ok by caller here */
+		if (ret >= 0)
+			ret = status;
+		if (ret < 0) {
+			struct dlm_lock_resource *res;
+			mlog_errno(ret);
+			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
+			    "returned %d\n", dlm->name, nodenum, ret);
+			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+						 DLM_RECOVERY_LOCK_NAME_LEN);
+			if (res) {
+				dlm_print_one_lock_resource(res);
+				dlm_lockres_put(res);
+			} else {
+				mlog(ML_ERROR, "recovery lock not found\n");
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	mlog(0, "node %u wants to recover node %u\n",
+		  br->node_idx, br->dead_node);
+
+	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+		mlog(0, "new_master already set to %u!\n",
+			  dlm->reco.new_master);
+	}
+	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
+		mlog(0, "dead_node already set to %u!\n",
+			  dlm->reco.dead_node);
+	}
+	dlm->reco.new_master = br->node_idx;
+	dlm->reco.dead_node = br->dead_node;
+	if (!test_bit(br->dead_node, dlm->recovery_map)) {
+		mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
+		     "node has not yet.  marking %u as dead\n",
+		     br->node_idx, br->dead_node, br->dead_node);
+		__dlm_hb_node_down(dlm, br->dead_node);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_kick_recovery_thread(dlm);
+	dlm_put(dlm);
+	return 0;
+}
+
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+	struct dlm_finalize_reco fr;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog(0, "finishing recovery for node %s:%u\n",
+	     dlm->name, dlm->reco.dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	memset(&fr, 0, sizeof(fr));
+	fr.node_idx = dlm->node_num;
+	fr.dead_node = dlm->reco.dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
+					 &fr, sizeof(fr), nodenum, &status);
+		if (ret >= 0) {
+			ret = status;
+			if (dlm_is_host_down(ret)) {
+				/* this has no effect on this recovery 
+				 * session, so set the status to zero to 
+				 * finish out the last recovery */
+				mlog(ML_ERROR, "node %u went down after this "
+				     "node finished recovery.\n", nodenum);
+				ret = 0;
+			}
+		}
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	mlog(0, "node %u finalizing recovery of node %u\n",
+	     fr->node_idx, fr->dead_node);
+
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->reco.new_master != fr->node_idx) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
+		     "%u is supposed to be the new master, dead=%u\n",
+		     fr->node_idx, dlm->reco.new_master, fr->dead_node);
+		BUG();
+	}
+	if (dlm->reco.dead_node != fr->dead_node) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
+		     "node %u, but node %u is supposed to be dead\n",
+		     fr->node_idx, fr->dead_node, dlm->reco.dead_node);
+		BUG();
+	}
+
+	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_reset_recovery(dlm);
+
+	dlm_kick_recovery_thread(dlm);
+	dlm_put(dlm);
+	return 0;
+}

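[Editor's note -- not part of the patch.]  dlm_pick_recovery_master()
above implements a simple election: every live node races for an
LKM_NOQUEUE|LKM_RECOVERY exclusive lock on the well-known
DLM_RECOVERY_LOCK_NAME resource; the winner broadcasts
DLM_BEGIN_RECO_MSG, while the losers get DLM_NOTQUEUED and wait for
reco.new_master to be filled in by dlm_begin_reco_handler().  A
minimal sketch of a caller's side, with the wrapper name purely
hypothetical:

	/* hypothetical wrapper -- illustrates the return values only */
	static void example_reco_election(struct dlm_ctxt *dlm)
	{
		int status = dlm_pick_recovery_master(dlm);

		if (status == -EEXIST) {
			/* lost the race: another node holds the
			 * recovery lock; wait for reco.new_master */
			return;
		}
		if (status >= 0) {
			/* won the race: this node is now recovery
			 * master for reco.dead_node */
		}
	}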
+ 692 - 0
fs/ocfs2/dlm/dlmthread.c

@@ -0,0 +1,692 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
+#include "cluster/masklog.h"
+
+static int dlm_thread(void *data);
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm);
+
+#define dlm_lock_is_remote(dlm, lock)     ((lock)->ml.node != (dlm)->node_num)
+
+/* will exit holding res->spinlock, but may drop in function */
+/* waits until flags are cleared on res->state */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	assert_spin_locked(&res->spinlock);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & flags) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
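+/* (editor's note, not in the original patch)  The open-coded wait
+ * above is typically reached through the __dlm_wait_on_lockres()
+ * wrapper with res->spinlock already held -- see its use in
+ * dlmunlock.c below:
+ *
+ *	spin_lock(&res->spinlock);
+ *	__dlm_wait_on_lockres(res);
+ */
+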
+
+static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+	if (list_empty(&res->granted) &&
+	    list_empty(&res->converting) &&
+	    list_empty(&res->blocked) &&
+	    list_empty(&res->dirty))
+		return 1;
+	return 0;
+}
+
+
+/* Call whenever you may have added or deleted something from one of
+ * the lockres queues. This will figure out whether it belongs on the
+ * unused list or not and does the appropriate thing. */
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (__dlm_lockres_unused(res)) {
+		if (list_empty(&res->purge)) {
+			mlog(0, "putting lockres %.*s onto purge list\n",
+			     res->lockname.len, res->lockname.name);
+
+			res->last_used = jiffies;
+			list_add_tail(&res->purge, &dlm->purge_list);
+			dlm->purge_count++;
+		}
+	} else if (!list_empty(&res->purge)) {
+		mlog(0, "removing lockres %.*s from purge list\n",
+		     res->lockname.len, res->lockname.name);
+
+		list_del_init(&res->purge);
+		dlm->purge_count--;
+	}
+}
+
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	spin_lock(&dlm->spinlock);
+	spin_lock(&res->spinlock);
+
+	__dlm_lockres_calc_usage(dlm, res);
+
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
+ * to do migration, but will re-acquire before exit. */
+void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
+{
+	int master;
+	int ret;
+
+	spin_lock(&lockres->spinlock);
+	master = lockres->owner == dlm->node_num;
+	spin_unlock(&lockres->spinlock);
+
+	mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
+	     lockres->lockname.name, master);
+
+	/* Non master is the easy case -- no migration required, just
+	 * quit. */
+	if (!master)
+		goto finish;
+
+	/* Wheee! Migrate lockres here! */
+	spin_unlock(&dlm->spinlock);
+again:
+
+	ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
+	if (ret == -ENOTEMPTY) {
+		mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
+		     lockres->lockname.len, lockres->lockname.name);
+
+		BUG();
+	} else if (ret < 0) {
+		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
+		     lockres->lockname.len, lockres->lockname.name);
+		goto again;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+finish:
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
+static void dlm_run_purge_list(struct dlm_ctxt *dlm,
+			       int purge_now)
+{
+	unsigned int run_max, unused;
+	unsigned long purge_jiffies;
+	struct dlm_lock_resource *lockres;
+
+	spin_lock(&dlm->spinlock);
+	run_max = dlm->purge_count;
+
+	while (run_max && !list_empty(&dlm->purge_list)) {
+		run_max--;
+
+		lockres = list_entry(dlm->purge_list.next,
+				     struct dlm_lock_resource, purge);
+
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting any more
+		 * refs on it -- there's no need to keep the lockres
+		 * spinlock. */
+		spin_lock(&lockres->spinlock);
+		unused = __dlm_lockres_unused(lockres);
+		spin_unlock(&lockres->spinlock);
+
+		if (!unused)
+			continue;
+
+		purge_jiffies = lockres->last_used +
+			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
+
+		/* Make sure that we want to be processing this guy at
+		 * this time. */
+		if (!purge_now && time_after(purge_jiffies, jiffies)) {
+			/* Since resources are added to the purge list
+			 * in tail order, we can stop at the first
+			 * unpurgeable resource -- anything added after
+			 * it will have a greater last_used value */
+			break;
+		}
+
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+
+		/* This may drop and reacquire the dlm spinlock if it
+		 * has to do migration. */
+		mlog(0, "calling dlm_purge_lockres!\n");
+		dlm_purge_lockres(dlm, lockres);
+		mlog(0, "DONE calling dlm_purge_lockres!\n");
+
+		/* Avoid adding any scheduling latencies */
+		cond_resched_lock(&dlm->spinlock);
+	}
+
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	struct dlm_lock *lock, *target;
+	struct list_head *iter;
+	struct list_head *head;
+	int can_grant = 1;
+
+	//mlog(0, "res->lockname.len=%d\n", res->lockname.len);
+	//mlog(0, "res->lockname.name=%p\n", res->lockname.name);
+	//mlog(0, "shuffle res %.*s\n", res->lockname.len,
+	//	  res->lockname.name);
+
+	/* because this function is called with the lockres
+	 * spinlock, and because we know that it is not migrating/
+	 * recovering/in-progress, it is fine to reserve asts and
+	 * basts right before queueing them all throughout */
+	assert_spin_locked(&res->spinlock);
+	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
+			      DLM_LOCK_RES_RECOVERING|
+			      DLM_LOCK_RES_IN_PROGRESS)));
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
+	     res->lockname.name);
+
+	target = list_entry(res->converting.next, struct dlm_lock, list);
+	if (target->ml.convert_type == LKM_IVMODE) {
+		mlog(ML_ERROR, "%.*s: converting a lock with no "
+		     "convert_type!\n", res->lockname.len, res->lockname.name);
+		BUG();
+	}
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			/* queue the BAST if not already */
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			/* update the highest_blocked if needed */
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+
+	/* we can convert the lock */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "calling ast for converting lock: %.*s, have: %d, "
+		     "granting: %d, node: %u\n", res->lockname.len,
+		     res->lockname.name, target->ml.type,
+		     target->ml.convert_type, target->ml.node);
+
+		target->ml.type = target->ml.convert_type;
+		target->ml.convert_type = LKM_IVMODE;
+		list_del_init(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked))
+		goto leave;
+	target = list_entry(res->blocked.next, struct dlm_lock, list);
+
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	/* we can grant the blocked lock (only
+	 * possible if converting list empty) */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
+		     "node: %u\n", res->lockname.len, res->lockname.name,
+		     target->ml.type, target->ml.node);
+
+		/* target->ml.type is already correct */
+		list_del_init(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+leave:
+	return;
+}
+
+/* must have NO locks when calling this with res != NULL */
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	mlog_entry("dlm=%p, res=%p\n", dlm, res);
+	if (res) {
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+		__dlm_dirty_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+	}
+	wake_up(&dlm->dlm_thread_wq);
+}
+
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	mlog_entry("dlm=%p, res=%p\n", dlm, res);
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* don't shuffle secondary queues */
+	if ((res->owner == dlm->node_num) &&
+	    !(res->state & DLM_LOCK_RES_DIRTY)) {
+		list_add_tail(&res->dirty, &dlm->dirty_list);
+		res->state |= DLM_LOCK_RES_DIRTY;
+	}
+}
+
+
+/* Launch the dlm thread for this domain */
+int dlm_launch_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm thread...\n");
+
+	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+	if (IS_ERR(dlm->dlm_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_thread_task));
+		dlm->dlm_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_thread_task) {
+		mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+		kthread_stop(dlm->dlm_thread_task);
+		dlm->dlm_thread_task = NULL;
+	}
+}
+
+static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
+{
+	int empty;
+
+	spin_lock(&dlm->spinlock);
+	empty = list_empty(&dlm->dirty_list);
+	spin_unlock(&dlm->spinlock);
+
+	return empty;
+}
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm)
+{
+	int ret;
+	struct dlm_lock *lock;
+	struct dlm_lock_resource *res;
+	u8 hi;
+
+	spin_lock(&dlm->ast_lock);
+	while (!list_empty(&dlm->pending_asts)) {
+		lock = list_entry(dlm->pending_asts.next,
+				  struct dlm_lock, ast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+		mlog(0, "delivering an ast for this lockres\n");
+
+		BUG_ON(!lock->ast_pending);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->ast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_do_remote_ast(dlm, res, lock);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_ast(dlm, res, lock);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another ast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->ast_list)) {
+			mlog(0, "aha another ast got queued while "
+			     "we were finishing the last one.  will "
+			     "keep the ast_pending flag set.\n");
+		} else
+			lock->ast_pending = 0;
+
+		/* drop the extra ref.
+		 * this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+
+	while (!list_empty(&dlm->pending_basts)) {
+		lock = list_entry(dlm->pending_basts.next,
+				  struct dlm_lock, bast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+
+		BUG_ON(!lock->bast_pending);
+
+		/* get the highest blocked lock, and reset */
+		spin_lock(&lock->spinlock);
+		BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
+		hi = lock->ml.highest_blocked;
+		lock->ml.highest_blocked = LKM_IVMODE;
+		spin_unlock(&lock->spinlock);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->bast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		mlog(0, "delivering a bast for this lockres "
+		     "(blocked = %d\n", hi);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_send_proxy_bast(dlm, res, lock, hi);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_bast(dlm, res, lock, hi);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another bast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->bast_list)) {
+			mlog(0, "aha another bast got queued while "
+			     "we were finishing the last one.  will "
+			     "keep the bast_pending flag set.\n");
+		} else
+			lock->bast_pending = 0;
+
+		/* drop the extra ref.
+		 * this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+	wake_up(&dlm->ast_wq);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
+#define DLM_THREAD_MAX_DIRTY  100
+#define DLM_THREAD_MAX_ASTS   10
+
+static int dlm_thread(void *data)
+{
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		int n = DLM_THREAD_MAX_DIRTY;
+
+		/* dlm_shutting_down is a point-in-time check, but
+		 * that doesn't matter as we'll just loop back around
+		 * if we get false on the leading edge of a state
+		 * transition. */
+		dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
+
+		/* We really don't want to hold dlm->spinlock while
+		 * calling dlm_shuffle_lists on each lockres that
+		 * needs to have its queues adjusted and AST/BASTs
+		 * run.  So let's pull each entry off the dirty_list
+		 * and drop dlm->spinlock ASAP.  Once off the list,
+		 * res->spinlock needs to be taken again to protect
+		 * the queues while calling dlm_shuffle_lists.  */
+		spin_lock(&dlm->spinlock);
+		while (!list_empty(&dlm->dirty_list)) {
+			int delay = 0;
+			res = list_entry(dlm->dirty_list.next,
+					 struct dlm_lock_resource, dirty);
+
+			/* peel a lockres off, remove it from the list,
+			 * unset the dirty flag and drop the dlm lock */
+			BUG_ON(!res);
+			dlm_lockres_get(res);
+
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_DIRTY;
+			list_del_init(&res->dirty);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+
+			/* lockres can be re-dirtied/re-added to the
+			 * dirty_list in this gap, but that is ok */
+
+			spin_lock(&res->spinlock);
+			if (res->owner != dlm->node_num) {
+				__dlm_print_one_lock_resource(res);
+				mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
+				     res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+			}
+			BUG_ON(res->owner != dlm->node_num);
+
+			/* it is now ok to move lockreses in these states
+			 * to the dirty list, assuming that they will only be
+			 * dirty for a short while. */
+			if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
+					  DLM_LOCK_RES_MIGRATING |
+					  DLM_LOCK_RES_RECOVERING)) {
+				/* move it to the tail and keep going */
+				spin_unlock(&res->spinlock);
+				mlog(0, "delaying list shuffling for in-"
+				     "progress lockres %.*s, state=%d\n",
+				     res->lockname.len, res->lockname.name,
+				     res->state);
+				delay = 1;
+				goto in_progress;
+			}
+
+			/* at this point the lockres is not migrating/
+			 * recovering/in-progress.  we have the lockres
+			 * spinlock and do NOT have the dlm lock.
+			 * safe to reserve/queue asts and run the lists. */
+
+			mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
+			     "res=%p\n", dlm, res);
+
+			/* called while holding lockres lock */
+			dlm_shuffle_lists(dlm, res);
+			spin_unlock(&res->spinlock);
+
+			dlm_lockres_calc_usage(dlm, res);
+
+in_progress:
+
+			spin_lock(&dlm->spinlock);
+			/* if the lock was in-progress, stick
+			 * it on the back of the list */
+			if (delay) {
+				spin_lock(&res->spinlock);
+				list_add_tail(&res->dirty, &dlm->dirty_list);
+				res->state |= DLM_LOCK_RES_DIRTY;
+				spin_unlock(&res->spinlock);
+			}
+			dlm_lockres_put(res);
+
+			/* unlikely, but we may need to give time to
+			 * other tasks */
+			if (!--n) {
+				mlog(0, "throttling dlm_thread\n");
+				break;
+			}
+		}
+
+		spin_unlock(&dlm->spinlock);
+		dlm_flush_asts(dlm);
+
+		/* yield and continue right away if there is more work to do */
+		if (!n) {
+			yield();
+			continue;
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_thread_wq,
+						 !dlm_dirty_list_empty(dlm) ||
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM thread\n");
+	return 0;
+}

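[Editor's note -- not part of the patch.]  The thread above is driven
entirely by the dirty list and dlm_kick_thread().  The intended usage
pattern, mirrored by dlm_unlock_lock_handler() in dlmunlock.c below,
is roughly:

	/* after adding/removing locks on res's queues: recompute
	 * whether res belongs on the purge list, then wake the
	 * domain thread so it shuffles the queues and delivers
	 * any queued ASTs/BASTs */
	dlm_lockres_calc_usage(dlm, res);
	dlm_kick_thread(dlm, res);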
+ 672 - 0
fs/ocfs2/dlm/dlmunlock.c

@@ -0,0 +1,672 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmunlock.c
+ *
+ * underlying calls for unlocking locks
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+#define DLM_UNLOCK_FREE_LOCK           0x00000001
+#define DLM_UNLOCK_CALL_AST            0x00000002
+#define DLM_UNLOCK_REMOVE_LOCK         0x00000004
+#define DLM_UNLOCK_REGRANT_LOCK        0x00000008
+#define DLM_UNLOCK_CLEAR_CONVERT_TYPE  0x00000010
+
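+/* (editor's note, not in the original patch)  These bits form an
+ * action mask: dlm_get_cancel_actions() and dlm_get_unlock_actions()
+ * below choose a set of them based on which queue the lock currently
+ * sits on, and dlmunlock_common() then carries the actions out --
+ * remove the lock from its list, re-grant it, clear a pending
+ * convert_type, free it, and/or fire the unlock ast. */
+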
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner);
+
+
+/*
+ * according to the spec:
+ * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+ *
+ *  flags & LKM_CANCEL != 0: must be converting or blocked
+ *  flags & LKM_CANCEL == 0: must be granted
+ *
+ * So to unlock a converting lock, you must first cancel the
+ * convert (passing LKM_CANCEL in flags), then call the unlock
+ * again (with no LKM_CANCEL in flags).
+ */
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         res->spinlock and lock->spinlock taken and dropped
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
+ * all callers should have taken an extra ref on lock coming in
+ */
+static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
+					struct dlm_lock_resource *res,
+					struct dlm_lock *lock,
+					struct dlm_lockstatus *lksb,
+					int flags, int *call_ast,
+					int master_node)
+{
+	enum dlm_status status;
+	int actions = 0;
+	int in_use;
+	u8 owner;
+
+	mlog(0, "master_node = %d, valblk = %d\n", master_node,
+	     flags & LKM_VALBLK);
+
+	if (master_node)
+		BUG_ON(res->owner != dlm->node_num);
+	else
+		BUG_ON(res->owner == dlm->node_num);
+
+	spin_lock(&dlm->spinlock);
+	/* We want to be sure that we're not freeing a lock
+	 * that still has ASTs pending... */
+	in_use = !list_empty(&lock->ast_list);
+	spin_unlock(&dlm->spinlock);
+	if (in_use) {
+		mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
+		     "while waiting for an ast!\n", res->lockname.len,
+		     res->lockname.name);
+		return DLM_BADPARAM;
+	}
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		if (master_node) {
+			mlog(ML_ERROR, "lockres in progress!\n");
+			spin_unlock(&res->spinlock);
+			return DLM_FORWARD;
+		}
+		/* ok for this to sleep if not in a network handler */
+		__dlm_wait_on_lockres(res);
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_lock(&lock->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+
+	/* see above for what the spec says about
+	 * LKM_CANCEL and the lock queue state */
+	if (flags & LKM_CANCEL)
+		status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
+	else
+		status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
+
+	if (status != DLM_NORMAL)
+		goto leave;
+
+	/* By now this has been masked out of cancel requests. */
+	if (flags & LKM_VALBLK) {
+		/* make the final update to the lvb */
+		if (master_node)
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		else
+			flags |= LKM_PUT_LVB; /* let the send function
+					       * handle it. */
+	}
+
+	if (!master_node) {
+		owner = res->owner;
+		/* drop locks and send message */
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 1;
+		else
+			lock->unlock_pending = 1;
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
+							flags, owner);
+		spin_lock(&res->spinlock);
+		spin_lock(&lock->spinlock);
+		/* if the master told us the lock was already granted,
+		 * let the ast handle all of these actions */
+		if (status == DLM_NORMAL &&
+		    lksb->status == DLM_CANCELGRANT) {
+			actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
+				     DLM_UNLOCK_REGRANT_LOCK|
+				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+		}
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 0;
+		else
+			lock->unlock_pending = 0;
+
+	}
+
+	/* get an extra ref on lock.  if we are just switching
+	 * lists here, we don't want the lock to go away. */
+	dlm_lock_get(lock);
+
+	if (actions & DLM_UNLOCK_REMOVE_LOCK) {
+		list_del_init(&lock->list);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_REGRANT_LOCK) {
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+	}
+	if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
+		mlog(0, "clearing convert_type at %smaster node\n",
+		     master_node ? "" : "non-");
+		lock->ml.convert_type = LKM_IVMODE;
+	}
+
+	/* remove the extra ref on lock */
+	dlm_lock_put(lock);
+
+leave:
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (!dlm_lock_on_list(&res->converting, lock))
+		BUG_ON(lock->ml.convert_type != LKM_IVMODE);
+	else
+		BUG_ON(lock->ml.convert_type == LKM_IVMODE);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* let the caller's final dlm_lock_put handle the actual kfree */
+	if (actions & DLM_UNLOCK_FREE_LOCK) {
+		/* this should always be coupled with list removal */
+		BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
+		mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
+		     lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_CALL_AST)
+		*call_ast = 1;
+
+	/* if cancel or unlock succeeded, lvb work is done */
+	if (status == DLM_NORMAL)
+		lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+
+	return status;
+}
+
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	/* leave DLM_LKSB_PUT_LVB on the lksb so any final
+	 * update of the lvb will be sent to the new master */
+	list_del_init(&lock->list);
+}
+
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+}
+
+
+static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags,
+					  int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
+}
+
+static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags, int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
+}
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
+ */
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner)
+{
+	struct dlm_unlock_lock unlock;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->node_num;
+	unlock.flags = cpu_to_be32(flags);
+	unlock.cookie = lock->ml.cookie;
+	unlock.namelen = res->lockname.len;
+	memcpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_unlock_lock);
+	vec[0].iov_base = &unlock;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					vec, veclen, owner, &status);
+	if (tmpret >= 0) {
+		/* successfully sent and received */
+		if (status == DLM_CANCELGRANT)
+			ret = DLM_NORMAL;
+		else if (status == DLM_FORWARD) {
+			mlog(0, "master was in-progress.  retry\n");
+			ret = DLM_FORWARD;
+		} else
+			ret = status;
+		lksb->status = status;
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			/* NOTE: this seems strange, but it is what we want.
+			 * when the master goes down during a cancel or
+			 * unlock, the recovery code completes the operation
+			 * as if the master had not died, then passes the
+			 * updated state to the recovery master.  this thread
+			 * just needs to finish out the operation and call
+			 * the unlockast. */
+			ret = DLM_NORMAL;
+		} else {
+			/* something bad.  this will BUG in ocfs2 */
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+		lksb->status = ret;
+	}
+
+	return ret;
+}
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
+ *          return value from dlmunlock_master
+ */
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	int found = 0, i;
+	struct dlm_lockstatus *lksb = NULL;
+	int ignore;
+	u32 flags;
+	struct list_head *queue;
+
+	flags = be32_to_cpu(unlock->flags);
+
+	if (flags & LKM_GET_LVB) {
+		mlog(ML_ERROR, "bad args!  GET_LVB specified on unlock!\n");
+		return DLM_BADARGS;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
+		mlog(ML_ERROR, "bad args!  cannot modify lvb on a CANCEL "
+		     "request!\n");
+		return DLM_BADARGS;
+	}
+
+	if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
+		return DLM_IVBUFLEN;
+	}
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
+
+	res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
+	if (!res) {
+		/* We assume here that a missing lock resource simply
+		 * means it was migrated away and destroyed before the
+		 * other node could detect it. */
+		mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
+		status = DLM_FORWARD;
+		goto not_found;
+	}
+
+	queue = &res->granted;
+	found = 0;
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_RECOVERING\n");
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_MIGRATING\n");
+		status = DLM_MIGRATING;
+		goto leave;
+	}
+
+	if (res->owner != dlm->node_num) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_FORWARD -- not master\n");
+		status = DLM_FORWARD;
+		goto leave;
+	}
+
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.cookie == unlock->cookie &&
+			    lock->ml.node == unlock->node_idx) {
+				dlm_lock_get(lock);
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			break;
+		/* scan granted -> converting -> blocked queues */
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	if (!found) {
+		status = DLM_IVLOCKID;
+		goto not_found;
+	}
+
+	/* lock was found on queue */
+	lksb = lock->lksb;
+	/* unlockast only called on originating node */
+	if (flags & LKM_PUT_LVB) {
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
+	}
+
+	/* if this is in-progress, propagate the DLM_FORWARD
+	 * all the way back out */
+	status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
+	if (status == DLM_FORWARD)
+		mlog(0, "lockres is in progress\n");
+
+	if (flags & LKM_PUT_LVB)
+		lksb->flags &= ~DLM_LKSB_PUT_LVB;
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_kick_thread(dlm, res);
+
+not_found:
+	if (!found)
+		mlog(ML_ERROR, "failed to find lock to unlock! "
+			       "cookie=%"MLFu64"\n",
+		     unlock->cookie);
+	else {
+		/* send the lksb->status back to the other node */
+		status = lksb->status;
+		dlm_lock_put(lock);
+	}
+
+leave:
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	if (dlm_lock_on_list(&res->blocked, lock)) {
+		/* cancel this outright */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	} else if (dlm_lock_on_list(&res->converting, lock)) {
+		/* cancel the request, put back on granted */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK |
+			    DLM_UNLOCK_REGRANT_LOCK |
+			    DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+	} else if (dlm_lock_on_list(&res->granted, lock)) {
+		/* too late, already granted.  DLM_CANCELGRANT */
+		lksb->status = DLM_CANCELGRANT;
+		status = DLM_NORMAL;
+		*actions = DLM_UNLOCK_CALL_AST;
+	} else {
+		mlog(ML_ERROR, "lock to cancel is not on any list!\n");
+		lksb->status = DLM_IVLOCKID;
+		status = DLM_IVLOCKID;
+		*actions = 0;
+	}
+	return status;
+}
+
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	/* unlock request */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		lksb->status = DLM_DENIED;
+		status = DLM_DENIED;
+		dlm_error(status);
+		*actions = 0;
+	} else {
+		/* unlock granted lock */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_FREE_LOCK |
+			    DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	}
+	return status;
+}
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
+			  int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res;
+	struct dlm_lock *lock = NULL;
+	int call_ast, is_master;
+
+	mlog_entry_void();
+
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	lock = lksb->lockid;
+	BUG_ON(!lock);
+	dlm_lock_get(lock);
+
+	res = lock->lockres;
+	BUG_ON(!res);
+	dlm_lockres_get(res);
+retry:
+	call_ast = 0;
+	/* need to retry up here because owner may have changed */
+	mlog(0, "lock=%p res=%p\n", lock, res);
+
+	spin_lock(&res->spinlock);
+	is_master = (res->owner == dlm->node_num);
+	spin_unlock(&res->spinlock);
+
+	if (is_master) {
+		status = dlmunlock_master(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_master: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	} else {
+		status = dlmunlock_remote(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_remote: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	}
+
+	if (status == DLM_RECOVERING ||
+	    status == DLM_MIGRATING ||
+	    status == DLM_FORWARD) {
+		/* We want to go away for a tiny bit to allow recovery
+		 * / migration to complete on this resource. I don't
+		 * know of any wait queue we could sleep on as this
+		 * may be happening on another node. Perhaps the
+		 * proper solution is to queue up requests on the
+		 * other end? */
+
+		/* do we want to yield(); ?? */
+		msleep(50);
+
+		mlog(0, "retrying unlock due to pending recovery/"
+		     "migration/in-progress\n");
+		goto retry;
+	}
+
+	if (call_ast) {
+		mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
+		if (is_master) {
+			/* it is possible that there is one last bast
+			 * pending.  make sure it is flushed, then
+			 * call the unlockast.
+			 * not an issue if this lock is mastered
+			 * remotely, since it has been removed from the
+			 * lockres queues and cannot be found. */
+			dlm_kick_thread(dlm, NULL);
+			wait_event(dlm->ast_wq, 
+				   dlm_lock_basts_flushed(dlm, lock));
+		}
+		(*unlockast)(data, lksb->status);
+	}
+
+	if (status == DLM_NORMAL) {
+		mlog(0, "kicking the thread\n");
+		dlm_kick_thread(dlm, res);
+	} else
+		dlm_error(status);
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_lockres_put(res);
+	dlm_lock_put(lock);
+
+	mlog(0, "returning status=%d!\n", status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmunlock);
+

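[Editor's note -- not part of the patch.]  Per the opendlm spec
comment near the top of dlmunlock.c, a lock in the middle of a
convert cannot be unlocked directly: the convert must be cancelled
first.  A minimal sketch, with the unlockast callback name
hypothetical and error handling elided:

	/* 1) cancel the pending convert; DLM_CANCELGRANT in
	 * lksb->status means the convert won before the cancel
	 * arrived -- either way the lock ends up granted */
	ret = dlmunlock(dlm, lksb, LKM_CANCEL, my_unlockast, data);

	/* 2) a plain unlock of the granted lock is now legal */
	ret = dlmunlock(dlm, lksb, 0, my_unlockast, data);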
+ 42 - 0
fs/ocfs2/dlm/dlmver.c

@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
+
+void dlm_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);

+ 31 - 0
fs/ocfs2/dlm/dlmver.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLM_VER_H
+#define DLM_VER_H
+
+void dlm_print_version(void);
+
+#endif /* DLM_VER_H */

+ 658 - 0
fs/ocfs2/dlm/userdlm.c

@@ -0,0 +1,658 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM.
+ *
+ * Many of the functions here are pared down versions of dlmglue.c
+ * functions.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <asm/signal.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static inline int user_check_wait_flag(struct user_lock_res *lockres,
+				       int flag)
+{
+	int ret;
+
+	spin_lock(&lockres->l_lock);
+	ret = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return ret;
+}
+
+static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
+}
+
+static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
+}
+
+/* I heart container_of... */
+static inline struct dlm_ctxt *
+dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return ip->ip_dlm;
+}
+
+static struct inode *
+user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return &ip->ip_vfs_inode;
+}
+
+static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+}
+
+#define user_log_dlm_error(_func, _stat, _lockres) do {		\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int user_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
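+
+/* (editor's note)  The mapping above reads: if another node is
+ * blocked wanting EX we must drop to NL; if it wants PR we may keep
+ * PR; nothing else forces a downconvert, so EX stays allowed. */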
+
+static void user_ast(void *opaque)
+{
+	struct user_lock_res *lockres = opaque;
+	struct dlm_lockstatus *lksb;
+
+	mlog(0, "AST fired for lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
+		     lksb->status, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	/* we're downconverting. */
+	if (lockres->l_requested < lockres->l_level) {
+		if (lockres->l_requested <=
+		    user_highest_compat_lock_level(lockres->l_blocking)) {
+			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_flags &= ~USER_LOCK_BLOCKED;
+		}
+	}
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_flags |= USER_LOCK_ATTACHED;
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	if (!igrab(inode))
+		BUG();
+}
+
+static void user_dlm_unblock_lock(void *opaque);
+
+static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
+{
+	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
+		user_dlm_grab_inode_ref(lockres);
+
+		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
+			  lockres);
+
+		queue_work(user_dlm_worker, &lockres->l_work);
+		lockres->l_flags |= USER_LOCK_QUEUED;
+	}
+}
+
+static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
+{
+	int queue = 0;
+
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
+		return;
+
+	switch (lockres->l_blocking) {
+	case LKM_EXMODE:
+		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+			queue = 1;
+		break;
+	case LKM_PRMODE:
+		if (!lockres->l_ex_holders)
+			queue = 1;
+		break;
+	default:
+		BUG();
+	}
+
+	if (queue)
+		__user_dlm_queue_lockres(lockres);
+}
+
+static void user_bast(void *opaque, int level)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
+		lockres->l_name, level);
+
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= USER_LOCK_BLOCKED;
+	if (level > lockres->l_blocking)
+		lockres->l_blocking = level;
+
+	__user_dlm_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static void user_unlock_ast(void *opaque, enum dlm_status status)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
+
+	if (status != DLM_NORMAL)
+		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
+		lockres->l_level = LKM_IVMODE;
+	else {
+		lockres->l_requested = LKM_IVMODE; /* cancel an
+						    * upconvert
+						    * request. */
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		/* we want the unblock thread to look at it again
+		 * now. */
+		__user_dlm_queue_lockres(lockres);
+	}
+
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	iput(inode);
+}
+
+static void user_dlm_unblock_lock(void *opaque)
+{
+	int new_level, status;
+	struct user_lock_res *lockres = (struct user_lock_res *) opaque;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "processing lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+	BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
+
+	/* notice that we don't clear USER_LOCK_BLOCKED here. That's
+	 * for user_ast to do. */
+	lockres->l_flags &= ~USER_LOCK_QUEUED;
+
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(0, "lock is in teardown so we do nothing\n");
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_BUSY) {
+		mlog(0, "BUSY flag detected...\n");
+		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			spin_unlock(&lockres->l_lock);
+			goto drop_ref;
+		}
+
+		lockres->l_flags |= USER_LOCK_IN_CANCEL;
+		spin_unlock(&lockres->l_lock);
+
+		status = dlmunlock(dlm,
+				   &lockres->l_lksb,
+				   LKM_CANCEL,
+				   user_unlock_ast,
+				   lockres);
+		if (status == DLM_CANCELGRANT) {
+			/* If we got this, then the ast was fired
+			 * before we could cancel. We cleanup our
+			 * state, and restart the function. */
+			spin_lock(&lockres->l_lock);
+			lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+			spin_unlock(&lockres->l_lock);
+		} else if (status != DLM_NORMAL)
+			user_log_dlm_error("dlmunlock", status, lockres);
+		goto drop_ref;
+	}
+
+	/* If there are still incompatible holders, we can exit safely
+	 * without worrying about re-queueing this lock as that will
+	 * happen on the last call to user_dlm_cluster_unlock. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
+			lockres->l_ro_holders, lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	if ((lockres->l_blocking == LKM_PRMODE)
+	    && lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for pr: ex = %u\n",
+			lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	/* yay, we can downconvert now. */
+	new_level = user_highest_compat_lock_level(lockres->l_blocking);
+	lockres->l_requested = new_level;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	mlog(0, "Downconvert lock from %d to %d\n",
+		lockres->l_level, new_level);
+	spin_unlock(&lockres->l_lock);
+
+	/* need lock downconvert request now... */
+	status = dlmlock(dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 LKM_CONVERT|LKM_VALBLK,
+			 lockres->l_name,
+			 user_ast,
+			 lockres,
+			 user_bast);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmlock", status, lockres);
+		user_recover_from_dlm_error(lockres);
+	}
+
+drop_ref:
+	user_dlm_drop_inode_ref(lockres);
+}
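+
+/* (editor's note)  Summarizing the worker above: if a convert is in
+ * flight, try to LKM_CANCEL it; if incompatible holders remain, bail
+ * and let the last user_dlm_cluster_unlock() requeue the work;
+ * otherwise downconvert to the highest level compatible with the
+ * blocking request. */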
+
+static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch (level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int
+user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
+				  int wanted)
+{
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+
+	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
+}
+
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags)
+{
+	int status, local_flags;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n",
+		     lockres->l_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
+		lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
+		lkm_flags);
+
+again:
+	if (signal_pending(current)) {
+		status = -ERESTARTSYS;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if ((lockres->l_flags & USER_LOCK_BUSY) &&
+	    (level > lockres->l_level)) {
+		/* is someone sitting in dlmlock()? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
+	    (!user_may_continue_on_blocked_lock(lockres, level))) {
+		/* the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		local_flags = lkm_flags | LKM_VALBLK;
+		if (lockres->l_level != LKM_IVMODE)
+			local_flags |= LKM_CONVERT;
+
+		lockres->l_requested = level;
+		lockres->l_flags |= USER_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, get lock from %d to level = %d\n",
+			lockres->l_name, lockres->l_level, level);
+
+		/* call dlmlock to upgrade the lock now */
+		status = dlmlock(dlm,
+				 level,
+				 &lockres->l_lksb,
+				 local_flags,
+				 lockres->l_name,
+				 user_ast,
+				 lockres,
+				 user_bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				status = -EAGAIN;
+			else {
+				user_log_dlm_error("dlmlock", status, lockres);
+				status = -EINVAL;
+			}
+			user_recover_from_dlm_error(lockres);
+			goto bail;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+			lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	user_dlm_inc_holders(lockres, level);
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	status = 0;
+bail:
+	return status;
+}
+
+static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level)
+{
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
+		return;
+	}
+
+	mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	spin_lock(&lockres->l_lock);
+	user_dlm_dec_holders(lockres, level);
+	__user_dlm_cond_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_EXMODE);
+	memcpy(lvb, val, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_PRMODE);
+	memcpy(val, lvb, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry)
+{
+	memset(lockres, 0, sizeof(*lockres));
+
+	spin_lock_init(&lockres->l_lock);
+	init_waitqueue_head(&lockres->l_event);
+	lockres->l_level = LKM_IVMODE;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_blocking = LKM_IVMODE;
+
+	/* should have been checked before getting here. */
+	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
+
+	memcpy(lockres->l_name,
+	       dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
+int user_dlm_destroy_lock(struct user_lock_res *lockres)
+{
+	int status = -EBUSY;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "asked to destroy %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+	while (lockres->l_flags & USER_LOCK_BUSY) {
+		spin_unlock(&lockres->l_lock);
+
+		mlog(0, "lock %s is busy\n", lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+
+		spin_lock(&lockres->l_lock);
+	}
+
+	if (lockres->l_ro_holders || lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s has holders\n", lockres->l_name);
+		goto bail;
+	}
+
+	status = 0;
+	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s is not attached\n", lockres->l_name);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~USER_LOCK_ATTACHED;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "unlocking lockres %s\n", lockres->l_name);
+	status = dlmunlock(dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   user_unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmunlock", status, lockres);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	user_wait_on_busy_lock(lockres);
+
+	status = 0;
+bail:
+	return status;
+}
+
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
+{
+	struct dlm_ctxt *dlm;
+	u32 dlm_key;
+	char *domain;
+
+	domain = kmalloc(name->len + 1, GFP_KERNEL);
+	if (!domain) {
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dlm_key = crc32_le(0, name->name, name->len);
+
+	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+
+	dlm = dlm_register_domain(domain, dlm_key);
+	if (IS_ERR(dlm))
+		mlog_errno(PTR_ERR(dlm));
+
+	kfree(domain);
+	return dlm;
+}
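
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * how a dlmfs directory name becomes a dlm domain key.  This is a
 * plain-C approximation of the kernel's crc32_le(0, buf, len) --
 * LSB-first CRC-32, polynomial 0xEDB88320, seed used as-is, no final
 * inversion.  The domain name below is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_le_sketch(uint32_t crc, const unsigned char *p,
				size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u
					: (crc >> 1);
	}
	return crc;
}

int main(void)
{
	const char *domain = "mydomain";	/* hypothetical domain name */

	printf("dlm_key for \"%s\" = 0x%08x\n", domain,
	       crc32_le_sketch(0, (const unsigned char *)domain,
			       strlen(domain)));
	return 0;
}
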
+
+void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain(dlm);
+}

+ 111 - 0
fs/ocfs2/dlm/userdlm.h

@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.h
+ *
+ * Userspace dlm defines
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef USERDLM_H
+#define USERDLM_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* user_lock_res->l_flags flags. */
+#define USER_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					      * downconvert */
+#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
+					      * destroying this
+					      * lock. */
+#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
+					      * workqueue */
+#define USER_LOCK_IN_CANCEL     (0x00000020)
+
+struct user_lock_res {
+	spinlock_t               l_lock;
+
+	int                      l_flags;
+
+#define USER_DLM_LOCK_ID_MAX_LEN  32
+	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct work_struct       l_work;
+};
+
+extern struct workqueue_struct *user_dlm_worker;
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry);
+int user_dlm_destroy_lock(struct user_lock_res *lockres);
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags);
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level);
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len);
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len);
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
+void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+
+struct dlmfs_inode_private {
+	struct dlm_ctxt             *ip_dlm;
+
+	struct user_lock_res ip_lockres; /* unused for directories. */
+	struct inode         *ip_parent;
+
+	struct inode         ip_vfs_inode;
+};
+
+static inline struct dlmfs_inode_private *
+DLMFS_I(struct inode *inode)
+{
+	return container_of(inode,
+			    struct dlmfs_inode_private,
+			    ip_vfs_inode);
+}
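
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the container_of() pattern DLMFS_I() relies on -- recover a pointer
 * to the enclosing private structure from a pointer to one of its
 * embedded members.  Standalone version with stand-in types.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of_sketch(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_inode { unsigned long i_ino; };

struct fake_private {
	int               ip_data;
	struct fake_inode ip_vfs_inode;	/* embedded, like ip_vfs_inode */
};

int main(void)
{
	struct fake_private priv = { .ip_data = 42 };
	struct fake_inode *inode = &priv.ip_vfs_inode;

	/* Walk back from the embedded member to its container. */
	struct fake_private *p =
		container_of_sketch(inode, struct fake_private, ip_vfs_inode);

	printf("ip_data = %d\n", p->ip_data);	/* prints 42 */
	return 0;
}
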
+
+struct dlmfs_filp_private {
+	int                  fp_lock_level;
+};
+
+#define DLMFS_MAGIC	0x76a9f425
+
+#endif /* USERDLM_H */
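
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * how userspace is expected to drive these locks through dlmfs, per
 * the conventions in this merge's Documentation/filesystems/dlmfs.txt:
 * mkdir creates a domain, O_RDONLY takes a PR lock, O_RDWR an EX lock
 * (add O_NONBLOCK for a trylock), read()/write() move the lock value
 * block, and close() drops the lock.  The mount point, names and the
 * 64-byte LVB size are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char lvb[64];

	mkdir("/dlm/mydomain", 0755);		/* register a domain */

	/* O_RDWR maps to LKM_EXMODE in userdlm terms. */
	int fd = open("/dlm/mydomain/mylock", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* EX holders may update the LVB; later readers will see it. */
	snprintf(lvb, sizeof(lvb), "held by pid %d", (int)getpid());
	if (write(fd, lvb, strlen(lvb) + 1) < 0)
		perror("write");

	close(fd);				/* drops the cluster lock */
	return 0;
}
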

+ 2904 - 0
fs/ocfs2/dlmglue.c

@@ -0,0 +1,2904 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/crc32.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_DLM_GLUE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "slot_map.h"
+#include "super.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_mask_waiter {
+	struct list_head	mw_item;
+	int			mw_status;
+	struct completion	mw_complete;
+	unsigned long		mw_mask;
+	unsigned long		mw_goal;
+};
+
+static void ocfs2_inode_ast_func(void *opaque);
+static void ocfs2_inode_bast_func(void *opaque,
+				  int level);
+static void ocfs2_super_ast_func(void *opaque);
+static void ocfs2_super_bast_func(void *opaque,
+				  int level);
+static void ocfs2_rename_ast_func(void *opaque);
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level);
+
+/* so far, all locks have gotten along with the same unlock ast */
+static void ocfs2_unlock_ast_func(void *opaque,
+				  enum dlm_status status);
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue);
+static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue);
+typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker);
+
+struct ocfs2_lock_res_ops {
+	void (*ast)(void *);
+	void (*bast)(void *, int);
+	void (*unlock_ast)(void *, enum dlm_status);
+	int  (*unblock)(struct ocfs2_lock_res *, int *);
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_inode_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_meta,
+};
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking);
+
+static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_data,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+	.ast		= ocfs2_super_ast_func,
+	.bast		= ocfs2_super_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+	.ast		= ocfs2_rename_ast_func,
+	.bast		= ocfs2_rename_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
+		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
+		lockres->l_type == OCFS2_LOCK_TYPE_RW;
+}
+
+static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
+}
+
+static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
+}
+
+static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_super_lock(lockres)
+	       && !ocfs2_is_rename_lock(lockres));
+
+	return (struct ocfs2_super *) lockres->l_priv;
+}
+
+static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	return (struct inode *) lockres->l_priv;
+}
+
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags);
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted);
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level);
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert);
+#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres);
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh);
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level);
+
+static char *ocfs2_lock_type_strings[] = {
+	[OCFS2_LOCK_TYPE_META] = "Meta",
+	[OCFS2_LOCK_TYPE_DATA] = "Data",
+	[OCFS2_LOCK_TYPE_SUPER] = "Super",
+	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
+	/* Need to differentiate from [R]ename... serializing writes is the
+	 * important job it does, anyway. */
+	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
+};
+
+static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
+{
+	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+	return ocfs2_lock_type_strings[type];
+}
+
+static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				  u64 blkno,
+				  u32 generation,
+				  char *name)
+{
+	int len;
+
+	mlog_entry_void();
+
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
+
+	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
+		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
+		       generation);
+
+	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+	mlog(0, "built lock resource with name: %s\n", name);
+
+	mlog_exit_void();
+}
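
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the fixed-width lock name assembled above -- one lock-type character,
 * a pad, a 16-digit hex block number and an 8-digit hex generation.
 * The 32-byte length, "000000" pad and 'M' meta type character mirror
 * what the ocfs2 headers define elsewhere in this merge; treat them as
 * assumptions here.
 */
#include <inttypes.h>
#include <stdio.h>

#define LOCK_ID_MAX_LEN 32		/* assumed OCFS2_LOCK_ID_MAX_LEN */
#define LOCK_ID_PAD     "000000"	/* assumed OCFS2_LOCK_ID_PAD */

int main(void)
{
	char name[LOCK_ID_MAX_LEN];
	uint64_t blkno = 0x1234;
	uint32_t generation = 7;

	int len = snprintf(name, LOCK_ID_MAX_LEN, "%c%s%016" PRIx64 "%08x",
			   'M', LOCK_ID_PAD, blkno, (unsigned)generation);

	/* Mirrors the BUG_ON above: every name is exactly 31 characters. */
	printf("%s (len %d)\n", name, len);
	return 0;
}
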
+
+static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+
+static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
+				       struct ocfs2_dlm_debug *dlm_debug)
+{
+	mlog(0, "Add tracking for lockres %s\n", res->l_name);
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
+{
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	if (!list_empty(&res->l_debug_list))
+		list_del_init(&res->l_debug_list);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       u64 blkno,
+				       u32 generation,
+				       struct ocfs2_lock_res_ops *ops,
+				       void *priv)
+{
+	ocfs2_build_lock_name(type, blkno, generation, res->l_name);
+
+	res->l_type          = type;
+	res->l_ops           = ops;
+	res->l_priv          = priv;
+
+	res->l_level         = LKM_IVMODE;
+	res->l_requested     = LKM_IVMODE;
+	res->l_blocking      = LKM_IVMODE;
+	res->l_action        = OCFS2_AST_INVALID;
+	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	res->l_flags         = OCFS2_LOCK_INITIALIZED;
+
+	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+}
+
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
+{
+	/* This also clears out the lock status block */
+	memset(res, 0, sizeof(struct ocfs2_lock_res));
+	spin_lock_init(&res->l_lock);
+	init_waitqueue_head(&res->l_event);
+	INIT_LIST_HEAD(&res->l_blocked_list);
+	INIT_LIST_HEAD(&res->l_mask_waiters);
+}
+
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode)
+{
+	struct ocfs2_lock_res_ops *ops;
+
+	switch(type) {
+		case OCFS2_LOCK_TYPE_RW:
+			ops = &ocfs2_inode_rw_lops;
+			break;
+		case OCFS2_LOCK_TYPE_META:
+			ops = &ocfs2_inode_meta_lops;
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			ops = &ocfs2_inode_data_lops;
+			break;
+		default:
+			mlog_bug_on_msg(1, "type: %d\n", type);
+			ops = NULL; /* thanks, gcc */
+			break;
+	}
+
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
+				   OCFS2_I(inode)->ip_blkno,
+				   inode->i_generation, ops, inode);
+}
+
+static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
+				      struct ocfs2_super *osb)
+{
+	/* Superblock lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
+				   OCFS2_SUPER_BLOCK_BLKNO, 0,
+				   &ocfs2_super_lops, osb);
+}
+
+static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
+				       struct ocfs2_super *osb)
+{
+	/* Rename lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
+				   &ocfs2_rename_lops, osb);
+}
+
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
+	mlog_entry_void();
+
+	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
+		return;
+
+	ocfs2_remove_lockres_tracking(res);
+
+	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
+			"Lockres %s is on the blocked list\n",
+			res->l_name);
+	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
+			"Lockres %s has mask waiters pending\n",
+			res->l_name);
+	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
+			"Lockres %s is locked\n",
+			res->l_name);
+	mlog_bug_on_msg(res->l_ro_holders,
+			"Lockres %s has %u ro holders\n",
+			res->l_name, res->l_ro_holders);
+	mlog_bug_on_msg(res->l_ex_holders,
+			"Lockres %s has %u ex holders\n",
+			res->l_name, res->l_ex_holders);
+
+	/* Need to clear out the lock status block for the dlm */
+	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
+
+	res->l_flags = 0UL;
+	mlog_exit_void();
+}
+
+static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+	mlog_exit_void();
+}
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
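
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the downconvert lattice ocfs2_highest_compat_lock_level() encodes.
 * A holder may keep going on a blocked lock iff its wanted level is
 * compatible with the level we will drop to for the blocked node --
 * the "wanted <= highest_compat(blocking)" test used by both the
 * userdlm and dlmglue paths.  Mode values are the conventional LKM_*
 * numbers, assumed here.
 */
#include <stdio.h>

enum { NLMODE = 0, PRMODE = 3, EXMODE = 5 };	/* assumed LKM_* values */

static int highest_compat(int level)
{
	if (level == EXMODE)
		return NLMODE;	/* another node wants EX: we must drop to NL */
	if (level == PRMODE)
		return PRMODE;	/* another node wants PR: we may keep PR */
	return EXMODE;
}

int main(void)
{
	/* want PR while blocked on a PR request: compatible */
	printf("%d\n", PRMODE <= highest_compat(PRMODE));	/* 1 */
	/* want EX while blocked on a PR request: must wait */
	printf("%d\n", EXMODE <= highest_compat(PRMODE));	/* 0 */
	/* anything above NL while blocked on an EX request: must wait */
	printf("%d\n", PRMODE <= highest_compat(EXMODE));	/* 0 */
	return 0;
}
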
+
+static void lockres_set_flags(struct ocfs2_lock_res *lockres,
+			      unsigned long newflags)
+{
+	struct list_head *pos, *tmp;
+	struct ocfs2_mask_waiter *mw;
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres->l_flags = newflags;
+
+	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
+		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			continue;
+
+		list_del_init(&mw->mw_item);
+		mw->mw_status = 0;
+		complete(&mw->mw_complete);
+	}
+}
+static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
+{
+	lockres_set_flags(lockres, lockres->l_flags | or);
+}
+static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
+				unsigned long clear)
+{
+	lockres_set_flags(lockres, lockres->l_flags & ~clear);
+}
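
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the mask/goal test lockres_set_flags() applies to each queued
 * ocfs2_mask_waiter.  A waiter completes once (flags & mask) == goal;
 * "wake me when BUSY clears" is mask = BUSY, goal = 0.  Flag values
 * are stand-ins.
 */
#include <stdio.h>

#define F_BUSY    0x02UL	/* stand-in for OCFS2_LOCK_BUSY */
#define F_BLOCKED 0x04UL	/* stand-in for OCFS2_LOCK_BLOCKED */

static int mw_satisfied(unsigned long flags, unsigned long mask,
			unsigned long goal)
{
	return (flags & mask) == goal;
}

int main(void)
{
	unsigned long flags = F_BUSY | F_BLOCKED;

	printf("%d\n", mw_satisfied(flags, F_BUSY, 0));	/* 0: still busy */
	flags &= ~F_BUSY;				/* the AST completed */
	printf("%d\n", mw_satisfied(flags, F_BUSY, 0));	/* 1: waiter fires */
	return 0;
}
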
+
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	lockres->l_level = lockres->l_requested;
+	if (lockres->l_level <=
+	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
+		lockres->l_blocking = LKM_NLMODE;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+	}
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+	/* A convert from RO to EX doesn't really need anything as our
+	 * information is already up to date. A convert from NL to
+	 * *anything*, however, should mark us as needing an
+	 * update. */
+	if (lockres->l_level == LKM_NLMODE)
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+	if (lockres->l_requested > LKM_NLMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_inode_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct dlm_lockstatus *lksb;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
+	     OCFS2_I(inode)->ip_blkno, lockres->l_action,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
+		     "on inode %"MLFu64"\n", lksb->status,
+		     OCFS2_I(inode)->ip_blkno);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		mlog_exit_void();
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
+		     "lockres flags = 0x%lx, unlock action: %u\n",
+		     lockres->l_name, lockres->l_action, lockres->l_flags,
+		     lockres->l_unlock_action);
+
+		BUG();
+	}
+
+	/* data and rw locking ignores refresh flag for now. */
+	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int needs_downconvert = 0;
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+
+	if (level > lockres->l_blocking) {
+		/* only schedule a downconvert if we haven't already scheduled
+		 * one that goes low enough to satisfy the level we're
+		 * blocking.  this also catches the case where we get
+		 * duplicate BASTs */
+		if (ocfs2_highest_compat_lock_level(level) <
+		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
+			needs_downconvert = 1;
+
+		lockres->l_blocking = level;
+	}
+
+	mlog_exit(needs_downconvert);
+	return needs_downconvert;
+}
+
+static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
+				    struct ocfs2_lock_res *lockres,
+				    int level)
+{
+	int needs_downconvert;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	BUG_ON(level <= LKM_NLMODE);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
+	if (needs_downconvert)
+		ocfs2_schedule_blocked_lock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ocfs2_kick_vote_thread(osb);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+static void ocfs2_inode_bast_func(void *opaque, int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
+	     "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
+	     lockres->l_level,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
+				   int ignore_refresh)
+{
+	struct dlm_lockstatus *lksb = &lockres->l_lksb;
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
+		     lockres->l_name, lksb->status);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		BUG();
+	}
+
+	if (ignore_refresh)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+}
+
+static void ocfs2_super_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Superblock AST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+	ocfs2_generic_ast_func(lockres, 0);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_super_bast_func(void *opaque,
+				  int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+	mlog(0, "Superblock BAST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename AST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	ocfs2_generic_ast_func(lockres, 1);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename BAST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert)
+{
+	unsigned long flags;
+
+	mlog_entry_void();
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	if (convert)
+		lockres->l_action = OCFS2_AST_INVALID;
+	else
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+/* Note: If we detect another process working on the lock (i.e.,
+ * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
+ * to do the right thing in that case.
+ */
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags)
+{
+	int ret = 0;
+	enum dlm_status status;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+	     dlm_flags);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
+	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	lockres->l_action = OCFS2_AST_ATTACH;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = dlmlock(osb->dlm,
+			 level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+	}
+
+	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
+					int flag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ret = lockres->l_flags & flag;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+}
+
+static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
+}
+
+static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted)
+{
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
+}
+
+static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
+{
+	INIT_LIST_HEAD(&mw->mw_item);
+	init_completion(&mw->mw_complete);
+}
+
+static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
+{
+	wait_for_completion(&mw->mw_complete);
+	/* Re-arm the completion in case we want to wait on it again */
+	INIT_COMPLETION(mw->mw_complete);
+	return mw->mw_status;
+}
+
+static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
+				    struct ocfs2_mask_waiter *mw,
+				    unsigned long mask,
+				    unsigned long goal)
+{
+	BUG_ON(!list_empty(&mw->mw_item));
+
+	assert_spin_locked(&lockres->l_lock);
+
+	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
+	mw->mw_mask = mask;
+	mw->mw_goal = goal;
+}
+
+/* returns 0 if the mw that was removed was already satisfied, -EBUSY
+ * if the mask still hadn't reached its goal */
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!list_empty(&mw->mw_item)) {
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			ret = -EBUSY;
+
+		list_del_init(&mw->mw_item);
+		init_completion(&mw->mw_complete);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+}
+
+static int ocfs2_cluster_lock(struct ocfs2_super *osb,
+			      struct ocfs2_lock_res *lockres,
+			      int level,
+			      int lkm_flags,
+			      int arg_flags)
+{
+	struct ocfs2_mask_waiter mw;
+	enum dlm_status status;
+	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
+	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	ocfs2_init_mask_waiter(&mw);
+
+again:
+	wait = 0;
+
+	if (catch_signals && signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
+			"Cluster lock called on freeing lockres %s! flags "
+			"0x%lx\n", lockres->l_name, lockres->l_flags);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+	    level > lockres->l_level) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		/* lock has not been created yet. */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		goto again;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+		/* the lock is currently blocked on behalf of
+		 * another node */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (level > lockres->l_level) {
+		if (lockres->l_action != OCFS2_AST_INVALID)
+			mlog(ML_ERROR, "lockres %s has action %u pending\n",
+			     lockres->l_name, lockres->l_action);
+
+		lockres->l_action = OCFS2_AST_CONVERT;
+		lockres->l_requested = level;
+		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, convert from %d to level = %d\n",
+		     lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(osb->dlm,
+				 level,
+				 &lockres->l_lksb,
+				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
+				 lockres->l_name,
+				 lockres->l_ops->ast,
+				 lockres,
+				 lockres->l_ops->bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				ret = -EAGAIN;
+			else {
+				ocfs2_log_dlm_error("dlmlock", status,
+						    lockres);
+				ret = -EINVAL;
+			}
+			ocfs2_recover_from_dlm_error(lockres, 1);
+			goto out;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+		     lockres->l_name);
+
+		/* At this point we've gone inside the dlm and need to
+		 * complete our work regardless. */
+		catch_signals = 0;
+
+		/* wait for busy to clear and carry on */
+		goto again;
+	}
+
+	/* Ok, if we get here then we're good to go. */
+	ocfs2_inc_holders(lockres, level);
+
+	ret = 0;
+unlock:
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+	/*
+	 * This helps work around a lock inversion between the page lock
+	 * and dlm locks.  One path holds the page lock while calling aops
+	 * which block acquiring dlm locks.  The voting thread holds dlm
+	 * locks while acquiring page locks while downconverting data locks.
+	 * This block helps an aop path notice the inversion and back
+	 * off to unlock its page lock before trying the dlm lock again.
+	 */
+	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
+	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
+		wait = 0;
+		if (lockres_remove_mask_waiter(lockres, &mw))
+			ret = -EAGAIN;
+		else
+			goto again;
+	}
+	if (wait) {
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret == 0)
+			goto again;
+		mlog_errno(ret);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level)
+{
+	unsigned long flags;
+
+	mlog_entry_void();
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ocfs2_dec_holders(lockres, level);
+	ocfs2_vote_on_unlock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	mlog_exit_void();
+}
+
+static int ocfs2_create_new_inode_lock(struct inode *inode,
+				       struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+}
+
+/* Grants us an EX lock on the data and metadata resources, skipping
+ * the normal cluster directory lookup. Use this ONLY on newly created
+ * inodes which other nodes can't possibly see, and which haven't been
+ * hashed in the inode hash yet. This can give us a good performance
+ * increase as it'll skip the network broadcast normally associated
+ * with creating a new lock resource. */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+	int ret;
+
+	BUG_ON(!inode);
+	BUG_ON(!ocfs2_inode_is_new(inode));
+
+	mlog_entry_void();
+
+	mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	/* NOTE: That we don't increment any of the holder counts, nor
+	 * do we add anything to a journal handle. Since this is
+	 * supposed to be a new inode which the cluster doesn't know
+	 * about yet, there is no need to.  As far as the LVB handling
+	 * is concerned, this is basically like acquiring an EX lock
+	 * on a resource which has an invalid one -- we'll set it
+	 * valid when we release the EX. */
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_rw_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_meta_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_data_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_rw_lock(struct inode *inode, int write)
+{
+	int status, level;
+	struct ocfs2_lock_res *lockres;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
+				    0);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_rw_unlock(struct inode *inode, int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags)
+{
+	int status = 0, level;
+	struct ocfs2_lock_res *lockres;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	/* We'll allow faking a readonly data lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
+		if (write) {
+			status = -EROFS;
+			mlog_errno(status);
+		}
+		goto out;
+	}
+
+	lockres = &OCFS2_I(inode)->ip_data_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
+				    0, arg_flags);
+	if (status < 0 && status != -EAGAIN)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* see ocfs2_meta_lock_with_page() */
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_data_lock(inode, write) == 0)
+			ocfs2_data_unlock(inode, write);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres)
+{
+	int kick = 0;
+
+	mlog_entry_void();
+
+	/* If we know that another node is waiting on our lock, kick
+	 * the vote thread pre-emptively when we reach a release
+	 * condition. */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+		switch(lockres->l_blocking) {
+		case LKM_EXMODE:
+			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+				kick = 1;
+			break;
+		case LKM_PRMODE:
+			if (!lockres->l_ex_holders)
+				kick = 1;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	if (kick)
+		ocfs2_kick_vote_thread(osb);
+
+	mlog_exit_void();
+}
+
+void ocfs2_data_unlock(struct inode *inode,
+		       int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+#define OCFS2_SEC_BITS   34
+#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
+#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
+
+/* LVB only has room for 64 bits of time here so we pack it for
+ * now. */
+static u64 ocfs2_pack_timespec(struct timespec *spec)
+{
+	u64 res;
+	u64 sec = spec->tv_sec;
+	u32 nsec = spec->tv_nsec;
+
+	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
+
+	return res;
+}
+
+/* Call this with the lockres locked. I am reasonably sure we don't
+ * need ip_lock in this function as anyone who would be changing those
+ * values is supposed to be blocked in ocfs2_meta_lock right now. */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
+	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
+	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
+	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
+	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
+	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
+	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
+	lvb->lvb_iatime_packed  =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
+	lvb->lvb_ictime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
+	lvb->lvb_imtime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+
+	mlog_meta_lvb(0, lockres);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unpack_timespec(struct timespec *spec,
+				  u64 packed_time)
+{
+	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
+	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
+}
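
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * round-tripping the LVB timestamp packing above -- 34 bits of seconds
 * shifted over 30 bits of nanoseconds in one u64.  Standalone.
 */
#include <stdint.h>
#include <stdio.h>

#define SEC_SHIFT  (64 - 34)			/* 30 */
#define NSEC_MASK  ((1ULL << SEC_SHIFT) - 1)

int main(void)
{
	uint64_t sec = 1134000000, nsec = 999999999;	/* sample mtime */

	uint64_t packed = (sec << SEC_SHIFT) | (nsec & NSEC_MASK);

	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)(packed >> SEC_SHIFT),
	       (unsigned long long)(packed & NSEC_MASK));
	return 0;
}
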
+
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	mlog_meta_lvb(0, lockres);
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	/* We're safe here without the lockres lock... */
+	spin_lock(&oi->ip_lock);
+	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
+	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
+
+	/* fast-symlinks are a special case */
+	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+
+	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
+	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
+	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
+	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
+	ocfs2_unpack_timespec(&inode->i_atime,
+			      be64_to_cpu(lvb->lvb_iatime_packed));
+	ocfs2_unpack_timespec(&inode->i_mtime,
+			      be64_to_cpu(lvb->lvb_imtime_packed));
+	ocfs2_unpack_timespec(&inode->i_ctime,
+			      be64_to_cpu(lvb->lvb_ictime_packed));
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit_void();
+}
+
+static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
+		return 1;
+	return 0;
+}
+
+/* Determine whether a lock resource needs to be refreshed, and
+ * arbitrate who gets to refresh it.
+ *
+ *   0 means no refresh needed.
+ *
+ *   > 0 means you need to refresh this and you MUST call
+ *   ocfs2_complete_lock_res_refresh afterwards. */
+static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
+{
+	unsigned long flags;
+	int status = 0;
+
+	mlog_entry_void();
+
+refresh_check:
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ocfs2_wait_on_refreshing_lock(lockres);
+		goto refresh_check;
+	}
+
+	/* Ok, I'll be the one to refresh this lock. */
+	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = 1;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* If status is non-zero, I'll mark it as not being in refresh
+ * anymore, but I won't clear the needs refresh flag. */
+static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
+						   int status)
+{
+	unsigned long flags;
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
+	if (!status)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
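
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the refresh arbitration implemented by the two functions above --
 * the first caller to see NEEDS_REFRESH claims REFRESHING and does the
 * work; everyone else waits for completion.  Standalone pthread
 * version with stand-in flags; build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

#define NEEDS_REFRESH 0x1U
#define REFRESHING    0x2U

static unsigned int flags = NEEDS_REFRESH;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;

/* Returns 1 if the caller must refresh and then call complete_refresh(). */
static int should_refresh(void)
{
	int mine = 0;

	pthread_mutex_lock(&lock);
	while (flags & REFRESHING)	/* someone else is refreshing */
		pthread_cond_wait(&done, &lock);
	if (flags & NEEDS_REFRESH) {
		flags |= REFRESHING;	/* we claim the refresh */
		mine = 1;
	}
	pthread_mutex_unlock(&lock);
	return mine;
}

static void complete_refresh(int status)
{
	pthread_mutex_lock(&lock);
	flags &= ~REFRESHING;
	if (!status)			/* success clears the need */
		flags &= ~NEEDS_REFRESH;
	pthread_cond_broadcast(&done);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	if (should_refresh()) {
		puts("refreshing");	/* e.g. re-read the inode from disk */
		complete_refresh(0);
	}
	printf("still needs refresh? %d\n", !!(flags & NEEDS_REFRESH)); /* 0 */
	return 0;
}
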
+
+/* may or may not return a bh if it went to disk. */
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
+		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
+		     "were waiting on a lock. ip_flags = 0x%x\n",
+		     oi->ip_blkno, oi->ip_flags);
+		spin_unlock(&oi->ip_lock);
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	lockres = &oi->ip_meta_lockres;
+
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+
+	/* This will discard any caching information we might have had
+	 * for the inode metadata. */
+	ocfs2_metadata_cache_purge(inode);
+
+	/* will do nothing for inode types that don't use the extent
+	 * map (directories, bitmap files, etc) */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	if (ocfs2_meta_lvb_is_trustable(lockres)) {
+		mlog(0, "Trusting LVB on inode %"MLFu64"\n",
+		     oi->ip_blkno);
+		ocfs2_refresh_inode_from_lvb(inode);
+	} else {
+		/* Boo, we have to go to disk. */
+		/* read bh, cast, ocfs2_refresh_inode */
+		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
+					  bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_refresh;
+		}
+		fe = (struct ocfs2_dinode *) (*bh)->b_data;
+
+		/* This is a good chance to make sure we're not
+		 * locking an invalid object.
+		 *
+		 * We bug on a stale inode here because we checked
+		 * above whether it was wiped from disk. The wiping
+		 * node provides a guarantee that we receive that
+		 * message and can mark the inode before dropping any
+		 * locks associated with it. */
+		if (!OCFS2_IS_VALID_DINODE(fe)) {
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+			status = -EIO;
+			goto bail_refresh;
+		}
+		mlog_bug_on_msg(inode->i_generation !=
+				le32_to_cpu(fe->i_generation),
+				"Invalid dinode %"MLFu64" disk generation: %u "
+				"inode->i_generation: %u\n",
+				oi->ip_blkno, le32_to_cpu(fe->i_generation),
+				inode->i_generation);
+		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
+				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
+				"Stale dinode %"MLFu64" dtime: %"MLFu64" "
+				"flags: 0x%x\n", oi->ip_blkno,
+				le64_to_cpu(fe->i_dtime),
+				le32_to_cpu(fe->i_flags));
+
+		ocfs2_refresh_inode(inode, fe);
+	}
+
+	status = 0;
+bail_refresh:
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_assign_bh(struct inode *inode,
+			   struct buffer_head **ret_bh,
+			   struct buffer_head *passed_bh)
+{
+	int status;
+
+	if (passed_bh) {
+		/* Ok, the update went to disk for us, use the
+		 * returned bh. */
+		*ret_bh = passed_bh;
+		get_bh(*ret_bh);
+
+		return 0;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  ret_bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * returns < 0 error if the callback will never be called, otherwise
+ * the result of the lock will be communicated via the callback.
+ */
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags)
+{
+	int status, level, dlm_flags, acquired;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *local_bh = NULL;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64", take %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	status = 0;
+	acquired = 0;
+	/* We'll allow faking a readonly metadata lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	dlm_flags = 0;
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= LKM_NOQUEUE;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+	if (status < 0) {
+		if (status != -EAGAIN && status != -EIOCBRETRY)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* Notify the error cleanup path to drop the cluster lock. */
+	acquired = 1;
+
+	/* We wait twice because a node may have died while we were in
+	 * the lower dlm layers. The second time though, we've
+	 * committed to owning this lock so we don't allow signals to
+	 * abort the operation. */
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	/* This is fun. The caller may want a bh back, or it may
+	 * not. ocfs2_meta_lock_update definitely wants one in, but
+	 * may or may not read one, depending on what's in the
+	 * LVB. The result of all of this is that we've *only* gone to
+	 * disk if we have to, so the complexity is worthwhile. */
+	status = ocfs2_meta_lock_update(inode, &local_bh);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	if (ret_bh) {
+		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (handle) {
+		status = ocfs2_handle_add_lock(handle, inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+bail:
+	if (status < 0) {
+		if (ret_bh && (*ret_bh)) {
+			brelse(*ret_bh);
+			*ret_bh = NULL;
+		}
+		if (acquired)
+			ocfs2_meta_unlock(inode, ex);
+	}
+
+	if (local_bh)
+		brelse(local_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * This is working around a lock inversion between tasks acquiring DLM locks
+ * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * while acquiring page locks.
+ *
+ * ** These _with_page variants are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * waiting for the vote thread.  In that case we unlock our page so the vote
+ * thread can make progress.  Once we've done this we have to return
+ * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * into the VFS who will then immediately retry the aop call.
+ *
+ * We do a blocking lock and immediate unlock before returning, though, so that
+ * the lock has a great chance of being cached on this node by the time the VFS
+ * calls back to retry the aop.  This has the potential to livelock as nodes
+ * ping locks back and forth, but that's a risk we're willing to take for a
+ * simple way around the lock inversion.
+ */
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
+				   OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
+			ocfs2_meta_unlock(inode, ex);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex)
+{
+	int status;
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+	struct buffer_head *bh;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (status) {
+		bh = si->si_bh;
+		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
+					  si->si_inode);
+		if (status == 0)
+			ocfs2_update_slot_info(si);
+
+		ocfs2_complete_lock_res_refresh(lockres, status);
+
+		if (status < 0)
+			mlog_errno(status);
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+int ocfs2_rename_lock(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rename_unlock(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+}
+
+/* Reference counting of the dlm debug structure. We want this because
+ * open references on the debug inodes can live on after a mount, so
+ * we can't rely on the ocfs2_super to always exist. */
+static void ocfs2_dlm_debug_free(struct kref *kref)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
+
+	kfree(dlm_debug);
+}
+
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
+{
+	if (dlm_debug)
+		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
+}
+
+static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
+{
+	kref_get(&debug->d_refcnt);
+}
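+
+/*
+ * Reference pairing, as used in this file: ocfs2_new_dlm_debug() starts
+ * the count at one via kref_init(); ocfs2_dlm_init_debug() /
+ * ocfs2_dlm_shutdown_debug() and ocfs2_dlm_debug_open() /
+ * ocfs2_dlm_debug_release() each take and drop one more.  The structure
+ * is only kfree()'d once the last holder -- possibly a debugfs file still
+ * open after unmount -- lets go.
+ */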
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
+	if (!dlm_debug) {
+		mlog_errno(-ENOMEM);
+		goto out;
+	}
+
+	kref_init(&dlm_debug->d_refcnt);
+	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
+	dlm_debug->d_locking_state = NULL;
+out:
+	return dlm_debug;
+}
+
+/* Access to this is arbitrated for us via seq_file->sem. */
+struct ocfs2_dlm_seq_priv {
+	struct ocfs2_dlm_debug *p_dlm_debug;
+	struct ocfs2_lock_res p_iter_res;
+	struct ocfs2_lock_res p_tmp_res;
+};
+
+static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
+						 struct ocfs2_dlm_seq_priv *priv)
+{
+	struct ocfs2_lock_res *iter, *ret = NULL;
+	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
+
+	assert_spin_locked(&ocfs2_dlm_tracking_lock);
+
+	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
+		/* discover the head of the list */
+		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
+			mlog(0, "End of list found, %p\n", ret);
+			break;
+		}
+
+		/* We track our "dummy" iteration lockres by a NULL
+		 * l_ops field. */
+		if (iter->l_ops != NULL) {
+			ret = iter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
+	if (iter) {
+		/* Since lockres structures have the lifetime of their
+		 * container (which can be inodes, ocfs2_supers, etc) we
+		 * want to copy this out to a temporary lockres while
+		 * still under the spinlock. Obviously after this we
+		 * can't trust any pointers on the copy returned, but
+		 * that's ok as the information we want isn't typically
+		 * held in them. */
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter = v;
+	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(iter, priv);
+	list_del_init(&dummy->l_debug_list);
+	if (iter) {
+		list_add(&dummy->l_debug_list, &iter->l_debug_list);
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define OCFS2_DLM_DEBUG_STR_VERSION 1
+static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+	char *lvb;
+	struct ocfs2_lock_res *lockres = v;
+
+	if (!lockres)
+		return -EINVAL;
+
+	seq_printf(m, "0x%x\t"
+		   "%.*s\t"
+		   "%d\t"
+		   "0x%lx\t"
+		   "0x%x\t"
+		   "0x%x\t"
+		   "%u\t"
+		   "%u\t"
+		   "%d\t"
+		   "%d\t",
+		   OCFS2_DLM_DEBUG_STR_VERSION,
+		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
+		   lockres->l_level,
+		   lockres->l_flags,
+		   lockres->l_action,
+		   lockres->l_unlock_action,
+		   lockres->l_ro_holders,
+		   lockres->l_ex_holders,
+		   lockres->l_requested,
+		   lockres->l_blocking);
+
+	/* Dump the raw LVB */
+	lvb = lockres->l_lksb.lvb;
+	for (i = 0; i < DLM_LVB_LEN; i++)
+		seq_printf(m, "0x%x\t", lvb[i]);
+
+	/* End the line */
+	seq_printf(m, "\n");
+	return 0;
+}
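+
+/*
+ * For reference, each line emitted above is a tab-separated run of fields
+ * in this order (format version 1):
+ *
+ *	version, name, level, flags, action, unlock_action, ro_holders,
+ *	ex_holders, requested, blocking, then the raw LVB bytes.
+ */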
+
+static struct seq_operations ocfs2_dlm_seq_ops = {
+	.start =	ocfs2_dlm_seq_start,
+	.stop =		ocfs2_dlm_seq_stop,
+	.next =		ocfs2_dlm_seq_next,
+	.show =		ocfs2_dlm_seq_show,
+};
+
+static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *) file->private_data;
+	struct ocfs2_dlm_seq_priv *priv = seq->private;
+	struct ocfs2_lock_res *res = &priv->p_iter_res;
+
+	ocfs2_remove_lockres_tracking(res);
+	ocfs2_put_dlm_debug(priv->p_dlm_debug);
+	return seq_release_private(inode, file);
+}
+
+static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct ocfs2_dlm_seq_priv *priv;
+	struct seq_file *seq;
+	struct ocfs2_super *osb;
+
+	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	osb = (struct ocfs2_super *) inode->u.generic_ip;
+	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
+	priv->p_dlm_debug = osb->osb_dlm_debug;
+	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
+
+	ret = seq_open(file, &ocfs2_dlm_seq_ops);
+	if (ret) {
+		kfree(priv);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	seq = (struct seq_file *) file->private_data;
+	seq->private = priv;
+
+	ocfs2_add_lockres_tracking(&priv->p_iter_res,
+				   priv->p_dlm_debug);
+
+out:
+	return ret;
+}
+
+static struct file_operations ocfs2_dlm_debug_fops = {
+	.open =		ocfs2_dlm_debug_open,
+	.release =	ocfs2_dlm_debug_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+
+static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
+							 S_IFREG|S_IRUSR,
+							 osb->osb_debug_root,
+							 osb,
+							 &ocfs2_dlm_debug_fops);
+	if (!dlm_debug->d_locking_state) {
+		ret = -EINVAL;
+		mlog(ML_ERROR,
+		     "Unable to create locking state debugfs file.\n");
+		goto out;
+	}
+
+	ocfs2_get_dlm_debug(dlm_debug);
+out:
+	return ret;
+}
+
+static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
+{
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	if (dlm_debug) {
+		debugfs_remove(dlm_debug->d_locking_state);
+		ocfs2_put_dlm_debug(dlm_debug);
+	}
+}
+
+int ocfs2_dlm_init(struct ocfs2_super *osb)
+{
+	int status;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	status = ocfs2_dlm_init_debug(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* launch vote thread */
+	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
+				     osb->osb_id);
+	if (IS_ERR(osb->vote_task)) {
+		status = PTR_ERR(osb->vote_task);
+		osb->vote_task = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
+
+	/* for now, uuid == domain */
+	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+
+	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+
+	osb->dlm = dlm;
+
+	status = 0;
+bail:
+	if (status < 0) {
+		ocfs2_dlm_shutdown_debug(osb);
+		if (osb->vote_task)
+			kthread_stop(osb->vote_task);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
+
+	ocfs2_drop_osb_locks(osb);
+
+	if (osb->vote_task) {
+		kthread_stop(osb->vote_task);
+		osb->vote_task = NULL;
+	}
+
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+
+	dlm_unregister_domain(osb->dlm);
+	osb->dlm = NULL;
+
+	ocfs2_dlm_shutdown_debug(osb);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
+	     lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/* We tried to cancel a convert request, but it was already
+	 * granted. All we want to do here is clear our unlock
+	 * state. The wake_up call done at the bottom is redundant
+	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
+	 * hurt anything anyway */
+	if (status == DLM_CANCELGRANT &&
+	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
+
+		/* We don't clear the busy flag in this case as it
+		 * should have been cleared by the ast which the dlm
+		 * has called. */
+		goto complete_unlock;
+	}
+
+	if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
+		     "unlock_action %d\n", status, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch (lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+complete_unlock:
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
+
+struct drop_lock_cb {
+	ocfs2_pre_drop_cb_t	*drop_func;
+	void			*drop_data;
+};
+
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres,
+			   struct drop_lock_cb *dcb)
+{
+	enum dlm_status status;
+	unsigned long flags;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (dcb)
+		dcb->drop_func(lockres, dcb->drop_data);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
+}
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the vote thread before we can consider
+ * it safe to drop. 
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore. */
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
+
+		status = ocfs2_wait_for_mask(&mw);
+		if (status)
+			mlog_errno(status);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
+{
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
+{
+	struct inode *inode = data;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level == LKM_EXMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+		__ocfs2_stuff_meta_lvb(inode);
+}
+
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
+
+	mlog_entry_void();
+
+	/* No need to call ocfs2_mark_lockres_freeing here -
+	 * ocfs2_clear_inode has done it for us. */
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+
+	status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_meta_lockres,
+			      &meta_dcb);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	if (lockres->l_level <= new_level) {
+		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+		     lockres->l_level, new_level);
+		BUG();
+	}
+
+	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
+	     lockres->l_name, new_level, lockres->l_blocking);
+
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb)
+{
+	int ret, dlm_flags = LKM_CONVERT;
+	enum dlm_status status;
+
+	mlog_entry_void();
+
+	if (lvb)
+		dlm_flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* returns 1 when the caller should unlock and call dlmunlock */
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+
+		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
+	/* were we in a convert when the bast fired? */
+	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
+	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
+			"lock %s, invalid flags: 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	return 1;
+}
+
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	enum dlm_status status;
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	ret = 0;
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int ret;
+
+	mlog_entry_void();
+
+	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		ret = 0;
+		mlog(0, "lockres %s currently being refreshed -- backing "
+		     "off!\n", lockres->l_name);
+	} else if (new_level == LKM_PRMODE)
+		ret = !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+	else /* Must be NLMODE we're converting to. */
+		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+
+	mlog_exit(ret);
+	return ret;
+}
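+
+/*
+ * Summarizing the checks above: a downconvert to LKM_PRMODE needs no EX
+ * holders; a downconvert to LKM_NLMODE needs no holders at all.  Both
+ * also require the inode to be fully checkpointed, and neither may
+ * proceed while the lockres is being refreshed.
+ */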
+
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	int ret = 0;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	unsigned long flags;
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
+	     lockres->l_blocking);
+
+	BUG_ON(lockres->l_level != LKM_EXMODE &&
+	       lockres->l_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
+	     lockres->l_level, lockres->l_blocking, new_level);
+
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		if (lockres->l_level == LKM_EXMODE)
+			set_lvb = 1;
+
+		/* If the lock hasn't been refreshed yet (rare), then
+		 * our in-memory inode values are old and we skip
+		 * stuffing the lvb. There's no need to actually clear
+		 * out the lvb here as its value is still valid. */
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			if (set_lvb)
+				__ocfs2_stuff_meta_lvb(inode);
+		} else
+			mlog(0, "lockres %s: downconverting stale lock!\n",
+			     lockres->l_name);
+
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+		goto leave;
+	}
+	if (!ocfs2_inode_fully_checkpointed(inode))
+		ocfs2_start_checkpoint(osb);
+
+	*requeue = 1;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = 0;
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	worker(lockres, blocking);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	mapping = inode->i_mapping;
+
+	if (filemap_fdatawrite(mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!",
+		     OCFS2_I(inode)->ip_blkno);
+	}
+	sync_mapping_buffers(mapping);
+	if (blocking == LKM_EXMODE) {
+		truncate_inode_pages(mapping, 0);
+		unmap_mapping_range(mapping, 0, 0, 0);
+	} else {
+		/* We only need to wait on the I/O if we're not also
+		 * truncating pages because truncate_inode_pages waits
+		 * for us above. We don't truncate pages if we're
+		 * blocking anything < EXMODE because we want to keep
+		 * them around in that case. */
+		filemap_fdatawait(mapping);
+	}
+
+	mlog_exit_void();
+}
+
+int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    ocfs2_data_convert_worker);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+				    int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	inode  = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_do_unblock_meta(inode, requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Generic unblock function for any lockres whose private data is an
+ * ocfs2_super pointer. */
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue)
+{
+	int status;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	osb = ocfs2_lock_res_super(lockres);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+	unsigned long flags;
+
+	/* Our reference to the lockres in this function can be
+	 * considered valid until we remove the OCFS2_LOCK_QUEUED
+	 * flag. */
+
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+	BUG_ON(!lockres->l_ops);
+	BUG_ON(!lockres->l_ops->unblock);
+
+	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+
+	/* Detect whether a lock has been marked as going away while
+	 * the vote thread was processing other things. A lock can
+	 * still be marked with OCFS2_LOCK_FREEING after this check,
+	 * but short circuiting here will still save us some
+	 * performance. */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_FREEING)
+		goto unqueue;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+unqueue:
+	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+	} else
+		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	     requeue ? "yes" : "no");
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
+		/* Do not schedule a lock for downconvert when it's on
+		 * the way to destruction - any nodes wanting access
+		 * to the resource will get it soon. */
+		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		     lockres->l_name, lockres->l_flags);
+		return;
+	}
+
+	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	mlog_exit_void();
+}
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u\n",
+	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
+	mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
+	     be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
+	     be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
+	     "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
+	     be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
+	     be64_to_cpu(lvb->lvb_ictime_packed),
+	     be64_to_cpu(lvb->lvb_imtime_packed));
+}

+ 111 - 0
fs/ocfs2/dlmglue.h

@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * Cluster lock glue: lock API prototypes and meta LVB definition
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+#define OCFS2_LVB_VERSION 2
+
+struct ocfs2_meta_lvb {
+	__be32       lvb_version;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
+};
+
+/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the vote thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
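+
+/*
+ * A hypothetical caller wanting a trylock that must not queue behind
+ * other nodes would pass a flag explicitly (illustrative sketch only):
+ *
+ *	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1,
+ *				      OCFS2_META_LOCK_NOQUEUE);
+ */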
+
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+
+/* for the vote thread */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+
+/* aids in debugging and tracking lvbs */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres);
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+#endif	/* DLMGLUE_H */

+ 45 - 0
fs/ocfs2/endian.h

@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ENDIAN_H
+#define OCFS2_ENDIAN_H
+
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+	*var = cpu_to_le16(le16_to_cpu(*var) + val);
+}
+
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) + val);
+}
+
+static inline void le32_and_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) & val);
+}
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
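+
+/*
+ * Typical use (see extent_map.c in this patch): grow an on-disk
+ * little-endian counter in place without open-coding the byte swaps at
+ * the call site:
+ *
+ *	le32_add_cpu(&old->e_clusters, new_clusters);
+ */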
+
+#endif /* OCFS2_ENDIAN_H */

+ 248 - 0
fs/ocfs2/export.c

@@ -0,0 +1,248 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result;
+
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+
+	if (handle->ih_blkno == 0) {
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		return (void *)inode;
+	}
+
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	result = d_alloc_anon(inode);
+
+	if (!result) {
+		iput(inode);
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mlog_exit_ptr(result);
+	return result;
+}
+
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+bail:
+	mlog_exit_ptr(parent);
+
+	return parent;
+}
+
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+
+		spin_unlock(&dentry->d_lock);
+
+		len = 6;
+		type = 2;
+
+		mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
+		     blkno, generation);
+	}
+
+	*max_len = len;
+
+bail:
+	mlog_exit(type);
+	return type;
+}
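+
+/*
+ * Resulting handle layout: fh[0] and fh[1] carry the high and low 32 bits
+ * of the inode's blkno and fh[2] its generation (type 1).  A connectable
+ * handle for a non-directory appends the same triple for the parent in
+ * fh[3..5] (type 2).
+ */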
+
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+				      int fh_len, int fileid_type,
+				      int (*acceptable)(void *context,
+						        struct dentry *de),
+				      void *context)
+{
+	struct ocfs2_inode_handle handle, parent;
+	struct dentry *ret = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
+		   sb, fh, fh_len, fileid_type, acceptable, context);
+
+	if (fh_len < 3 || fileid_type > 2)
+		goto bail;
+
+	if (fileid_type == 2) {
+		if (fh_len < 6)
+			goto bail;
+
+		parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
+		parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
+		parent.ih_generation = le32_to_cpu(fh[5]);
+
+		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
+		     parent.ih_blkno, parent.ih_generation);
+	}
+
+	handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
+	handle.ih_generation = le32_to_cpu(fh[2]);
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     handle.ih_blkno, handle.ih_generation);
+
+	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
+						    acceptable, context);
+
+bail:
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+struct export_operations ocfs2_export_ops = {
+	.decode_fh	= ocfs2_decode_fh,
+	.encode_fh	= ocfs2_encode_fh,
+
+	.get_parent	= ocfs2_get_parent,
+	.get_dentry	= ocfs2_get_dentry,
+};

+ 31 - 0
fs/ocfs2/export.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_EXPORT_H
+#define OCFS2_EXPORT_H
+
+extern struct export_operations ocfs2_export_ops;
+
+#endif /* OCFS2_EXPORT_H */

+ 994 - 0
fs/ocfs2/extent_map.c

@@ -0,0 +1,994 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.c
+ *
+ * In-memory extent map for OCFS2.  Man, this code was prettier in
+ * the library.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_EXTENT_MAP
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "extent_map.h"
+#include "inode.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+
+/*
+ * SUCK SUCK SUCK
+ * Our headers are so bad that struct ocfs2_extent_map is in ocfs2.h
+ */
+
+struct ocfs2_extent_map_entry {
+	struct rb_node e_node;
+	int e_tree_depth;
+	struct ocfs2_extent_rec e_rec;
+};
+
+struct ocfs2_em_insert_context {
+	int need_left;
+	int need_right;
+	struct ocfs2_extent_map_entry *new_ent;
+	struct ocfs2_extent_map_entry *old_ent;
+	struct ocfs2_extent_map_entry *left_ent;
+	struct ocfs2_extent_map_entry *right_ent;
+};
+
+static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
+
+
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent);
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth);
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent);
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el);
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos, u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent);
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt);
+
+/* Returns 1 only if the rec contains all the given clusters -- that is,
+ * rec's cpos is <= the given cpos and the rec's endpoint (cpos +
+ * clusters) is >= the argument's endpoint. */
+static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
+					      u32 cpos, u32 clusters)
+{
+	if (le32_to_cpu(rec->e_cpos) > cpos)
+		return 0;
+	if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
+			      le32_to_cpu(rec->e_clusters))
+		return 0;
+	return 1;
+}
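+
+/*
+ * For example, a rec with e_cpos = 10 and e_clusters = 20 covers the
+ * half-open range [10, 30): a query of cpos = 12, clusters = 8 (range
+ * [12, 20)) returns 1, while cpos = 25, clusters = 10 (range [25, 35))
+ * returns 0 because it runs past the rec's endpoint.
+ */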
+
+
+/*
+ * Find an entry in the tree that intersects the region passed in.
+ * Note that this will find straddled intervals, it is up to the
+ * callers to enforce any boundary conditions.
+ *
+ * Callers must hold ip_lock.  This lookup is not guaranteed to return
+ * a tree_depth 0 match, and as such can race inserts if the lock
+ * were not held.
+ *
+ * The rb_node garbage lets insertion share the search.  Trivial
+ * callers pass NULL.
+ */
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent)
+{
+	struct rb_node **p = &em->em_extents.rb_node;
+	struct rb_node *parent = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	while (*p) {
+		parent = *p;
+		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
+			       e_node);
+		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
+			p = &(*p)->rb_left;
+			ent = NULL;
+		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
+				    le32_to_cpu(ent->e_rec.e_clusters))) {
+			p = &(*p)->rb_right;
+			ent = NULL;
+		} else
+			break;
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+	return ent;
+}
+
+/*
+ * Find the leaf containing the interval we want.  While we're on our
+ * way down the tree, fill in every record we see at any depth, because
+ * we might want it later.
+ *
+ * Note that this code is run without ip_lock.  That's because it
+ * sleeps while reading.  If someone is also filling the extent list at
+ * the same time we are, we might have to restart.
+ */
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el)
+{
+	int i, ret;
+	struct buffer_head *eb_bh = NULL;
+	u64 blkno;
+	u32 rec_end;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * The bh data containing the el cannot change here, because
+	 * we hold alloc_sem.  So we can do this without other
+	 * locks.
+	 */
+	while (el->l_tree_depth)
+	{
+		blkno = 0;
+		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+			rec = &el->l_recs[i];
+			rec_end = (le32_to_cpu(rec->e_cpos) +
+				   le32_to_cpu(rec->e_clusters));
+
+			ret = -EBADR;
+			if (rec_end > OCFS2_I(inode)->ip_clusters) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			if (rec_end <= cpos) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+
+			/*
+			 * We've found a record that matches our
+			 * interval.  We don't insert it because we're
+			 * about to traverse it.
+			 */
+
+			/* Check to see if we're straddling */
+			ret = -ESRCH;
+			if (!ocfs2_extent_rec_contains_clusters(rec,
+							        cpos,
+								clusters)) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			/*
+			 * If we've already found a record, the el has
+			 * two records covering the same interval.
+			 * EEEK!
+			 */
+			ret = -EBADR;
+			if (blkno) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			blkno = le64_to_cpu(rec->e_blkno);
+		}
+
+		/*
+		 * We don't support holes, and we're still up
+		 * in the branches, so we'd better have found a child
+		 * block to descend into.
+		 */
+		ret = -EBADR;
+		if (!blkno) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+
+		if (eb_bh) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+		}
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       blkno, &eb_bh, OCFS2_BH_CACHED,
+				       inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			ret = -EIO;
+			goto out_free;
+		}
+		el = &eb->h_list;
+	}
+
+	if (el->l_tree_depth)
+		BUG();
+
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_extent_map_insert(inode, rec,
+					      le16_to_cpu(el->l_tree_depth));
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+	}
+
+	ret = 0;
+
+out_free:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	return ret;
+}
+
+/*
+ * This lookup actually will read from disk.  It has one invariant:
+ * It will never re-traverse blocks.  This means that all inserts should
+ * be new regions or more granular regions (both allowed by insert).
+ */
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos,
+					u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent)
+{
+	int ret;
+	u64 blkno;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_list *el;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (ent) {
+		if (!ent->e_tree_depth) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			*ret_ent = ent;
+			return 0;
+		}
+		blkno = le64_to_cpu(ent->e_rec.e_blkno);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			if (bh)
+				brelse(bh);
+			return ret;
+		}
+		eb = (struct ocfs2_extent_block *)bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			brelse(bh);
+			return -EIO;
+		}
+		el = &eb->h_list;
+	} else {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       OCFS2_I(inode)->ip_blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			if (bh)
+				brelse(bh);
+			return ret;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		if (!OCFS2_IS_VALID_DINODE(di)) {
+			brelse(bh);
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
+			return -EIO;
+		}
+		el = &di->id2.i_list;
+	}
+
+	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
+	brelse(bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (!ent) {
+		ret = -ESRCH;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ent->e_tree_depth)
+		BUG();  /* FIXME: Make sure this isn't a corruption */
+
+	*ret_ent = ent;
+
+	return 0;
+}
+
+/*
+ * Callers must hold ip_lock.  This can insert pieces of the tree,
+ * thus racing lookup if the lock weren't held.
+ */
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent)
+{
+	struct rb_node **p, *parent;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
+					  le32_to_cpu(ent->e_rec.e_clusters),
+					  &p, &parent);
+	if (old_ent)
+		return -EEXIST;
+
+	rb_link_node(&ent->e_node, parent, p);
+	rb_insert_color(&ent->e_node, &em->em_extents);
+
+	return 0;
+}
+
+
+/*
+ * Simple rule: on any return code other than -EAGAIN, anything left
+ * in the insert_context will be freed.
+ */
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	ctxt->need_left = 0;
+	ctxt->need_right = 0;
+	ctxt->old_ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+	if (!ret) {
+		ctxt->new_ent = NULL;
+		goto out_unlock;
+	}
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
+					  le32_to_cpu(rec->e_clusters), NULL,
+					  NULL);
+
+	BUG_ON(!old_ent);
+
+	ret = -EEXIST;
+	if (old_ent->e_tree_depth < tree_depth)
+		goto out_unlock;
+
+	if (old_ent->e_tree_depth == tree_depth) {
+		if (!memcmp(rec, &old_ent->e_rec,
+			    sizeof(struct ocfs2_extent_rec)))
+			ret = 0;
+
+		/* FIXME: Should this be ESRCH/EBADR??? */
+		goto out_unlock;
+	}
+
+	/*
+	 * We do it in this order specifically so that no actual tree
+	 * changes occur until we have all the pieces we need.  We
+	 * don't want malloc failures to leave an inconsistent tree.
+	 * Whenever we drop the lock, another process could be
+	 * inserting.  Also note that, if another process just beat us
+	 * to an insert, we might not need the same pieces we needed
+	 * the first go round.  In the end, the pieces we need will
+	 * be used, and the pieces we don't will be freed.
+	 */
+	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
+			     le32_to_cpu(old_ent->e_rec.e_cpos));
+	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
+			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
+			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
+	ret = -EAGAIN;
+	if (ctxt->need_left) {
+		if (!ctxt->left_ent)
+			goto out_unlock;
+		*(ctxt->left_ent) = *old_ent;
+		ctxt->left_ent->e_rec.e_clusters =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
+				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
+	}
+	if (ctxt->need_right) {
+		if (!ctxt->right_ent)
+			goto out_unlock;
+		*(ctxt->right_ent) = *old_ent;
+		ctxt->right_ent->e_rec.e_cpos =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
+				    le32_to_cpu(rec->e_clusters));
+		ctxt->right_ent->e_rec.e_clusters =
+			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
+				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
+				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
+	}
+
+	rb_erase(&old_ent->e_node, &em->em_extents);
+	/* Now that it's erased, set it up for deletion */
+	ctxt->old_ent = old_ent;
+
+	if (ctxt->need_left) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->left_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->left_ent = NULL;
+	}
+
+	if (ctxt->need_right) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->right_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->right_ent = NULL;
+	}
+
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+
+	if (!ret)
+		ctxt->new_ent = NULL;
+
+out_unlock:
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	return ret;
+}
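+
+/*
+ * ocfs2_extent_map_insert() below drives this: it loops on -EAGAIN,
+ * allocating left_ent / right_ent as need_left / need_right demand, and
+ * frees whatever is still hanging off the context once the loop exits.
+ */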
+
+
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth)
+{
+	int ret;
+	struct ocfs2_em_insert_context ctxt = {0, };
+
+	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+	    OCFS2_I(inode)->ip_map.em_clusters) {
+		ret = -EBADR;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Zero e_clusters means a truncated tail record.  It better be EOF */
+	if (!rec->e_clusters) {
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
+		    OCFS2_I(inode)->ip_map.em_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		/* Ignore the truncated tail */
+		return 0;
+	}
+
+	ret = -ENOMEM;
+	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+					GFP_KERNEL);
+	if (!ctxt.new_ent) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ctxt.new_ent->e_rec = *rec;
+	ctxt.new_ent->e_tree_depth = tree_depth;
+
+	do {
+		ret = -ENOMEM;
+		if (ctxt.need_left && !ctxt.left_ent) {
+			ctxt.left_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.left_ent)
+				break;
+		}
+		if (ctxt.need_right && !ctxt.right_ent) {
+			ctxt.right_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.right_ent)
+				break;
+		}
+
+		ret = ocfs2_extent_map_try_insert(inode, rec,
+						  tree_depth, &ctxt);
+	} while (ret == -EAGAIN);
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	if (ctxt.left_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
+	if (ctxt.right_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
+	if (ctxt.old_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
+	if (ctxt.new_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+
+	return ret;
+}
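
The loop above is the allocate-outside-the-lock pattern: GFP_KERNEL
allocations may sleep, so they cannot be made under ip_lock, and -EAGAIN
tells the caller to allocate whatever the locked attempt discovered it was
missing and try again.  A hypothetical userspace sketch of the shape, with a
pthread mutex standing in for the spinlock:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

struct ctx {
	pthread_mutex_t lock;
	int need_piece;		/* discovered under the lock */
	void *piece;		/* preallocated outside the lock */
};

static int try_insert_locked(struct ctx *c)
{
	if (!c->need_piece) {
		c->need_piece = 1;	/* first pass: a piece is needed */
		return -EAGAIN;
	}
	if (!c->piece)
		return -EAGAIN;		/* caller allocates, then retries */
	/* ... splice c->piece into the tree here ... */
	c->piece = NULL;		/* consumed */
	return 0;
}

int insert_with_prealloc(struct ctx *c)
{
	int ret;

	do {
		if (c->need_piece && !c->piece) {
			c->piece = malloc(64);	/* may sleep; no lock held */
			if (!c->piece)
				return -ENOMEM;
		}
		pthread_mutex_lock(&c->lock);
		ret = try_insert_locked(c);
		pthread_mutex_unlock(&c->lock);
	} while (ret == -EAGAIN);

	free(c->piece);		/* unused leftovers are harmless */
	return ret;
}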
+
+/*
+ * Append this record to the tail of the extent map.  It must be
+ * tree_depth 0.  The record might be an extension of an existing
+ * record, which needs to be handled.  e.g.:
+ *
+ * Existing record in the extent map:
+ *
+ *	cpos = 10, len = 10
+ * 	|---------|
+ *
+ * New Record:
+ *
+ *	cpos = 10, len = 20
+ * 	|------------------|
+ *
+ * The passed record is the new on-disk record.  The new_clusters value
+ * is how many clusters were added to the file.  If the append is a
+ * contiguous append, the new_clusters has been added to
+ * rec->e_clusters.  If the append is an entirely new extent, then
+ * rec->e_clusters == new_clusters.
+ */
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct ocfs2_extent_rec *old;
+
+	BUG_ON(!new_clusters);
+	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+
+	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
+			 le32_to_cpu(rec->e_clusters)) !=
+			(em->em_clusters + new_clusters),
+			"Inode %"MLFu64":\n"
+			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
+			"em->em_clusters = %u + new_clusters = %u = %u\n",
+			OCFS2_I(inode)->ip_blkno,
+			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
+			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
+			em->em_clusters, new_clusters,
+			em->em_clusters + new_clusters);
+
+	em->em_clusters += new_clusters;
+
+	ret = -ENOENT;
+	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
+		/* This is a contiguous append */
+		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
+					      NULL, NULL);
+		if (ent) {
+			old = &ent->e_rec;
+			BUG_ON((le32_to_cpu(rec->e_cpos) +
+				le32_to_cpu(rec->e_clusters)) !=
+				 (le32_to_cpu(old->e_cpos) +
+				  le32_to_cpu(old->e_clusters) +
+				  new_clusters));
+			if (ent->e_tree_depth == 0) {
+				BUG_ON(le32_to_cpu(old->e_cpos) !=
+				       le32_to_cpu(rec->e_cpos));
+				BUG_ON(le64_to_cpu(old->e_blkno) !=
+				       le64_to_cpu(rec->e_blkno));
+				ret = 0;
+			}
+			/*
+			 * Let non-leafs fall through as -ENOENT to
+			 * force insertion of the new leaf.
+			 */
+			le32_add_cpu(&old->e_clusters, new_clusters);
+		}
+	}
+
+	if (ret == -ENOENT)
+		ret = ocfs2_extent_map_insert(inode, rec, 0);
+	if (ret < 0)
+		mlog_errno(ret);
+	return ret;
+}
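
For reference, the le32_add_cpu() used above updates the little-endian
on-disk field in place without an explicit temporary; it is effectively:

static inline void le32_add_cpu(__le32 *var, u32 val)
{
	*var = cpu_to_le32(le32_to_cpu(*var) + val);
}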
+
+#if 0
+/* Code here is included but defined out as it completes the extent
+ * map api and may be used in the future. */
+
+/*
+ * Look up the record containing this cluster offset.  This record is
+ * part of the extent map.  Do not free it.  Any changes you make to
+ * it will be reflected in the extent map.  So, if your last extent
+ * is (cpos = 10, clusters = 10) and you truncate the file by 5
+ * clusters, you can do:
+ *
+ * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
+ * rec->e_clusters -= 5;
+ *
+ * The lookup does not read from disk.  If the map isn't filled in for
+ * an entry, you won't find it.
+ *
+ * Also note that the returned record is valid until alloc_sem is
+ * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ */
+int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+			     struct ocfs2_extent_rec **rec,
+			     int *tree_depth)
+{
+	int ret = -ENOENT;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*rec = NULL;
+
+	if (cpos >= OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if (cpos >= em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
+				      NULL, NULL);
+
+	if (ent) {
+		*rec = &ent->e_rec;
+		if (tree_depth)
+			*tree_depth = ent->e_tree_depth;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+int ocfs2_extent_map_get_clusters(struct inode *inode,
+				  u32 v_cpos, int count,
+				  u32 *p_cpos, int *ret_count)
+{
+	int ret;
+	u32 coff, ccount;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	*p_cpos = ccount = 0;
+
+	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if ((v_cpos + count) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
+	if (ret)
+		return ret;
+
+	if (ent) {
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
+							v_cpos,
+							count))
+			return -ESRCH;
+
+		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
+		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+				le64_to_cpu(ent->e_rec.e_blkno)) +
+			  coff;
+
+		if (ret_count)
+			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+#endif  /*  0  */
+
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count)
+{
+	int ret;
+	u64 boff;
+	u32 cpos, clusters;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	struct ocfs2_extent_map_entry *ent = NULL;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_rec *rec;
+
+	*p_blkno = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
+	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
+					    (u64)count + bpc - 1);
+	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if ((cpos + clusters) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ent) {
+		rec = &ent->e_rec;
+
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
+			ret = -ESRCH;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
+						le32_to_cpu(rec->e_cpos));
+		boff += (v_blkno & (u64)(bpc - 1));
+		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
+
+		if (ret_count) {
+			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
+					le32_to_cpu(rec->e_clusters)) - boff;
+		}
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
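
The virtual-to-physical translation above is easiest to follow with
numbers.  A worked instance, assuming (hypothetically) 4K blocks and 32K
clusters, so bpc = 8:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const int bpc_bits = 3;		/* 8 blocks per cluster */
	uint32_t e_cpos = 2;		/* extent starts at virtual cluster 2 */
	uint64_t e_blkno = 800;		/* ... which sits at physical block 800 */
	uint64_t v_blkno = 21;		/* virtual block we want to map */

	uint32_t cpos = v_blkno >> bpc_bits;			/* = 2 */
	uint64_t boff = (uint64_t)(cpos - e_cpos) << bpc_bits;	/* = 0 */

	boff += v_blkno & ((1 << bpc_bits) - 1);		/* + 5 */
	assert(e_blkno + boff == 805);	/* virtual block 21 -> physical 805 */
	return 0;
}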
+
+int ocfs2_extent_map_init(struct inode *inode)
+{
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+
+	em->em_extents = RB_ROOT;
+	em->em_clusters = 0;
+
+	return 0;
+}
+
+/* Needs the lock */
+static void __ocfs2_extent_map_drop(struct inode *inode,
+				    u32 new_clusters,
+				    struct rb_node **free_head,
+				    struct ocfs2_extent_map_entry **tail_ent)
+{
+	struct rb_node *node, *next;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*free_head = NULL;
+
+	ent = NULL;
+	node = rb_last(&em->em_extents);
+	while (node) {
+		next = rb_prev(node);
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
+			break;
+
+		rb_erase(&ent->e_node, &em->em_extents);
+
+		node->rb_right = *free_head;
+		*free_head = node;
+
+		ent = NULL;
+		node = next;
+	}
+
+	/* Do we have an entry straddling new_clusters? */
+	if (tail_ent) {
+		if (ent &&
+		    ((le32_to_cpu(ent->e_rec.e_cpos) +
+		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
+			*tail_ent = ent;
+		else
+			*tail_ent = NULL;
+	}
+}
+
+static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
+{
+	struct rb_node *node;
+	struct ocfs2_extent_map_entry *ent;
+
+	while (free_head) {
+		node = free_head;
+		free_head = node->rb_right;
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		kmem_cache_free(ocfs2_em_ent_cachep, ent);
+	}
+}
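
The helper pair above implements deferred freeing: entries are erased from
the tree under ip_lock and chained through the now-unused rb_right pointer,
so the kmem_cache_free() calls can happen once the lock is released.  The
idiom in plain C, with a hypothetical entry type:

#include <stdlib.h>

struct ent {
	struct ent *rb_left, *rb_right;	/* tree links */
};

/* under the lock: after unlinking from the tree, push onto the free list */
void defer_free(struct ent **free_head, struct ent *e)
{
	e->rb_right = *free_head;	/* rb_right doubles as 'next free' */
	*free_head = e;
}

/* after the lock is dropped: actually free everything */
void drain_free_list(struct ent *head)
{
	while (head) {
		struct ent *next = head->rb_right;

		free(head);
		head = next;
	}
}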
+
+/*
+ * Remove all entries past new_clusters, inclusive of an entry that
+ * contains new_clusters.  This is effectively a cache forget.
+ *
+ * If you want to also clip the last extent by some number of clusters,
+ * you need to call ocfs2_extent_map_trunc().
+ * This code does not check or modify ip_clusters.
+ */
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+{
+	struct rb_node *free_head = NULL;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+
+	if (ent) {
+		rb_erase(&ent->e_node, &em->em_extents);
+		ent->e_node.rb_right = free_head;
+		free_head = &ent->e_node;
+	}
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (free_head)
+		__ocfs2_extent_map_drop_cleanup(free_head);
+
+	return 0;
+}
+
+/*
+ * Remove all entries past new_clusters and also clip any extent
+ * straddling new_clusters, if there is one.  This does not check
+ * or modify ip_clusters
+ */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
+{
+	struct rb_node *free_head = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+
+	if (ent)
+		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
+					       le32_to_cpu(ent->e_rec.e_cpos));
+
+	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (free_head)
+		__ocfs2_extent_map_drop_cleanup(free_head);
+
+	return 0;
+}
+
+int __init init_ocfs2_extent_maps(void)
+{
+	ocfs2_em_ent_cachep =
+		kmem_cache_create("ocfs2_em_ent",
+				  sizeof(struct ocfs2_extent_map_entry),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ocfs2_em_ent_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void __exit exit_ocfs2_extent_maps(void)
+{
+	kmem_cache_destroy(ocfs2_em_ent_cachep);
+}

+ 46 - 0
fs/ocfs2/extent_map.h

@@ -0,0 +1,46 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+
+int init_ocfs2_extent_maps(void);
+void exit_ocfs2_extent_maps(void);
+
+/*
+ * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+ * to be held.  The allocation cannot change at all while the map is
+ * in the process of being updated.
+ */
+int ocfs2_extent_map_init(struct inode *inode);
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters);
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count);
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+
+#endif  /* _EXTENT_MAP_H */
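
A sketch (fragment only, using the names above) of the calling convention
the header comment describes: readers of the map hold ip_alloc_sem shared so
the allocation cannot change mid-lookup.

	down_read(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, 1, &p_blkno, NULL);
	up_read(&OCFS2_I(inode)->ip_alloc_sem);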

+ 1237 - 0
fs/ocfs2/file.c

@@ -0,0 +1,1237 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c
+ *
+ * File open, close, extend, truncate
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "mmap.h"
+#include "suballoc.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_sync_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+	return sync_mapping_buffers(inode->i_mapping);
+}
+
+static int ocfs2_file_open(struct inode *inode, struct file *file)
+{
+	int status;
+	int mode = file->f_flags;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		   file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+
+	/* Check that the inode hasn't been wiped from disk by another
+	 * node. If it hasn't then we're safe as long as we hold the
+	 * spin lock until our increment of open count. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&oi->ip_lock);
+
+		status = -ENOENT;
+		goto leave;
+	}
+
+	if (mode & O_DIRECT)
+		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
+
+	oi->ip_open_count++;
+	spin_unlock(&oi->ip_lock);
+	status = 0;
+leave:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_file_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		       file->f_dentry->d_name.len,
+		       file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+	if (!--oi->ip_open_count)
+		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit(0);
+
+	return 0;
+}
+
+static int ocfs2_sync_file(struct file *file,
+			   struct dentry *dentry,
+			   int datasync)
+{
+	int err = 0;
+	journal_t *journal;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	err = ocfs2_sync_inode(dentry->d_inode);
+	if (err)
+		goto bail;
+
+	journal = osb->journal->j_journal;
+	err = journal_force_commit(journal);
+
+bail:
+	mlog_exit(err);
+
+	return (err < 0) ? -EIO : 0;
+}
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size)
+{
+	int status;
+
+	mlog_entry_void();
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_simple_size_update(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    u64 new_i_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, NULL,
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_set_inode_size(handle, inode, di_bh,
+				   new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(handle);
+out:
+	return ret;
+}
+
+static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     u64 new_i_size)
+{
+	int status;
+	struct ocfs2_journal_handle *handle;
+
+	mlog_entry_void();
+
+	/* TODO: This needs to actually orphan the inode in this
+	 * transaction. */
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_truncate_context *tc = NULL;
+
+	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
+		   OCFS2_I(inode)->ip_blkno, new_i_size);
+
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
+	fe = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
+			"Inode %"MLFu64", inode i_size = %lld != di "
+			"i_size = %"MLFu64", i_flags = 0x%x\n",
+			OCFS2_I(inode)->ip_blkno,
+			i_size_read(inode),
+			le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
+
+	if (new_i_size > le64_to_cpu(fe->i_size)) {
+		mlog(0, "asked to truncate file with size (%"MLFu64") "
+		     "to size (%"MLFu64")!\n",
+		     le64_to_cpu(fe->i_size), new_i_size);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
+	     le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
+
+	/* let's handle the simple truncate cases before doing any more
+	 * cluster locking. */
+	if (new_i_size == le64_to_cpu(fe->i_size))
+		goto bail;
+
+	if (le32_to_cpu(fe->i_clusters) ==
+	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
+		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
+		     fe->i_clusters);
+		/* No allocation change is required, so let's fast path
+		 * this truncate. */
+		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (status < 0)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* This forces other nodes to sync and drop their pages */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_data_unlock(inode, 1);
+
+	/* alright, we're going to need to do a full blown alloc size
+	 * change. Orphan the inode so that recovery can complete the
+	 * truncate if necessary. This does the task of marking
+	 * i_size. */
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* TODO: orphan dir cleanup here. */
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * extend allocation only here.
+ * we'll update all the disk stuff, and oip->alloc_size
+ *
+ * expect stuff to be locked, a transaction started and enough data /
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
+ */
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0;
+	int free_extents;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+
+	BUG_ON(!clusters_to_add);
+
+	free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(fe))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				      &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
+	     num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
+				     num_bits, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_clusters, num_bits);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+
+	if (clusters_to_add) {
+		mlog(0, "need to alloc once more, clusters = %u, wanted = "
+		     "%u\n", fe->i_clusters, clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
+	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
+	return status;
+}
+
+static int ocfs2_extend_allocation(struct inode *inode,
+				   u32 clusters_to_add)
+{
+	int status = 0;
+	int restart_func = 0;
+	int drop_alloc_sem = 0;
+	int credits, num_free_extents;
+	u32 prev_clusters;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto leave;
+	}
+
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
+
+	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     fe->i_clusters, clusters_to_add);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	num_free_extents = ocfs2_num_free_extents(osb,
+						  inode,
+						  fe);
+	if (num_free_extents < 0) {
+		status = num_free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!num_free_extents) {
+		status = ocfs2_reserve_new_metadata(osb,
+						    handle,
+						    fe,
+						    &meta_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					clusters_to_add,
+					&data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* blocks people in read/write from reading our allocation
+	 * until we're done changing it. We depend on i_sem to block
+	 * other extend/truncate calls while we're here. Ordering wrt
+	 * start_trans is important here -- always do it before! */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+
+	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	/* reserve a write to the file entry early on - so that if we
+	 * run out of credits in the allocation path, we can still
+	 * update i_size. */
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
+
+	status = ocfs2_do_extend_allocation(osb,
+					    inode,
+					    clusters_to_add,
+					    bh,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    fe,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
+	     fe->i_clusters, fe->i_size);
+	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
+	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+
+leave:
+	if (drop_alloc_sem) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+		drop_alloc_sem = 0;
+	}
+	if (handle) {
+		ocfs2_commit_trans(handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->commit_write(). */
+static int ocfs2_write_zero_page(struct inode *inode,
+				 u64 size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index;
+	unsigned int offset;
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret;
+
+	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+	/*
+	 * ugh.  in prepare/commit_write, if from==to==start of block, we
+	 * skip the prepare.  make sure we never send an offset for the
+	 * start of a block.
+	 */
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		handle = ocfs2_start_walk_page_trans(inode, page, offset,
+						     offset);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock;
+		}
+	}
+
+	/* must not update i_size! */
+	ret = block_commit_write(page, offset, offset);
+	if (ret < 0)
+		mlog_errno(ret);
+	else
+		ret = 0;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
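
A worked instance of the offset fixup above, assuming (hypothetically) 4K
pages and a 1K filesystem block size:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t size = 0x3400;			 /* i_size, 1K-block aligned */
	unsigned int offset = size & (4096 - 1); /* 0x400 within the page */

	/* never hand prepare_write from == to == the start of a block */
	if ((offset & (1024 - 1)) == 0)
		offset++;

	assert(offset == 0x401);
	return 0;
}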
+
+static int ocfs2_zero_extend(struct inode *inode,
+			     u64 zero_to_size)
+{
+	int ret = 0;
+	u64 start_off;
+	struct super_block *sb = inode->i_sb;
+
+	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	while (start_off < zero_to_size) {
+		ret = ocfs2_write_zero_page(inode, start_off);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start_off += sb->s_blocksize;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	u32 clusters_to_add;
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
+		OCFS2_I(inode)->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_zero_extend(inode, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* Update i_size; when no allocation was required above, this is
+	 * the only change we need to make. */
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int status = 0, size_change;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+	           dentry->d_name.len, dentry->d_name.name);
+
+	if (attr->ia_valid & ATTR_MODE)
+		mlog(0, "mode change: %d\n", attr->ia_mode);
+	if (attr->ia_valid & ATTR_UID)
+		mlog(0, "uid change: %d\n", attr->ia_uid);
+	if (attr->ia_valid & ATTR_GID)
+		mlog(0, "gid change: %d\n", attr->ia_gid);
+	if (attr->ia_valid & ATTR_SIZE)
+		mlog(0, "size change...\n");
+	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
+		mlog(0, "time change...\n");
+
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+			   | ATTR_GID | ATTR_UID | ATTR_MODE)
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
+		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+		return 0;
+	}
+
+	status = inode_change_ok(inode, attr);
+	if (status)
+		return status;
+
+	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
+	if (size_change) {
+		status = ocfs2_rw_lock(inode, 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail_unlock_rw;
+	}
+
+	if (size_change && attr->ia_size != i_size_read(inode)) {
+		if (i_size_read(inode) > attr->ia_size)
+			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
+		else
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			status = -ENOSPC;
+			goto bail_unlock;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = inode_setattr(inode, attr);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode, 1);
+bail_unlock_rw:
+	if (size_change)
+		ocfs2_rw_unlock(inode, 1);
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct ocfs2_super *osb = sb->s_fs_info;
+	int err;
+
+	mlog_entry_void();
+
+	err = ocfs2_inode_revalidate(dentry);
+	if (err) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	generic_fillattr(inode, stat);
+
+	/* We set the blksize from the cluster size for performance */
+	stat->blksize = osb->s_clustersize;
+
+bail:
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+
+	mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
+		   inode->i_mode);
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_trans;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_bh;
+	}
+
+	inode->i_mode &= ~S_ISUID;
+	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
+		inode->i_mode &= ~S_ISGID;
+
+	di = (struct ocfs2_dinode *) bh->b_data;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_bh:
+	brelse(bh);
+out_trans:
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_write_should_remove_suid(struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+
+	if (!capable(CAP_FSETID)) {
+		if (unlikely(mode & S_ISUID))
+			return 1;
+
+		if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+			return 1;
+	}
+	return 0;
+}
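
The mode logic of the two helpers above, restated as a self-contained sketch
with a few worked mode values:

#include <assert.h>
#include <sys/stat.h>

static mode_t strip_suid_sgid(mode_t mode)
{
	mode &= ~S_ISUID;
	if ((mode & S_ISGID) && (mode & S_IXGRP))
		mode &= ~S_ISGID;
	return mode;
}

int main(void)
{
	assert(strip_suid_sgid(04755) == 0755);	 /* setuid cleared */
	assert(strip_suid_sgid(02755) == 0755);	 /* setgid + group-exec cleared */
	assert(strip_suid_sgid(02644) == 02644); /* setgid without group-exec
						  * (mandatory locking) kept */
	return 0;
}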
+
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
+	u32 clusters;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	loff_t newsize, saved_pos;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	/* happy write of zero bytes */
+	if (count == 0)
+		return 0;
+
+	if (!inode) {
+		mlog(0, "bad inode\n");
+		return -EIO;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	/* ugh, work around some applications which open everything O_DIRECT +
+	 * O_APPEND and really don't mean to use O_DIRECT. */
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
+		filp->f_flags &= ~O_DIRECT;
+#endif
+
+	down(&inode->i_sem);
+	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
+	if (filp->f_flags & O_DIRECT) {
+		have_alloc_sem = 1;
+		down_read(&inode->i_alloc_sem);
+	}
+
+	/* concurrent O_DIRECT writes are allowed */
+	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	ret = ocfs2_rw_lock(inode, rw_level);
+	if (ret < 0) {
+		rw_level = -1;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We sample i_size under a read level meta lock to see if our write
+	 * is extending the file; if it is, we back off and get a write level
+	 * meta lock.
+	 */
+	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
+	for (;;) {
+		ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+		if (ret < 0) {
+			meta_level = -1;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Clear suid / sgid if necessary. We do this here
+		 * instead of later in the write path because
+		 * remove_suid() calls ->setattr without any hint that
+		 * we may have already done our cluster locking. Since
+		 * ocfs2_setattr() *must* take cluster locks to
+		 * proceed, this will lead us to recursively lock the
+		 * inode. There's also the dinode i_size state which
+		 * can be lost via setattr during extending writes (we
+		 * set inode->i_size at the end of a write). */
+		if (ocfs2_write_should_remove_suid(inode)) {
+			if (meta_level == 0) {
+				ocfs2_meta_unlock(inode, meta_level);
+				meta_level = 1;
+				continue;
+			}
+
+			ret = ocfs2_write_remove_suid(inode);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* work on a copy of ppos until we're sure that we won't have
+		 * to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		/* No need for a higher level metadata lock if we're
+		 * never going past i_size. */
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (meta_level == 0) {
+			ocfs2_meta_unlock(inode, meta_level);
+			meta_level = 1;
+			continue;
+		}
+
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+			OCFS2_I(inode)->ip_clusters;
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		mlog(0, "Writing at EOF, may need more allocation: "
+		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+		     i_size_read(inode), newsize, clusters);
+
+		/* We only want to continue the rest of this loop if
+		 * our extend will actually require more
+		 * allocation. */
+		if (!clusters)
+			break;
+
+		ret = ocfs2_extend_allocation(inode, clusters);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/* Fill any holes which would've been created by this
+		 * write. If we're O_APPEND, this will wind up
+		 * (correctly) being a noop. */
+		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/* ok, we're done with i_size and alloc work */
+	iocb->ki_pos = saved_pos;
+	ocfs2_meta_unlock(inode, meta_level);
+	meta_level = -1;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    filp->f_flags & O_DIRECT) {
+		unsigned int saved_flags = filp->f_flags;
+		int sector_size = 1 << osb->s_sectsize_bits;
+
+		if ((saved_pos & (sector_size - 1)) ||
+		    (count & (sector_size - 1)) ||
+		    ((unsigned long)buf & (sector_size - 1))) {
+			filp->f_flags |= O_SYNC;
+			filp->f_flags &= ~O_DIRECT;
+		}
+
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+		filp->f_flags = saved_flags;
+	} else
+#endif
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/*
+	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
+	 * function pointer which is called when o_direct io completes so that
+	 * it can unlock our rw lock.  (it's the clustered equivalent of
+	 * i_alloc_sem; protects truncate from racing with pending ios).
+	 * Unfortunately there are error cases which call end_io and others
+	 * that don't, so we don't have to unlock the rw_lock if either an
+	 * async dio is going to do it in the future or an end_io after an
+	 * error has already done it.
+	 */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+out:
+	if (meta_level != -1)
+		ocfs2_meta_unlock(inode, meta_level);
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+	up(&inode->i_sem);
+
+	mlog_exit(ret);
+	return ret;
+}
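
The for (;;) loop above is a read-to-write lock upgrade: sample i_size under
the shared meta lock, and if the write turns out to need exclusive work
(extending the file, clearing suid), drop the lock, re-take it exclusively,
and redo every check from scratch.  A hypothetical userspace sketch, with a
pthread rwlock in place of the cluster lock and extern stubs standing in for
the real checks:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t meta = PTHREAD_RWLOCK_INITIALIZER;

extern bool write_extends_file(void);	/* must be re-sampled under the lock */
extern void extend_allocation(void);	/* needs the exclusive lock */

void lock_for_write(int *level)
{
	*level = 0;			/* optimistic: shared */
	for (;;) {
		if (*level)
			pthread_rwlock_wrlock(&meta);
		else
			pthread_rwlock_rdlock(&meta);

		if (!write_extends_file())
			break;		/* the shared level was enough */

		if (*level == 0) {	/* upgrade: drop, retry exclusive */
			pthread_rwlock_unlock(&meta);
			*level = 1;
			continue;
		}

		extend_allocation();	/* safe: held exclusively */
		break;
	}
	/* returns with the lock held at *level */
}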
+
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
+{
+	int ret = 0, rw_level = -1, have_alloc_sem = 0;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		if (filp->f_flags & O_DIRECT) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1)) ||
+			    (i_size_read(inode) & (sector_size -1))) {
+				filp->f_flags &= ~O_DIRECT;
+			}
+		}
+	}
+#endif
+
+	/*
+	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
+	 * need locks to protect pending reads from racing with truncate.
+	 */
+	if (filp->f_flags & O_DIRECT) {
+		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
+
+		ret = ocfs2_rw_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+		rw_level = 0;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_rw_locked(iocb);
+	}
+
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	if (ret == -EINVAL)
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* see ocfs2_file_aio_write */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+bail:
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct inode_operations ocfs2_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct inode_operations ocfs2_special_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct file_operations ocfs2_fops = {
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.sendfile	= generic_file_sendfile,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+};
+
+struct file_operations ocfs2_dops = {
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+};

+ 57 - 0
fs/ocfs2/file.h

@@ -0,0 +1,57 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_FILE_H
+#define OCFS2_FILE_H
+
+extern struct file_operations ocfs2_fops;
+extern struct file_operations ocfs2_dops;
+extern struct inode_operations ocfs2_file_iops;
+extern struct inode_operations ocfs2_special_file_iops;
+struct ocfs2_alloc_context;
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
+int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size);
+
+#endif /* OCFS2_FILE_H */

+ 378 - 0
fs/ocfs2/heartbeat.c

@@ -0,0 +1,378 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Register ourselves with the heartbeat service, keep our node maps
+ * up to date, and fire off recovery when needed.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kmod.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
+#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit);
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit);
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb)
+{
+	spin_lock_init(&osb->node_map_lock);
+	ocfs2_node_map_init(&osb->mounted_map);
+	ocfs2_node_map_init(&osb->recovery_map);
+	ocfs2_node_map_init(&osb->umount_map);
+}
+
+static void ocfs2_do_node_down(int node_num,
+			       struct ocfs2_super *osb)
+{
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "ocfs2: node down event for %d\n", node_num);
+
+	if (!osb->dlm) {
+		/*
+		 * No DLM means we're not even ready to participate yet.
+		 * We check the slots after the DLM comes up, so we will
+		 * notice the node death then.  We can safely ignore it
+		 * here.
+		 */
+		return;
+	}
+
+	if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+		/* If a node is in the umount map, then we've been
+		 * expecting him to go down and we know ahead of time
+		 * that recovery is not necessary. */
+		ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+		return;
+	}
+
+	ocfs2_recovery_thread(osb, node_num);
+
+	ocfs2_remove_node_from_vote_queues(osb, node_num);
+}
+
+static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
+				  int node_num,
+				  void *data)
+{
+	ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
+}
+
+/* Called from the dlm when it's about to evict a node. We may also
+ * get a heartbeat callback later. */
+static void ocfs2_dlm_eviction_cb(int node_num,
+				  void *data)
+{
+	struct ocfs2_super *osb = (struct ocfs2_super *) data;
+	struct super_block *sb = osb->sb;
+
+	mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
+	     MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
+
+	ocfs2_do_node_down(node_num, osb);
+}
+
+static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
+				int node_num,
+				void *data)
+{
+	struct ocfs2_super *osb = data;
+
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "node up event for %d\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
+{
+	o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
+			    ocfs2_hb_node_down_cb, osb,
+			    OCFS2_HB_NODE_DOWN_PRI);
+
+	o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
+			    ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
+
+	/* Not exactly a heartbeat callback, but leads to essentially
+	 * the same path so we set it up here. */
+	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
+			      ocfs2_dlm_eviction_cb,
+			      osb);
+}
+
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_register_callback(&osb->osb_hb_down);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = o2hb_register_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	return status;
+}
+
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_unregister_callback(&osb->osb_hb_down);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = o2hb_unregister_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+}
+
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	if (!osb->uuid_str) {
+		/* This can happen if we don't get far enough in mount... */
+		mlog(0, "No UUID with which to stop heartbeat!\n\n");
+		return;
+	}
+
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = osb->uuid_str;
+	argv[4] = NULL;
+
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+	       sizeof(unsigned long));
+}
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit)
+{
+	set_bit(bit, map->map);
+}
+
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	if (bit == -1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit)
+{
+	clear_bit(bit, map->map);
+}
+
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit)
+{
+	if (bit == -1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
+	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
+{
+	int bit;
+	bit = find_next_bit(map->map, map->num_nodes, 0);
+	if (bit < map->num_nodes)
+		return 0;
+	return 1;
+}
+
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map)
+{
+	int ret;
+	BUG_ON(map->num_nodes == 0);
+	spin_lock(&osb->node_map_lock);
+	ret = __ocfs2_node_map_is_empty(map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	BUG_ON(from->num_nodes == 0);
+	ocfs2_node_map_init(target);
+	__ocfs2_node_map_set(target, from);
+}
+
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit)
+{
+	struct ocfs2_node_map temp;
+	int ret;
+
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_dup(&temp, target);
+	__ocfs2_node_map_clear_bit(&temp, bit);
+	ret = __ocfs2_node_map_is_empty(&temp);
+	spin_unlock(&osb->node_map_lock);
+
+	return ret;
+}
+
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	int num_longs, i;
+
+	BUG_ON(target->num_nodes != from->num_nodes);
+	BUG_ON(target->num_nodes == 0);
+
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i = 0; i < num_longs; i++)
+		target->map[i] = from->map[i];
+}
+
+/* Returns whether the recovery bit was actually set - it may not be
+ * if a node is still marked as needing recovery */
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num)
+{
+	int set = 0;
+
+	spin_lock(&osb->node_map_lock);
+
+	__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
+
+	if (!test_bit(num, osb->recovery_map.map)) {
+		__ocfs2_node_map_set_bit(&osb->recovery_map, num);
+		set = 1;
+	}
+
+	spin_unlock(&osb->node_map_lock);
+
+	return set;
+}
+
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
+
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx)
+{
+	int i = idx;
+
+	idx = O2NM_INVALID_NODE_NUM;
+	spin_lock(&osb->node_map_lock);
+	if ((i != O2NM_INVALID_NODE_NUM) &&
+	    (i >= 0) &&
+	    (i < map->num_nodes)) {
+		while (i < map->num_nodes) {
+			if (test_bit(i, map->map)) {
+				idx = i;
+				break;
+			}
+			i++;
+		}
+	}
+	spin_unlock(&osb->node_map_lock);
+	return idx;
+}
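
ocfs2_node_map_iterate() returns the first set bit at or after idx, or
O2NM_INVALID_NODE_NUM once none remain, which gives callers a simple scan
loop.  A usage sketch (fragment, assuming an osb in scope):

	int node = ocfs2_node_map_first_set_bit(osb, &osb->recovery_map);

	while (node != O2NM_INVALID_NODE_NUM) {
		/* ... act on 'node', e.g. queue it for recovery ... */
		node = ocfs2_node_map_iterate(osb, &osb->recovery_map,
					      node + 1);
	}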

+ 67 - 0
fs/ocfs2/heartbeat.h

@@ -0,0 +1,67 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_HEARTBEAT_H
+#define OCFS2_HEARTBEAT_H
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb);
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs2_node_map_init(struct ocfs2_node_map *map);
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map);
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit);
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx);
+static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
+					       struct ocfs2_node_map *map)
+{
+	return ocfs2_node_map_iterate(osb, map, 0);
+}
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num);
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit);
+
+#endif /* OCFS2_HEARTBEAT_H */

+ 1140 - 0
fs/ocfs2/inode.c

@@ -0,0 +1,1140 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c
+ *
+ * vfs' aops, fops, dops and iops
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_FI_FLAG_NOWAIT	0x1
+#define OCFS2_FI_FLAG_DELETE	0x2
+struct ocfs2_find_inode_args
+{
+	u64		fi_blkno;
+	unsigned long	fi_ino;
+	unsigned int	fi_flags;
+};
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args);
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
+static int ocfs2_find_actor(struct inode *inode, void *opaque);
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh);
+
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote)
+{
+	struct ocfs2_find_inode_args args;
+
+	/* ocfs2_ilookup_for_vote should *only* be called from the
+	 * vote thread */
+	BUG_ON(current != osb->vote_task);
+
+	args.fi_blkno = blkno;
+	args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
+	if (delete_vote)
+		args.fi_flags |= OCFS2_FI_FLAG_DELETE;
+	args.fi_ino = ino_from_blkno(osb->sb, blkno);
+	return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
+}
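+
+/* Hypothetical sketch of a vote-thread caller: a delete vote looks
+ * the inode up without blocking on one that is being freed. A
+ * non-NULL return carries a reference the caller must drop. */
+static void example_process_delete_vote(struct ocfs2_super *osb, u64 blkno)
+{
+	struct inode *inode = ocfs2_ilookup_for_vote(osb, blkno, 1);
+
+	if (inode)
+		iput(inode);
+}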
+
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
+{
+	struct inode *inode = NULL;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_find_inode_args args;
+
+	mlog_entry("(blkno = %"MLFu64")\n", blkno);
+
+	/* OK. By now we've either got the offsets passed to us by the
+	 * caller, or we just pulled them off the bh. Let's do some
+	 * sanity checks to make sure they're OK. */
+	if (blkno == 0) {
+		inode = ERR_PTR(-EINVAL);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+
+	inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
+			     ocfs2_init_locked_inode, &args);
+	/* inode was *not* in the inode cache. 2.6.x requires
+	 * us to do our own read_inode call and unlock it
+	 * afterwards. */
+	if (inode && inode->i_state & I_NEW) {
+		mlog(0, "Inode was not in inode cache, reading it.\n");
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+	if (inode == NULL) {
+		inode = ERR_PTR(-ENOMEM);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = ERR_PTR(-ESTALE);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+bail:
+	if (!IS_ERR(inode)) {
+		mlog(0, "returning inode with number %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		mlog_exit_ptr(inode);
+	} else
+		mlog_errno(PTR_ERR(inode));
+
+	return inode;
+}
+
+
+/*
+ * here's how inodes get read from disk:
+ * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
+ * found? : return the in-memory inode
+ * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
+ */
+
+static int ocfs2_find_actor(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
+
+	args = opaque;
+
+	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+
+	if (oi->ip_blkno != args->fi_blkno)
+		goto bail;
+
+	/* OCFS2_FI_FLAG_NOWAIT is *only* set from
+	 * ocfs2_ilookup_for_vote which won't create an inode for one
+	 * that isn't found. The vote thread doesn't want to get an
+	 * inode which is in the process of going away - otherwise
+	 * the call to __wait_on_freeing_inode in find_inode_fast will
+	 * cause it to deadlock on an inode which may be waiting on a
+	 * vote (or lock release) in delete_inode */
+	if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
+	    (inode->i_state & (I_FREEING|I_CLEAR))) {
+		/* As stated above, we're not going to return an
+		 * inode.  In the case of a delete vote, the voting
+		 * code is going to signal the other node to go
+		 * ahead. Mark that state here, so this freeing inode
+		 * has the state when it gets to delete_inode. */
+		if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
+			spin_lock(&oi->ip_lock);
+			ocfs2_mark_inode_remotely_deleted(inode);
+			spin_unlock(&oi->ip_lock);
+		}
+		goto bail;
+	}
+
+	ret = 1;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * initialize the new inode, but don't do anything that would cause
+ * us to sleep.
+ * return 0 on success, 1 on failure
+ */
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = opaque;
+
+	mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+
+	inode->i_ino = args->fi_ino;
+	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+
+	mlog_exit(0);
+	return 0;
+}
+
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+		     	 int create_ino)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	int status = -EINVAL;
+
+	mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
+
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	/* this means that read_inode cannot create a superblock inode
+	 * today. Change if needed. */
+	if (!OCFS2_IS_VALID_DINODE(fe) ||
+	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
+		     "signature = %.*s, flags = 0x%x\n",
+		     inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature, le32_to_cpu(fe->i_flags));
+		goto bail;
+	}
+
+	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
+		mlog(ML_ERROR, "file entry generation does not match "
+		     "superblock! osb->fs_generation=%x, "
+		     "fe->i_fs_generation=%x\n",
+		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
+		goto bail;
+	}
+
+	inode->i_version = 1;
+	inode->i_generation = le32_to_cpu(fe->i_generation);
+	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_blksize = (u32)osb->s_clustersize;
+
+	/* Fast symlinks will have i_size but no allocated clusters. */
+	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+	inode->i_mapping->a_ops = &ocfs2_aops;
+	inode->i_flags |= S_NOATIME;
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
+		mlog(ML_ERROR,
+		     "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
+		     OCFS2_I(inode)->ip_blkno, fe->i_blkno);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+
+	if (create_ino)
+		inode->i_ino = ino_from_blkno(inode->i_sb,
+			       le64_to_cpu(fe->i_blkno));
+
+	mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
+	     fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
+
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
+		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
+		/* we can't actually hit this as read_inode can't
+		 * handle superblocks today ;-) */
+		BUG();
+	}
+
+	switch (inode->i_mode & S_IFMT) {
+	    case S_IFREG:
+		    inode->i_fop = &ocfs2_fops;
+		    inode->i_op = &ocfs2_file_iops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFDIR:
+		    inode->i_op = &ocfs2_dir_iops;
+		    inode->i_fop = &ocfs2_dops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFLNK:
+		    if (ocfs2_inode_is_fast_symlink(inode))
+			inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		    else
+			inode->i_op = &ocfs2_symlink_inode_operations;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    default:
+		    inode->i_op = &ocfs2_special_file_iops;
+		    init_special_inode(inode, inode->i_mode,
+				       inode->i_rdev);
+		    break;
+	}
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
+				  OCFS2_LOCK_TYPE_RW, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+				  OCFS2_LOCK_TYPE_META, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
+				  OCFS2_LOCK_TYPE_DATA, inode);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *fe;
+	struct buffer_head *bh = NULL;
+	int status;
+	int sysfile = 0;
+
+	mlog_entry("(0x%p, 0x%p)\n", inode, args);
+
+	status = -EINVAL;
+	if (inode == NULL || inode->i_sb == NULL) {
+		mlog(ML_ERROR, "bad inode\n");
+		goto bail;
+	}
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	if (!args) {
+		mlog(ML_ERROR, "bad inode args\n");
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	/* Read the FE off disk. This is safe because the kernel only
+	 * does one read_inode2 for a new inode, and if it doesn't
+	 * exist yet then nobody can be working on it! */
+	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+		sysfile = 1;
+
+	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
+	    S_ISBLK(le16_to_cpu(fe->i_mode)))
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+
+	status = -EINVAL;
+	if (ocfs2_populate_inode(inode, fe, 0) < 0) {
+		mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
+		     "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+
+	if (sysfile)
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+
+	status = 0;
+
+bail:
+	if (args && bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_sync_blockdev(struct super_block *sb)
+{
+	sync_blockdev(sb->s_bdev);
+}
+
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh)
+{
+	int status = 0;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_truncate_context *tc = NULL;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/* zero allocation, zero truncate :) */
+	if (!fe->i_clusters)
+		goto bail;
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_remove_inode(struct inode *inode,
+			      struct buffer_head *di_bh,
+			      struct inode *orphan_dir_inode,
+			      struct buffer_head *orphan_dir_bh)
+{
+	int status;
+	struct inode *inode_alloc_inode = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    le16_to_cpu(di->i_suballoc_slot));
+	if (!inode_alloc_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode_alloc_inode->i_sem);
+	status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
+	if (status < 0) {
+		up(&inode_alloc_inode->i_sem);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	/* set the inode's dtime */
+	status = ocfs2_journal_access(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+
+	status = ocfs2_journal_dirty(handle, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	ocfs2_remove_from_cache(inode, di_bh);
+
+	status = ocfs2_free_dinode(handle, inode_alloc_inode,
+				   inode_alloc_bh, di);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode_alloc_inode, 1);
+	up(&inode_alloc_inode->i_sem);
+	brelse(inode_alloc_bh);
+bail:
+	iput(inode_alloc_inode);
+
+	return status;
+}
+
+static int ocfs2_wipe_inode(struct inode *inode,
+			    struct buffer_head *di_bh)
+{
+	int status, orphaned_slot;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We've already voted on this so it should be readonly - no
+	 * spinlock needed. */
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       orphaned_slot);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Lock the orphan dir. The lock will be held for the entire
+	 * delete_inode operation. We do this now to avoid races with
+	 * recovery completion on other nodes. */
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to vote for
+	 * an inode delete on us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
+				    orphan_dir_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_unlock_dir:
+	ocfs2_meta_unlock(orphan_dir_inode, 1);
+	up(&orphan_dir_inode->i_sem);
+	brelse(orphan_dir_bh);
+bail:
+	iput(orphan_dir_inode);
+
+	return status;
+}
+
+/* There is a series of simple checks that should be done before a
+ * vote is even considered. Encapsulate those in this function. */
+static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We shouldn't be getting here for the root directory
+	 * inode.. */
+	if (inode == osb->root_inode) {
+		mlog(ML_ERROR, "Skipping delete of root inode.\n");
+		goto bail;
+	}
+
+	/* If we're coming from process_vote we can't go into our own
+	 * voting [hello, deadlock city!], so unfortunately we just
+	 * have to skip deleting this guy. That's OK though because
+	 * the node that's doing the actual deleting should handle it
+	 * anyway. */
+	if (current == osb->vote_task) {
+		mlog(0, "Skipping delete of %lu because we're currently "
+		     "in process_vote\n", inode->i_ino);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	/* OCFS2 *never* deletes system files. This should technically
+	 * never get here as system file inodes should always have a
+	 * positive link count. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
+		     oi->ip_blkno);
+		goto bail_unlock;
+	}
+
+	/* If we have voted "yes" on the wipe of this inode for
+	 * another node, it will be marked here so we can safely skip
+	 * it. Recovery will clean up any inodes we might inadvertently
+	 * skip here. */
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
+		mlog(0, "Skipping delete of %lu because another node "
+		     "has done this for us.\n", inode->i_ino);
+		goto bail_unlock;
+	}
+
+	ret = 1;
+bail_unlock:
+	spin_unlock(&oi->ip_lock);
+bail:
+	return ret;
+}
+
+/* Query the cluster to determine whether we should wipe an inode from
+ * disk or not.
+ *
+ * Requires the inode to have the cluster lock. */
+static int ocfs2_query_inode_wipe(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  int *wipe)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di;
+
+	*wipe = 0;
+
+	/* While we were waiting for the cluster lock in
+	 * ocfs2_delete_inode, another node might have asked to delete
+	 * the inode. Recheck our flags to catch this. */
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
+		     oi->ip_blkno);
+		goto bail;
+	}
+
+	/* Now that we have an up to date inode, we can double check
+	 * the link count. */
+	if (inode->i_nlink) {
+		mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
+		     oi->ip_blkno, inode->i_nlink);
+		goto bail;
+	}
+
+	/* Do some basic inode verification... */
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+		/* for lack of a better error? */
+		status = -EEXIST;
+		mlog(ML_ERROR,
+		     "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
+		     "Disk flags  0x%x, inode flags 0x%x\n",
+		     oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
+		goto bail;
+	}
+
+	/* has someone already deleted us?! baaad... */
+	if (di->i_dtime) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_request_delete_vote(inode);
+	/* -EBUSY means that other nodes are still using the
+	 * inode. We're done here though, so avoid doing anything on
+	 * disk and let them worry about deleting it. */
+	if (status == -EBUSY) {
+		status = 0;
+		mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
+		     "other nodes\n", oi->ip_blkno);
+		goto bail;
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
+		/* Nobody knew which slot this inode was orphaned
+		 * into. This may happen during node death and
+		 * recovery knows how to clean it up so we can safely
+		 * ignore this inode from now on. */
+		mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
+		     oi->ip_blkno);
+	} else {
+		*wipe = 1;
+
+		mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
+		     oi->ip_blkno, oi->ip_orphaned_slot);
+	}
+	spin_unlock(&oi->ip_lock);
+
+bail:
+	return status;
+}
+
+/* Support function for ocfs2_delete_inode. Will help us keep the
+ * inode data in a consistent state for clear_inode. Always truncates
+ * pages, optionally syncing them first. */
+static void ocfs2_cleanup_delete_inode(struct inode *inode,
+				       int sync_data)
+{
+	mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
+	     OCFS2_I(inode)->ip_blkno, sync_data);
+	if (sync_data)
+		write_inode_now(inode, 1);
+	truncate_inode_pages(&inode->i_data, 0);
+}
+
+void ocfs2_delete_inode(struct inode *inode)
+{
+	int wipe, status;
+	sigset_t blocked, oldset;
+	struct buffer_head *di_bh = NULL;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	if (is_bad_inode(inode)) {
+		mlog(0, "Skipping delete of bad inode\n");
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		/* It's probably not necessary to truncate_inode_pages
+		 * here but we do it for safety anyway (it will most
+		 * likely be a no-op anyway) */
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail;
+	}
+
+	/* We want to block signals in delete_inode as the lock and
+	 * messaging paths may return us -ERESTARTSYS. Which would
+	 * cause us to exit early, resulting in inodes being orphaned
+	 * forever. */
+	sigfillset(&blocked);
+	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail;
+	}
+
+	/* Lock down the inode. This gives us an up to date view of
+	 * its metadata (for verification), and allows us to
+	 * serialize delete_inode votes.
+	 *
+	 * Even though we might be doing a truncate, we don't take the
+	 * allocation lock here as it won't be needed - nobody will
+	 * have the file open.
+	 */
+	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
+
+	/* Query the cluster. This will be the final decision made
+	 * before we go ahead and wipe the inode. */
+	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
+	if (!wipe || status < 0) {
+		/* Error and inode busy vote both mean we won't be
+		 * removing the inode, so they take almost the same
+		 * path. */
+		if (status < 0)
+			mlog_errno(status);
+
+		/* Someone in the cluster has voted to not wipe this
+		 * inode, or it was never completely orphaned. Write
+		 * out the pages and exit now. */
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail_unlock_inode;
+	}
+
+	ocfs2_cleanup_delete_inode(inode, 0);
+
+	status = ocfs2_wipe_inode(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	/* Mark the inode as successfully deleted. This is important
+	 * for ocfs2_clear_inode as it will check this flag and skip
+	 * any checkpointing work */
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
+bail_unlock_inode:
+	ocfs2_meta_unlock(inode, 1);
+	brelse(di_bh);
+bail_unblock:
+	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	clear_inode(inode);
+	mlog_exit_void();
+}
+
+void ocfs2_clear_inode(struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	if (!inode)
+		goto bail;
+
+	mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
+	     OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+
+	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+			"Inode=%lu\n", inode->i_ino);
+
+	/* Do these before all the other work so that we don't bounce
+	 * the vote thread while waiting to destroy the locks. */
+	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+
+	/* We very well may get a clear_inode before all of an inode's
+	 * metadata has hit disk. Of course, we can't drop any cluster
+	 * locks until the journal has finished with it. The only
+	 * exceptions here are successfully wiped inodes - their
+	 * metadata can now be considered to be part of the system
+	 * inode from which it came. */
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+		ocfs2_checkpoint_inode(inode);
+
+	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
+			"Clear inode of %"MLFu64", inode has io markers\n",
+			oi->ip_blkno);
+
+	ocfs2_extent_map_drop(inode, 0);
+	ocfs2_extent_map_init(inode);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_meta_lockres);
+	ocfs2_lock_res_free(&oi->ip_data_lockres);
+
+	ocfs2_metadata_cache_purge(inode);
+
+	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+			"Clear inode of %"MLFu64", inode has %u cache items\n",
+			oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Clear inode of %"MLFu64", inode has a bad flag\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
+			"Clear inode of %"MLFu64", inode is locked\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
+			"Clear inode of %"MLFu64", io_sem is locked\n",
+			oi->ip_blkno);
+	up(&oi->ip_io_sem);
+
+	/*
+	 * Note the asymmetry: down_trylock() returns 0 on success,
+	 * while down_write_trylock() returns 1 on success.
+	 */
+	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
+			"Clear inode of %"MLFu64", alloc_sem is locked\n",
+			oi->ip_blkno);
+	up_write(&oi->ip_alloc_sem);
+
+	mlog_bug_on_msg(oi->ip_open_count,
+			"Clear inode of %"MLFu64" has open count %d\n",
+			oi->ip_blkno, oi->ip_open_count);
+	mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
+			"Clear inode of %"MLFu64" has non empty handle list\n",
+			oi->ip_blkno);
+	mlog_bug_on_msg(oi->ip_handle,
+			"Clear inode of %"MLFu64" has non empty handle pointer\n",
+			oi->ip_blkno);
+
+	/* Clear all other flags. */
+	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
+	oi->ip_created_trans = 0;
+	oi->ip_last_trans = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+bail:
+	mlog_exit_void();
+}
+
+/* Called under inode_lock, with no more references on the
+ * struct inode, so it's safe here to check the flags field
+ * and to manipulate i_nlink without any other locks. */
+void ocfs2_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
+	     oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+
+	/* Testing ip_orphaned_slot here wouldn't work because we may
+	 * not have gotten a delete_inode vote from any other nodes
+	 * yet. */
+	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
+		inode->i_nlink = 0;
+	}
+
+	generic_drop_inode(inode);
+
+	mlog_exit_void();
+}
+
+/*
+ * TODO: this should probably be merged into ocfs2_get_block
+ *
+ * However, you now need to pay attention to the cont_prepare_write()
+ * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
+ * expects never to extend).
+ */
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = OCFS2_BH_CACHED;
+
+#if 0
+	/* only turn this on if we know we can deal with read_block
+	 * returning nothing */
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+#endif
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		return NULL;
+	}
+
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+					     &p_blkno, NULL);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
+				  readflags, inode);
+	if (tmperr < 0)
+		goto fail;
+
+	tmperr = 0;
+
+	*err = 0;
+	return bh;
+
+fail:
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+	*err = -EIO;
+	return NULL;
+}
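+
+/* Minimal sketch (hypothetical caller): read logical block 0 of an
+ * inode with no readahead; 'err' is only meaningful on NULL return. */
+static struct buffer_head *example_read_first_block(struct inode *inode)
+{
+	int err = 0;
+
+	return ocfs2_bread(inode, 0, &err, 0);
+}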
+
+/*
+ * This is called from our getattr.
+ */
+int ocfs2_inode_revalidate(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int status = 0;
+
+	mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
+		   inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
+
+	if (!inode) {
+		mlog(0, "eep, no inode!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		mlog(0, "inode deleted!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Let ocfs2_meta_lock do the work of updating our struct
+	 * inode for us. */
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_meta_unlock(inode, 0);
+bail:
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * Updates a disk inode from a struct inode.
+ * Only takes ip_lock.
+ */
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh)
+{
+	int status;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+
+	mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	fe->i_size = cpu_to_le64(i_size_read(inode));
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0;
+leave:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Updates a struct inode from a disk inode.
+ * Does no I/O; only takes ip_lock.
+ */
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	i_size_write(inode, le64_to_cpu(fe->i_size));
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_blksize = (u32) osb->s_clustersize;
+	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}

+ 145 - 0
fs/ocfs2/inode.h

@@ -0,0 +1,145 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_INODE_H
+#define OCFS2_INODE_H
+
+/* OCFS2 Inode Private Data */
+struct ocfs2_inode_info
+{
+	u64			ip_blkno;
+
+	struct ocfs2_lock_res		ip_rw_lockres;
+	struct ocfs2_lock_res		ip_meta_lockres;
+	struct ocfs2_lock_res		ip_data_lockres;
+
+	/* protects allocation changes on this inode. */
+	struct rw_semaphore		ip_alloc_sem;
+
+	/* These fields are protected by ip_lock */
+	spinlock_t			ip_lock;
+	u32				ip_open_count;
+	u32				ip_clusters;
+	struct ocfs2_extent_map		ip_map;
+	struct list_head		ip_io_markers;
+	int				ip_orphaned_slot;
+
+	struct semaphore		ip_io_sem;
+
+	/* Used by the journalling code to attach an inode to a
+	 * handle.  These are protected by ip_io_sem in order to lock
+	 * out other I/O to the inode until we either commit or
+	 * abort. */
+	struct list_head		ip_handle_list;
+	struct ocfs2_journal_handle	*ip_handle;
+
+	u32				ip_flags; /* see below */
+
+	/* protected by recovery_lock. */
+	struct inode			*ip_next_orphan;
+
+	u32				ip_dir_start_lookup;
+
+	/* next two are protected by trans_inc_lock */
+	/* which transaction were we created on? Zero if none. */
+	unsigned long			ip_created_trans;
+	/* last transaction we were a part of. */
+	unsigned long			ip_last_trans;
+
+	struct ocfs2_caching_info	ip_metadata_cache;
+
+	struct inode			vfs_inode;
+};
+
+/*
+ * Flags for the ip_flags field
+ */
+/* System file inodes  */
+#define OCFS2_INODE_SYSTEM_FILE		0x00000001
+#define OCFS2_INODE_JOURNAL		0x00000002
+#define OCFS2_INODE_BITMAP		0x00000004
+/* This inode has been wiped from disk */
+#define OCFS2_INODE_DELETED		0x00000008
+/* Another node is deleting, so our delete is a nop */
+#define OCFS2_INODE_SKIP_DELETE		0x00000010
+/* Has the inode been orphaned on another node?
+ *
+ * This hints to ocfs2_drop_inode that it should clear i_nlink before
+ * continuing.
+ *
+ * We *only* set this on unlink vote from another node. If the inode
+ * was locally orphaned, then we're sure of the state and don't need
+ * to twiddle i_nlink later - it's either zero or not depending on
+ * whether our unlink succeeded. Otherwise we got this from a node
+ * whose intention was to orphan the inode; however, it may have
+ * crashed, failed, etc., so we let ocfs2_drop_inode zero the value and
+ * rely on ocfs2_delete_inode to sort things out under the proper
+ * cluster locks.
+ */
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
+/* Does someone have the file open O_DIRECT */
+#define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Indicates that the metadata cache should be used as an array. */
+#define OCFS2_INODE_CACHE_INLINE	0x00000080
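+
+/* Hypothetical helper, mirroring the INODE_JOURNAL macros below, to
+ * make the MAYBE_ORPHANED contract above concrete: ocfs2_drop_inode
+ * clears i_nlink only when this test is true. */
+#define INODE_MAYBE_ORPHANED(i) \
+	(OCFS2_I(i)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)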
+
+static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct ocfs2_inode_info, vfs_inode);
+}
+
+#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
+#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
+
+extern kmem_cache_t *ocfs2_inode_cache;
+
+extern struct address_space_operations ocfs2_aops;
+
+struct buffer_head *ocfs2_bread(struct inode *inode, int block,
+				int *err, int reada);
+void ocfs2_clear_inode(struct inode *inode);
+void ocfs2_delete_inode(struct inode *inode);
+void ocfs2_drop_inode(struct inode *inode);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote);
+int ocfs2_inode_init_private(struct inode *inode);
+int ocfs2_inode_revalidate(struct dentry *dentry);
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			 int create_ino);
+void ocfs2_read_inode(struct inode *inode);
+void ocfs2_read_inode2(struct inode *inode, void *opaque);
+ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
+			size_t size, loff_t *offp);
+void ocfs2_sync_blockdev(struct super_block *sb);
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe);
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh);
+int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
+int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+
+#endif /* OCFS2_INODE_H */

+ 1652 - 0
fs/ocfs2/journal.c

@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.c
+ *
+ * Defines functions of the journalling API
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+
+#define MLOG_MASK_PREFIX ML_JOURNAL
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "vote.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+
+static int ocfs2_force_read_journal(struct inode *inode);
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num);
+static int __ocfs2_recovery_thread(void *arg);
+static int ocfs2_commit_cache(struct ocfs2_super *osb);
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle);
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty);
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num);
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot);
+static int ocfs2_commit_thread(void *arg);
+
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
+{
+	int status = 0;
+	unsigned int flushed;
+	unsigned long old_id;
+	struct ocfs2_journal *journal = NULL;
+
+	mlog_entry_void();
+
+	journal = osb->journal;
+
+	/* Flush all pending commits and checkpoint the journal. */
+	down_write(&journal->j_trans_barrier);
+
+	if (atomic_read(&journal->j_num_trans) == 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog(0, "No transactions for me to flush!\n");
+		goto finally;
+	}
+
+	journal_lock_updates(journal->j_journal);
+	status = journal_flush(journal->j_journal);
+	journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog_errno(status);
+		goto finally;
+	}
+
+	old_id = ocfs2_inc_trans_id(journal);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	atomic_set(&journal->j_num_trans, 0);
+	up_write(&journal->j_trans_barrier);
+
+	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
+	     journal->j_trans_id, flushed);
+
+	ocfs2_kick_vote_thread(osb);
+	wake_up(&journal->j_checkpointed);
+finally:
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal_handle *retval = NULL;
+
+	retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
+	if (!retval) {
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		return NULL;
+	}
+
+	retval->max_buffs = 0;
+	retval->num_locks = 0;
+	retval->k_handle = NULL;
+
+	INIT_LIST_HEAD(&retval->locks);
+	INIT_LIST_HEAD(&retval->inode_list);
+	retval->journal = osb->journal;
+
+	return retval;
+}
+
+/* pass it NULL and it will allocate a new handle object for you.  If
+ * you pass it a handle however, it may still return error, in which
+ * case it has freed the passed handle for you. */
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs)
+{
+	int ret;
+	journal_t *journal = osb->journal->j_journal;
+
+	mlog_entry("(max_buffs = %d)\n", max_buffs);
+
+	if (!osb || !osb->journal->j_journal)
+		BUG();
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto done_free;
+	}
+
+	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+	BUG_ON(max_buffs <= 0);
+
+	/* JBD might support this, but our journalling code doesn't yet. */
+	if (journal_current_handle()) {
+		mlog(ML_ERROR, "Recursive transaction attempted!\n");
+		BUG();
+	}
+
+	if (!handle)
+		handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		goto done_free;
+	}
+
+	handle->max_buffs = max_buffs;
+
+	down_read(&osb->journal->j_trans_barrier);
+
+	/* actually start the transaction now */
+	handle->k_handle = journal_start(journal, max_buffs);
+	if (IS_ERR(handle->k_handle)) {
+		up_read(&osb->journal->j_trans_barrier);
+
+		ret = PTR_ERR(handle->k_handle);
+		handle->k_handle = NULL;
+		mlog_errno(ret);
+
+		if (is_journal_aborted(journal)) {
+			ocfs2_abort(osb->sb, "Detected aborted journal");
+			ret = -EROFS;
+		}
+		goto done_free;
+	}
+
+	atomic_inc(&(osb->journal->j_num_trans));
+	handle->flags |= OCFS2_HANDLE_STARTED;
+
+	mlog_exit_ptr(handle);
+	return handle;
+
+done_free:
+	if (handle)
+		ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
+
+	mlog_exit(ret);
+	return ERR_PTR(ret);
+}
+
+void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+			    struct inode *inode)
+{
+	BUG_ON(!handle);
+	BUG_ON(!inode);
+
+	atomic_inc(&inode->i_count);
+
+	/* we're obviously changing it... */
+	down(&inode->i_sem);
+
+	/* sanity check */
+	BUG_ON(OCFS2_I(inode)->ip_handle);
+	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
+
+	OCFS2_I(inode)->ip_handle = handle;
+	list_del(&(OCFS2_I(inode)->ip_handle_list));
+	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+}
+
+static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct inode *inode;
+	struct ocfs2_inode_info *oi;
+
+	list_for_each_safe(p, n, &handle->inode_list) {
+		oi = list_entry(p, struct ocfs2_inode_info,
+				ip_handle_list);
+		inode = &oi->vfs_inode;
+
+		OCFS2_I(inode)->ip_handle = NULL;
+		list_del_init(&OCFS2_I(inode)->ip_handle_list);
+
+		up(&inode->i_sem);
+		iput(inode);
+	}
+}
+
+/* This is trivial so we do it out of the main commit
+ * paths. Beware, it can be called from start_trans too! */
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
+{
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_unlock_inodes(handle);
+	/* You are allowed to add journal locks before the transaction
+	 * has started. */
+	ocfs2_handle_cleanup_locks(handle->journal, handle);
+
+	kfree(handle);
+
+	mlog_exit_void();
+}
+
+void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
+{
+	handle_t *jbd_handle;
+	int retval;
+	struct ocfs2_journal *journal = handle->journal;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
+		ocfs2_commit_unstarted_handle(handle);
+		mlog_exit_void();
+		return;
+	}
+
+	/* release inode semaphores we took during this transaction */
+	ocfs2_handle_unlock_inodes(handle);
+
+	/* ocfs2_extend_trans may have had to call journal_restart
+	 * which will always commit the transaction, but may return
+	 * error for any number of reasons. If this is the case, we
+	 * clear k_handle as it's not valid any more. */
+	if (handle->k_handle) {
+		jbd_handle = handle->k_handle;
+
+		if (handle->flags & OCFS2_HANDLE_SYNC)
+			jbd_handle->h_sync = 1;
+		else
+			jbd_handle->h_sync = 0;
+
+		/* actually stop the transaction. if we've set h_sync,
+		 * it'll have been committed when we return */
+		retval = journal_stop(jbd_handle);
+		if (retval < 0) {
+			mlog_errno(retval);
+			mlog(ML_ERROR, "Could not commit transaction\n");
+			BUG();
+		}
+
+		handle->k_handle = NULL; /* it's been freed in journal_stop */
+	}
+
+	ocfs2_handle_cleanup_locks(journal, handle);
+
+	up_read(&journal->j_trans_barrier);
+
+	kfree(handle);
+	mlog_exit_void();
+}
+
+/*
+ * 'nblocks' is what you want to add to the current
+ * transaction. extend_trans will either extend the current handle by
+ * nblocks, or commit it and start a new one with nblocks credits.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks.
+ *
+ * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
+ */
+int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+		       int nblocks)
+{
+	int status;
+
+	BUG_ON(!handle);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+	BUG_ON(!nblocks);
+
+	mlog_entry_void();
+
+	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+
+	status = journal_extend(handle->k_handle, nblocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (status > 0) {
+		mlog(0, "journal_extend failed, trying journal_restart\n");
+		status = journal_restart(handle->k_handle, nblocks);
+		if (status < 0) {
+			handle->k_handle = NULL;
+			mlog_errno(status);
+			goto bail;
+		}
+		handle->max_buffs = nblocks;
+	} else
+		handle->max_buffs += nblocks;
+
+	status = 0;
+bail:
+
+	mlog_exit(status);
+	return status;
+}
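+
+/* Hypothetical sketch of the pattern described above: cluster locks
+ * were taken before ocfs2_start_trans(), so asking for more credits
+ * mid-transaction is safe even when JBD has to restart the handle. */
+static int example_grow_for_alloc(struct ocfs2_journal_handle *handle)
+{
+	int status;
+
+	/* say we discover we need two more blocks of journal credits */
+	status = ocfs2_extend_trans(handle, 2);
+	if (status < 0)
+		mlog_errno(status);
+	return status;
+}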
+
+int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *bh,
+			 int type)
+{
+	int status;
+
+	BUG_ON(!inode);
+	BUG_ON(!handle);
+	BUG_ON(!bh);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
+		   (unsigned long long)bh->b_blocknr, type,
+		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
+		   "OCFS2_JOURNAL_ACCESS_CREATE" :
+		   "OCFS2_JOURNAL_ACCESS_WRITE",
+		   bh->b_size);
+
+	/* we can safely remove this assertion after testing. */
+	if (!buffer_uptodate(bh)) {
+		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
+		mlog(ML_ERROR, "b_blocknr=%llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		BUG();
+	}
+
+	/* Set the current transaction information on the inode so
+	 * that the locking code knows whether it can drop its locks
+	 * on this inode or not. We're protected from the commit
+	 * thread updating the current transaction id until
+	 * ocfs2_commit_trans() because ocfs2_start_trans() took
+	 * j_trans_barrier for us. */
+	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+	switch (type) {
+	case OCFS2_JOURNAL_ACCESS_CREATE:
+	case OCFS2_JOURNAL_ACCESS_WRITE:
+		status = journal_get_write_access(handle->k_handle, bh);
+		break;
+
+	case OCFS2_JOURNAL_ACCESS_UNDO:
+		status = journal_get_undo_access(handle->k_handle, bh);
+		break;
+
+	default:
+		status = -EINVAL;
+		mlog(ML_ERROR, "Uknown access type!\n");
+	}
+	up(&OCFS2_I(inode)->ip_io_sem);
+
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
+		     status, type);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+			struct buffer_head *bh)
+{
+	int status;
+
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("(bh->b_blocknr=%llu)\n",
+		   (unsigned long long)bh->b_blocknr);
+
+	status = journal_dirty_metadata(handle->k_handle, bh);
+	if (status < 0)
+		mlog(ML_ERROR, "Could not dirty metadata buffer. "
+		     "(bh->b_blocknr=%llu)\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	mlog_exit(status);
+	return status;
+}
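+
+/* A minimal end-to-end sketch (hypothetical caller) of the handle
+ * lifecycle documented above ocfs2_start_trans(): pass NULL to have a
+ * handle allocated; on error the handle has already been freed, so
+ * only the ERR_PTR needs handling. */
+static int example_dirty_one_block(struct ocfs2_super *osb,
+				   struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_journal_handle *handle;
+	int status;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (!status)
+		status = ocfs2_journal_dirty(handle, bh);
+
+	ocfs2_commit_trans(handle);
+	return status;
+}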
+
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		mlog_errno(err);
+	/* TODO: When we can handle it, abort the handle and go RO on
+	 * error here. */
+
+	return err;
+}
+
+/* We always assume you're adding a metadata lock at level 'ex' */
+int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+			  struct inode *inode)
+{
+	int status;
+	struct ocfs2_journal_lock *lock;
+
+	BUG_ON(!inode);
+
+	lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
+	if (!lock) {
+		status = -ENOMEM;
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	if (!igrab(inode))
+		BUG();
+	lock->jl_inode = inode;
+
+	list_add_tail(&(lock->jl_lock_list), &(handle->locks));
+	handle->num_locks++;
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct ocfs2_journal_lock *lock;
+	struct inode *inode;
+
+	list_for_each_safe(p, n, &(handle->locks)) {
+		lock = list_entry(p, struct ocfs2_journal_lock,
+				  jl_lock_list);
+		list_del(&lock->jl_lock_list);
+		handle->num_locks--;
+
+		inode = lock->jl_inode;
+		ocfs2_meta_unlock(inode, 1);
+		if (atomic_read(&inode->i_count) == 1)
+			mlog(ML_ERROR,
+			     "Inode %"MLFu64", I'm doing a last iput for!",
+			     OCFS2_I(inode)->ip_blkno);
+		iput(inode);
+		kmem_cache_free(ocfs2_lock_cache, lock);
+	}
+}
+
+#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * 5)
+
+void ocfs2_set_journal_params(struct ocfs2_super *osb)
+{
+	journal_t *journal = osb->journal->j_journal;
+
+	spin_lock(&journal->j_state_lock);
+	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+		journal->j_flags |= JFS_BARRIER;
+	else
+		journal->j_flags &= ~JFS_BARRIER;
+	spin_unlock(&journal->j_state_lock);
+}
+
+int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+{
+	int status = -1;
+	struct inode *inode = NULL; /* the journal inode */
+	journal_t *j_journal = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_super *osb;
+	int meta_lock = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	/* already have the inode for our journal */
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    osb->slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto done;
+	}
+
+	SET_INODE_JOURNAL(inode);
+	OCFS2_I(inode)->ip_open_count++;
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not get lock on journal!\n");
+		goto done;
+	}
+
+	meta_lock = 1;
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
+		mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
+		     inode->i_size);
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "inode->i_size = %lld\n", inode->i_size);
+	mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
+	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+
+	/* call the kernel's journal init function now */
+	j_journal = journal_init_inode(inode);
+	if (j_journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+
+	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
+		  OCFS2_JOURNAL_DIRTY_FL);
+
+	journal->j_journal = j_journal;
+	journal->j_inode = inode;
+	journal->j_bh = bh;
+
+	ocfs2_set_journal_params(osb);
+
+	journal->j_state = OCFS2_JOURNAL_LOADED;
+
+	status = 0;
+done:
+	if (status < 0) {
+		if (meta_lock)
+			ocfs2_meta_unlock(inode, 1);
+		if (bh != NULL)
+			brelse(bh);
+		if (inode) {
+			OCFS2_I(inode)->ip_open_count--;
+			iput(inode);
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty)
+{
+	int status;
+	unsigned int flags;
+	struct ocfs2_journal *journal = osb->journal;
+	struct buffer_head *bh = journal->j_bh;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		/* This is called from startup/shutdown which will
+		 * handle the errors in a specific manner, so no need
+		 * to call ocfs2_error() here. */
+		mlog(ML_ERROR, "Journal dinode %"MLFu64"  has invalid "
+		     "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
+		status = -EIO;
+		goto out;
+	}
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	if (dirty)
+		flags |= OCFS2_JOURNAL_DIRTY_FL;
+	else
+		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, journal->j_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * If the journal has been kmalloc'd it needs to be freed after this
+ * call.
+ */
+void ocfs2_journal_shutdown(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = NULL;
+	int status = 0;
+	struct inode *inode = NULL;
+	int num_running_trans = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(!osb);
+
+	journal = osb->journal;
+	if (!journal)
+		goto done;
+
+	inode = journal->j_inode;
+
+	if (journal->j_state != OCFS2_JOURNAL_LOADED)
+		goto done;
+
+	/* need to inc inode use count as journal_destroy will iput. */
+	if (!igrab(inode))
+		BUG();
+
+	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+	if (num_running_trans > 0)
+		mlog(0, "Shutting down journal: must wait on %d "
+		     "running transactions!\n",
+		     num_running_trans);
+
+	/* Do a commit_cache here. It will flush our journal, *and*
+	 * release any locks that are still held.
+	 * set the SHUTDOWN flag and release the trans lock.
+	 * the commit thread will take the trans lock for us below. */
+	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
+
+	/* The OCFS2_JOURNAL_IN_SHUTDOWN flag signals commit_cache not to
+	 * drop the trans_lock (which we want to hold until we
+	 * completely destroy the journal). */
+	if (osb->commit_task) {
+		/* Wait for the commit thread */
+		mlog(0, "Waiting for ocfs2commit to exit....\n");
+		kthread_stop(osb->commit_task);
+		osb->commit_task = NULL;
+	}
+
+	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+
+	status = ocfs2_journal_toggle_dirty(osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Shutdown the kernel journal system */
+	journal_destroy(journal->j_journal);
+
+	OCFS2_I(inode)->ip_open_count--;
+
+	/* unlock our journal */
+	ocfs2_meta_unlock(inode, 1);
+
+	brelse(journal->j_bh);
+	journal->j_bh = NULL;
+
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+done:
+	if (inode)
+		iput(inode);
+	mlog_exit_void();
+}
+
+static void ocfs2_clear_journal_error(struct super_block *sb,
+				      journal_t *journal,
+				      int slot)
+{
+	int olderr;
+
+	olderr = journal_errno(journal);
+	if (olderr) {
+		mlog(ML_ERROR, "File system error %d recorded in "
+		     "journal %u.\n", olderr, slot);
+		mlog(ML_ERROR, "File system on device %s needs checking.\n",
+		     sb->s_id);
+
+		journal_ack_err(journal);
+		journal_clear_err(journal);
+	}
+}
+
+int ocfs2_journal_load(struct ocfs2_journal *journal)
+{
+	int status = 0;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	status = journal_load(journal->j_journal);
+	if (status < 0) {
+		mlog(ML_ERROR, "Failed to load journal!\n");
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+
+	status = ocfs2_journal_toggle_dirty(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Launch the commit thread */
+	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
+				       osb->osb_id);
+	if (IS_ERR(osb->commit_task)) {
+		status = PTR_ERR(osb->commit_task);
+		osb->commit_task = NULL;
+		mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
+		     status);
+		goto done;
+	}
+
+done:
+	mlog_exit(status);
+	return status;
+}
+
+
+/* 'full' flag tells us whether we clear out all blocks or if we just
+ * mark the journal clean */
+int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
+{
+	int status;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	status = journal_wipe(journal->j_journal, full);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * JBD might read a cached version of another node's journal file. We
+ * don't want this, as this file changes often and we get no
+ * notification of those changes. The only way to be sure we've got
+ * the most up-to-date version of those blocks is to force-read them
+ * off disk. Just searching through the buffer cache won't work, as
+ * there may be pages backing this file which are still marked up to
+ * date. We know things can't change on this file underneath us, as
+ * we hold the lock by now :)
+ */
+static int ocfs2_force_read_journal(struct inode *inode)
+{
+	int status = 0;
+	int i, p_blocks;
+	u64 v_blkno, p_blkno;
+#define CONCURRENT_JOURNAL_FILL 32
+	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
+
+	mlog_entry_void();
+
+	BUG_ON(inode->i_blocks !=
+		     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
+
+	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+
+	mlog(0, "Force reading %lu blocks\n",
+	     (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
+
+	v_blkno = 0;
+	while (v_blkno <
+	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
+
+		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
+						     1, &p_blkno,
+						     &p_blocks);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (p_blocks > CONCURRENT_JOURNAL_FILL)
+			p_blocks = CONCURRENT_JOURNAL_FILL;
+
+		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					   p_blkno, p_blocks, bhs, 0,
+					   inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for (i = 0; i < p_blocks; i++) {
+			brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+
+		v_blkno += p_blocks;
+	}
+
+bail:
+	for (i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
+		if (bhs[i])
+			brelse(bhs[i]);
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_la_recovery_item {
+	struct list_head	lri_list;
+	int			lri_slot;
+	struct ocfs2_dinode	*lri_la_dinode;
+	struct ocfs2_dinode	*lri_tl_dinode;
+};
+
+/* Does the second half of the recovery process. By this point, the
+ * node is marked clean and can actually be considered recovered,
+ * hence it's no longer in the recovery map, but there's still some
+ * cleanup we can do which shouldn't happen within the recovery thread
+ * as locking in that context becomes very difficult if we are to take
+ * recovering nodes into account.
+ *
+ * NOTE: This function can and will sleep on recovery of other nodes
+ * during cluster locking, just like any other ocfs2 process.
+ */
+void ocfs2_complete_recovery(void *data)
+{
+	int ret;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_journal *journal = osb->journal;
+	struct ocfs2_dinode *la_dinode, *tl_dinode;
+	struct ocfs2_la_recovery_item *item;
+	struct list_head *p, *n;
+	LIST_HEAD(tmp_la_list);
+
+	mlog_entry_void();
+
+	mlog(0, "completing recovery from keventd\n");
+
+	spin_lock(&journal->j_lock);
+	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
+	spin_unlock(&journal->j_lock);
+
+	list_for_each_safe(p, n, &tmp_la_list) {
+		item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
+		list_del_init(&item->lri_list);
+
+		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+
+		la_dinode = item->lri_la_dinode;
+		if (la_dinode) {
+			mlog(0, "Clean up local alloc %"MLFu64"\n",
+			     la_dinode->i_blkno);
+
+			ret = ocfs2_complete_local_alloc_recovery(osb,
+								  la_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(la_dinode);
+		}
+
+		tl_dinode = item->lri_tl_dinode;
+		if (tl_dinode) {
+			mlog(0, "Clean up truncate log %"MLFu64"\n",
+			     tl_dinode->i_blkno);
+
+			ret = ocfs2_complete_truncate_log_recovery(osb,
+								   tl_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(tl_dinode);
+		}
+
+		ret = ocfs2_recover_orphans(osb, item->lri_slot);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		kfree(item);
+	}
+
+	mlog(0, "Recovery completion\n");
+	mlog_exit_void();
+}
+
+/* NOTE: This function always eats your references to la_dinode and
+ * tl_dinode, either manually on error, or by passing them to
+ * ocfs2_complete_recovery */
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode)
+{
+	struct ocfs2_la_recovery_item *item;
+
+	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
+	if (!item) {
+		/* Though we wish to avoid it, we are in fact safe in
+		 * skipping local alloc cleanup as fsck.ocfs2 is more
+		 * than capable of reclaiming unused space. */
+		/* kfree() handles NULL pointers for us */
+		kfree(la_dinode);
+		kfree(tl_dinode);
+
+		mlog_errno(-ENOMEM);
+		return;
+	}
+
+	INIT_LIST_HEAD(&item->lri_list);
+	item->lri_la_dinode = la_dinode;
+	item->lri_slot = slot_num;
+	item->lri_tl_dinode = tl_dinode;
+
+	spin_lock(&journal->j_lock);
+	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
+	queue_work(ocfs2_wq, &journal->j_recovery_work);
+	spin_unlock(&journal->j_lock);
+}
+
+/* Called by the mount code to queue the last part of recovery for
+ * its own slot. */
+void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = osb->journal;
+
+	if (osb->dirty) {
+		/* No need to queue up our truncate_log as regular
+		 * cleanup will catch that. */
+		ocfs2_queue_recovery_completion(journal,
+						osb->slot_num,
+						osb->local_alloc_copy,
+						NULL);
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+
+		osb->local_alloc_copy = NULL;
+		osb->dirty = 0;
+	}
+}
+
+static int __ocfs2_recovery_thread(void *arg)
+{
+	int status, node_num;
+	struct ocfs2_super *osb = arg;
+
+	mlog_entry_void();
+
+	status = ocfs2_wait_on_mount(osb);
+	if (status < 0) {
+		goto bail;
+	}
+
+restart:
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		node_num = ocfs2_node_map_first_set_bit(osb,
+							&osb->recovery_map);
+		if (node_num == O2NM_INVALID_NODE_NUM) {
+			mlog(0, "Out of nodes to recover.\n");
+			break;
+		}
+
+		status = ocfs2_recover_node(osb, node_num);
+		if (status < 0) {
+			mlog(ML_ERROR,
+			     "Error %d recovering node %d on device (%u,%u)!\n",
+			     status, node_num,
+			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+			mlog(ML_ERROR, "Volume requires unmount.\n");
+			continue;
+		}
+
+		ocfs2_recovery_map_clear(osb, node_num);
+	}
+	ocfs2_super_unlock(osb, 1);
+
+	/* We always run recovery on our own orphan dir - the dead
+	 * node(s) may have voted "no" on an inode delete earlier. A
+	 * revote is therefore required. */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL);
+
+bail:
+	down(&osb->recovery_lock);
+	if (!status &&
+	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		up(&osb->recovery_lock);
+		goto restart;
+	}
+
+	osb->recovery_thread_task = NULL;
+	mb(); /* sync with ocfs2_recovery_thread_running */
+	wake_up(&osb->recovery_event);
+
+	up(&osb->recovery_lock);
+
+	mlog_exit(status);
+	/* no one is calling kthread_stop() for us so the kthread() api
+	 * requires that we call do_exit().  And it isn't exported, but
+	 * complete_and_exit() seems to be a minimal wrapper around it. */
+	complete_and_exit(NULL, status);
+	return status;
+}
+
+void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
+{
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	down(&osb->recovery_lock);
+	if (osb->disable_recovery)
+		goto out;
+
+	/* People waiting on recovery will wait on
+	 * the recovery map to empty. */
+	if (!ocfs2_recovery_map_set(osb, node_num))
+		mlog(0, "node %d already in recovery.\n", node_num);
+
+	mlog(0, "starting recovery thread...\n");
+
+	if (osb->recovery_thread_task)
+		goto out;
+
+	osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
+						 "ocfs2rec-%d", osb->osb_id);
+	if (IS_ERR(osb->recovery_thread_task)) {
+		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
+		osb->recovery_thread_task = NULL;
+	}
+
+out:
+	up(&osb->recovery_lock);
+	wake_up(&osb->recovery_event);
+
+	mlog_exit_void();
+}
+
+/* Does the actual journal replay and marks the journal inode as
+ * clean. Will only replay if the journal inode is marked dirty. */
+static int ocfs2_replay_journal(struct ocfs2_super *osb,
+				int node_num,
+				int slot_num)
+{
+	int status;
+	int got_lock = 0;
+	unsigned int flags;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *fe;
+	journal_t *journal = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		status = -EACCES;
+		iput(inode);
+		inode = NULL;
+		mlog_errno(status);
+		goto done;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
+				      OCFS2_META_LOCK_RECOVERY);
+	if (status < 0) {
+		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not lock journal!\n");
+		goto done;
+	}
+	got_lock = 1;
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+
+	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
+		mlog(0, "No recovery required for node %d\n", node_num);
+		goto done;
+	}
+
+	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
+	     node_num, slot_num,
+	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+
+	status = ocfs2_force_read_journal(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	mlog(0, "calling journal_init_inode\n");
+	journal = journal_init_inode(inode);
+	if (journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EIO;
+		goto done;
+	}
+
+	status = journal_load(journal);
+	if (status < 0) {
+		mlog_errno(status);
+		if (!igrab(inode))
+			BUG();
+		journal_destroy(journal);
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
+
+	/* wipe the journal */
+	mlog(0, "flushing the journal.\n");
+	journal_lock_updates(journal);
+	status = journal_flush(journal);
+	journal_unlock_updates(journal);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will mark the node clean */
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	if (!igrab(inode))
+		BUG();
+
+	journal_destroy(journal);
+
+done:
+	/* drop the lock on this nodes journal */
+	if (got_lock)
+		ocfs2_meta_unlock(inode, 1);
+
+	if (inode)
+		iput(inode);
+
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Do the most important parts of node recovery:
+ *  - Replay its journal
+ *  - Stamp a clean local allocator file
+ *  - Stamp a clean truncate log
+ *  - Mark the node clean
+ *
+ * If this function completes without error, a node in OCFS2 can be
+ * said to have been safely recovered. As a result, failure during the
+ * second part of a node's recovery process (local alloc recovery) is
+ * far less concerning.
+ */
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num)
+{
+	int status = 0;
+	int slot_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+	struct ocfs2_dinode *la_copy = NULL;
+	struct ocfs2_dinode *tl_copy = NULL;
+
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	mlog(0, "checking node %d\n", node_num);
+
+	/* Should not ever be called to recover ourselves -- in that
+	 * case we should've called ocfs2_journal_load instead. */
+	BUG_ON(osb->node_num == node_num);
+
+	slot_num = ocfs2_node_num_to_slot(si, node_num);
+	if (slot_num == OCFS2_INVALID_SLOT) {
+		status = 0;
+		mlog(0, "no slot for this node, so no recovery required.\n");
+		goto done;
+	}
+
+	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+	status = ocfs2_replay_journal(osb, node_num, slot_num);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Stamp a clean local alloc file AFTER recovering the journal... */
+	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* An error from begin_truncate_log_recovery is not
+	 * serious enough to warrant halting the rest of
+	 * recovery. */
+	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Likewise, this would be a strange but ultimately not so
+	 * harmful place to get an error... */
+	ocfs2_clear_slot(si, slot_num);
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will kfree the memory pointed to by la_copy and tl_copy */
+	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
+					tl_copy);
+
+	status = 0;
+done:
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Test node liveness by trylocking its journal. If we get the lock,
+ * we drop it here. Return 0 if we got the lock, -EAGAIN if the node
+ * is still alive (we couldn't get the lock) and < 0 on error. */
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num)
+{
+	int status, flags;
+	struct inode *inode = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		mlog(ML_ERROR, "access error\n");
+		status = -EACCES;
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
+	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_meta_unlock(inode, 1);
+bail:
+	if (inode)
+		iput(inode);
+
+	return status;
+}
+
+/* Call this underneath ocfs2_super_lock. It also assumes that the
+ * slot info struct has been updated from disk. */
+int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
+{
+	int status, i, node_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	/* This is called with the super block cluster lock, so we
+	 * know that the slot map can't change underneath us. */
+
+	spin_lock(&si->si_lock);
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (i == osb->slot_num)
+			continue;
+		if (ocfs2_is_empty_slot(si, i))
+			continue;
+
+		node_num = si->si_global_node_nums[i];
+		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+			continue;
+		spin_unlock(&si->si_lock);
+
+		/* Ok, we have a slot occupied by another node which
+		 * is not in the recovery map. We trylock its journal
+		 * file here to test whether it's still alive. */
+		status = ocfs2_trylock_journal(osb, i);
+		if (!status) {
+			/* Since we're called from mount, we know that
+			 * the recovery thread can't race us on
+			 * setting / checking the recovery bits. */
+			ocfs2_recovery_thread(osb, node_num);
+		} else if ((status < 0) && (status != -EAGAIN)) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		spin_lock(&si->si_lock);
+	}
+	spin_unlock(&si->si_lock);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int status = 0;
+	int have_disk_lock = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct inode *orphan_dir_inode = NULL;
+	unsigned long offset, blk, local;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       slot);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto out;
+	}
+
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+		mlog_errno(status);
+		goto out;
+	}
+	have_disk_lock = 1;
+
+	offset = 0;
+	iter = NULL;
+	while (offset < i_size_read(orphan_dir_inode)) {
+		blk = offset >> sb->s_blocksize_bits;
+
+		bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
+		if (!bh)
+			status = -EINVAL;
+		if (status < 0) {
+			up(&orphan_dir_inode->i_sem);
+			if (bh)
+				brelse(bh);
+			mlog_errno(status);
+			goto out;
+		}
+
+		local = 0;
+		while (offset < i_size_read(orphan_dir_inode)
+		      && local < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + local);
+
+			if (!ocfs2_check_dir_entry(orphan_dir_inode,
+						  de, bh, local)) {
+				up(&orphan_dir_inode->i_sem);
+				status = -EINVAL;
+				mlog_errno(status);
+				brelse(bh);
+				goto out;
+			}
+
+			local += le16_to_cpu(de->rec_len);
+			offset += le16_to_cpu(de->rec_len);
+
+			/* I guess we silently fail on no inode? */
+			if (!le64_to_cpu(de->inode))
+				continue;
+			if (de->file_type > OCFS2_FT_MAX) {
+				mlog(ML_ERROR,
+				     "block %llu contains invalid de: "
+				     "inode = %"MLFu64", rec_len = %u, "
+				     "name_len = %u, file_type = %u, "
+				     "name='%.*s'\n",
+				     (unsigned long long)bh->b_blocknr,
+				     le64_to_cpu(de->inode),
+				     le16_to_cpu(de->rec_len),
+				     de->name_len,
+				     de->file_type,
+				     de->name_len,
+				     de->name);
+				continue;
+			}
+			if (de->name_len == 1 && !strncmp(".", de->name, 1))
+				continue;
+			if (de->name_len == 2 && !strncmp("..", de->name, 2))
+				continue;
+
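+			/* Chain each orphan onto a singly linked list
+			 * through ip_next_orphan; the list is walked
+			 * below (and each inode iput) once the orphan
+			 * dir lock has been dropped. */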
+			iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
+			if (IS_ERR(iter))
+				continue;
+
+			mlog(0, "queue orphan %"MLFu64"\n",
+			     OCFS2_I(iter)->ip_blkno);
+			OCFS2_I(iter)->ip_next_orphan = inode;
+			inode = iter;
+		}
+		brelse(bh);
+	}
+	up(&orphan_dir_inode->i_sem);
+
+	ocfs2_meta_unlock(orphan_dir_inode, 0);
+	have_disk_lock = 0;
+
+	iput(orphan_dir_inode);
+	orphan_dir_inode = NULL;
+
+	while (inode) {
+		oi = OCFS2_I(inode);
+		mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
+
+		iter = oi->ip_next_orphan;
+
+		spin_lock(&oi->ip_lock);
+		/* Delete voting may have set these on the assumption
+		 * that the other node would wipe them successfully.
+		 * If they are still in the node's orphan dir, we need
+		 * to reset that state. */
+		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
+
+		/* Set the proper information to get us going into
+		 * ocfs2_delete_inode. */
+		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+		oi->ip_orphaned_slot = slot;
+		spin_unlock(&oi->ip_lock);
+
+		iput(inode);
+
+		inode = iter;
+	}
+
+out:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(orphan_dir_inode, 0);
+
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	return status;
+}
+
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	/* This check is good because ocfs2 will wait on our recovery
+	 * thread before changing it to something other than MOUNTED
+	 * or DISABLED. */
+	wait_event(osb->osb_mount_event,
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
+
+	/* If there's an error on mount, then we may never get to the
+	 * MOUNTED flag, but this is set right before
+	 * dismount_volume() so we can trust it. */
+	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+		mlog(0, "mount error, exiting!\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int ocfs2_commit_thread(void *arg)
+{
+	int status;
+	struct ocfs2_super *osb = arg;
+	struct ocfs2_journal *journal = osb->journal;
+
+	/* we can trust j_num_trans here because _should_stop() is only set in
+	 * shutdown and nobody other than ourselves should be able to start
+	 * transactions.  committing on shutdown might take a few iterations
+	 * as final transactions put deleted inodes on the list */
+	while (!(kthread_should_stop() &&
+		 atomic_read(&journal->j_num_trans) == 0)) {
+
+		wait_event_interruptible_timeout(osb->checkpoint_event,
+						 atomic_read(&journal->j_num_trans)
+						 || kthread_should_stop(),
+						 OCFS2_CHECKPOINT_INTERVAL);
+
+		status = ocfs2_commit_cache(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)) {
+			mlog(ML_KTHREAD,
+			     "commit_thread: %u transactions pending on "
+			     "shutdown\n",
+			     atomic_read(&journal->j_num_trans));
+		}
+	}
+
+	return 0;
+}
+
+/* Look for a dirty journal without taking any cluster locks. Used for
+ * hard readonly access to determine whether the file system journals
+ * require recovery. */
+int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	unsigned int slot;
+	struct buffer_head *di_bh;
+	struct ocfs2_dinode *di;
+	struct inode *journal = NULL;
+
+	for (slot = 0; slot < osb->max_slots; slot++) {
+		journal = ocfs2_get_system_file_inode(osb,
+						      JOURNAL_SYSTEM_INODE,
+						      slot);
+		if (!journal || is_bad_inode(journal)) {
+			ret = -EACCES;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di_bh = NULL;
+		ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
+				       0, journal);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di = (struct ocfs2_dinode *) di_bh->b_data;
+
+		if (le32_to_cpu(di->id1.journal1.ij_flags) &
+		    OCFS2_JOURNAL_DIRTY_FL)
+			ret = -EROFS;
+
+		brelse(di_bh);
+		if (ret)
+			break;
+	}
+
+out:
+	if (journal)
+		iput(journal);
+
+	return ret;
+}

+ 457 - 0
fs/ocfs2/journal.h

@@ -0,0 +1,457 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.h
+ *
+ * Defines journalling api and structures.
+ *
+ * Copyright (C) 2003, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_JOURNAL_H
+#define OCFS2_JOURNAL_H
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+
+#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
+
+enum ocfs2_journal_state {
+	OCFS2_JOURNAL_FREE = 0,
+	OCFS2_JOURNAL_LOADED,
+	OCFS2_JOURNAL_IN_SHUTDOWN,
+};
+
+struct ocfs2_super;
+struct ocfs2_dinode;
+struct ocfs2_journal_handle;
+
+struct ocfs2_journal {
+	enum ocfs2_journal_state   j_state;    /* Journal's current state  */
+
+	journal_t                 *j_journal; /* The kernel's journal type */
+	struct inode              *j_inode;   /* Kernel inode pointing to
+					       * this journal             */
+	struct ocfs2_super        *j_osb;     /* pointer to the super
+					       * block for the node
+					       * we're currently
+					       * running on -- not
+					       * necessarily the super
+					       * block from the node
+					       * which we usually run
+					       * from (recovery,
+					       * etc)                     */
+	struct buffer_head        *j_bh;      /* Journal disk inode block */
+	atomic_t                  j_num_trans; /* Number of transactions
+					        * currently in the system. */
+	unsigned long             j_trans_id;
+	struct rw_semaphore       j_trans_barrier;
+	wait_queue_head_t         j_checkpointed;
+
+	spinlock_t                j_lock;
+	struct list_head          j_la_cleanups;
+	struct work_struct        j_recovery_work;
+};
+
+extern spinlock_t trans_inc_lock;
+
+/* wrap j_trans_id so we never have it equal to zero. */
+static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
+{
+	unsigned long old_id;
+	spin_lock(&trans_inc_lock);
+	old_id = j->j_trans_id++;
+	if (unlikely(!j->j_trans_id))
+		j->j_trans_id = 1;
+	spin_unlock(&trans_inc_lock);
+	return old_id;
+}
+
+static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
+					      struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * inode. Returns true if all the inode's changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+{
+	int ret;
+	struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+
+	spin_lock(&trans_inc_lock);
+	ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* Convenience function to check if an inode is still new (has never
+ * hit disk). Will do you a favor and set created_trans = 0 when
+ * you've been checkpointed. Returns '1' if the inode is still new. */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
+	int ret;
+
+	/* System files are never "new" as they're written out by
+	 * mkfs. This helps us early during mount, before we have the
+	 * journal open and j_trans_id could be junk. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		return 0;
+	spin_lock(&trans_inc_lock);
+	ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
+			   OCFS2_I(inode)->ip_created_trans));
+	if (!ret)
+		OCFS2_I(inode)->ip_created_trans = 0;
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
+				       struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+extern kmem_cache_t *ocfs2_lock_cache;
+
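+/* Tracks one cluster lock to drop at handle commit time: the commit
+ * path meta-unlocks jl_inode, iputs it and frees the journal lock
+ * back to ocfs2_lock_cache. */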
+struct ocfs2_journal_lock {
+	struct inode     *jl_inode;
+	struct list_head  jl_lock_list;
+};
+
+struct ocfs2_journal_handle {
+	handle_t            *k_handle; /* kernel handle.                */
+	struct ocfs2_journal        *journal;
+	u32                 flags;     /* see flags below.              */
+	int                 max_buffs; /* Buffs reserved by this handle */
+
+	/* The following two fields are for ocfs2_handle_add_lock */
+	int                 num_locks;
+	struct list_head    locks;     /* A list of cluster locks
+					* to release on commit. */
+
+	struct list_head     inode_list;
+};
+
+#define OCFS2_HANDLE_STARTED			1
+/* should we sync-commit this handle? */
+#define OCFS2_HANDLE_SYNC			2
+static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
+{
+	return handle->flags & OCFS2_HANDLE_STARTED;
+}
+
+static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
+{
+	if (sync)
+		handle->flags |= OCFS2_HANDLE_SYNC;
+	else
+		handle->flags &= ~OCFS2_HANDLE_SYNC;
+}
+
+/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_complete_recovery(void *data);
+
+/*
+ *  Journal Control:
+ *  Initialize, Load, Shutdown, Wipe a journal.
+ *
+ *  ocfs2_journal_init     - Initialize journal structures in the OSB.
+ *  ocfs2_journal_load     - Load the given journal off disk. Replay it if
+ *                          there's transactions still in there.
+ *  ocfs2_journal_shutdown - Shutdown a journal, this will flush all
+ *                          uncommitted, uncheckpointed transactions.
+ *  ocfs2_journal_wipe     - Wipe transactions from a journal. Optionally
+ *                          zero out each block.
+ *  ocfs2_recovery_thread  - Perform recovery on a node. osb is our own osb.
+ *  ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                          event on.
+ *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
+ */
+void   ocfs2_set_journal_params(struct ocfs2_super *osb);
+int    ocfs2_journal_init(struct ocfs2_journal *journal,
+			  int *dirty);
+void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
+int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
+			  int full);
+int    ocfs2_journal_load(struct ocfs2_journal *journal);
+int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
+void   ocfs2_recovery_thread(struct ocfs2_super *osb,
+			     int node_num);
+int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
+void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+
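+/*
+ * A minimal mount-time sketch of the control calls above (illustrative
+ * only; error handling is elided and osb->journal is assumed to be
+ * allocated already):
+ *
+ *	int dirty;
+ *
+ *	status = ocfs2_journal_init(osb->journal, &dirty);
+ *	status = ocfs2_journal_load(osb->journal);
+ *	<use the journal>
+ *	ocfs2_journal_shutdown(osb);
+ */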
+static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
+{
+	atomic_set(&osb->needs_checkpoint, 1);
+	wake_up(&osb->checkpoint_event);
+}
+
+static inline void ocfs2_checkpoint_inode(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!ocfs2_inode_fully_checkpointed(inode)) {
+		/* WARNING: This only kicks off a single
+		 * checkpoint. If someone races you and adds more
+		 * metadata to the journal, you won't know, and will
+		 * wind up waiting a lot longer than necessary. Right
+		 * now we only use this in clear_inode so that's
+		 * OK. */
+		ocfs2_start_checkpoint(osb);
+
+		wait_event(osb->journal->j_checkpointed,
+			   ocfs2_inode_fully_checkpointed(inode));
+	}
+}
+
+/*
+ *  Transaction Handling:
+ *  Manage the lifetime of a transaction handle.
+ *
+ *  ocfs2_alloc_handle     - Only allocate a handle so we can start putting
+ *                          cluster locks on it. To actually change blocks,
+ *                          call ocfs2_start_trans with the handle returned
+ *                          from this function. You may call ocfs2_commit_trans
+ *                           at any time in the lifetime of a handle.
+ *  ocfs2_start_trans      - Begin a transaction. Give it an upper estimate of
+ *                          the number of blocks that will be changed during
+ *                          this handle.
+ *  ocfs2_commit_trans     - Complete a handle.
+ *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
+ *                          commit the handle to disk in the process, but will
+ *                          not release any locks taken during the transaction.
+ *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *                          buffer. Will have to call ocfs2_journal_dirty once
+ *                          we've actually dirtied it. Type is one of
+ *                          OCFS2_JOURNAL_ACCESS_CREATE or
+ *                          OCFS2_JOURNAL_ACCESS_WRITE.
+ *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                             the current handle commits.
+ *  ocfs2_handle_add_lock  - Sometimes we need to delay lock release
+ *                          until after a transaction has been completed. Use
+ *                          ocfs2_handle_add_lock to indicate that a lock needs
+ *                          to be released at the end of that handle. Locks
+ *                          will be released in the order that they are added.
+ *  ocfs2_handle_add_inode - Add a locked inode to a transaction.
+ */
+
+/* You must always start_trans with a number of buffs > 0, but it's
+ * perfectly legal to go through an entire transaction without having
+ * dirtied any buffers. */
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs);
+void			     ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
+int			     ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+						int nblocks);
+
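+/*
+ * A minimal handle lifecycle sketch (illustrative only; error handling
+ * is elided and 'osb', 'inode' and 'bh' are assumed to be set up by
+ * the caller):
+ *
+ *	handle = ocfs2_alloc_handle(osb);
+ *	ocfs2_handle_add_inode(handle, inode);
+ *	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
+ *	status = ocfs2_journal_access(handle, inode, bh,
+ *				      OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ *	status = ocfs2_journal_dirty(handle, bh);
+ *	ocfs2_commit_trans(handle);
+ */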
+/*
+ * Create access is for when we get a newly created buffer and we're
+ * not gonna read it off disk, but rather fill it ourselves.  Right
+ * now, we don't do anything special with this (it turns into a write
+ * request), but this is a good placeholder in case we do...
+ *
+ * Write access is for when we read a block off disk and are going to
+ * modify it. This way the journalling layer knows it may need to make
+ * a copy of that block (if it's part of another, uncommitted
+ * transaction) before we do so.
+ */
+#define OCFS2_JOURNAL_ACCESS_CREATE 0
+#define OCFS2_JOURNAL_ACCESS_WRITE  1
+#define OCFS2_JOURNAL_ACCESS_UNDO   2
+
+int                  ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  int type);
+/*
+ * A word about the journal_access/journal_dirty "dance". It is
+ * entirely legal to journal_access a buffer more than once (as long
+ * as the access type is the same -- I'm not sure what will happen if
+ * access type is different but this should never happen anyway) It is
+ * also legal to journal_dirty a buffer more than once. In fact, you
+ * can even journal_access a buffer after you've done a
+ * journal_access/journal_dirty pair. The only thing you cannot do
+ * however, is journal_dirty a buffer which you haven't yet passed to
+ * journal_access at least once.
+ *
+ * That said, 99% of the time this doesn't matter and this is what the
+ * path looks like:
+ *
+ *	<read a bh>
+ *	ocfs2_journal_access(handle, bh,	OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ * 	ocfs2_journal_dirty(handle, bh);
+ */
+int                  ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+					 struct buffer_head *bh);
+int                  ocfs2_journal_dirty_data(handle_t *handle,
+					      struct buffer_head *bh);
+int                  ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+					   struct inode *inode);
+/*
+ * Use this to protect from other processes reading buffer state while
+ * it's in flight.
+ */
+void                 ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+					    struct inode *inode);
+
+/*
+ *  Credit Macros:
+ *  Convenience macros to calculate number of credits needed.
+ *
+ *  For convenience sake, I have a set of macros here which calculate
+ *  the *maximum* number of sectors which will be changed for various
+ *  metadata updates.
+ */
+
+/* simple file updates like chmod, etc. */
+#define OCFS2_INODE_UPDATE_CREDITS 1
+
+/* get one bit out of a suballocator: dinode + group descriptor +
+ * prev. group desc. if we relink. */
+#define OCFS2_SUBALLOC_ALLOC (3)
+
+/* dinode + group descriptor update. We don't relink on free yet. */
+#define OCFS2_SUBALLOC_FREE  (2)
+
+#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
+#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
+					 + OCFS2_TRUNCATE_LOG_UPDATE)
+
+/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
+ * bitmap block for the new bit) */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+
+/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
+ * group descriptor + mkdir/symlink blocks */
+#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
+			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
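+/* with the definitions above this works out to 3 + 3 + (1 + 2) = 9 */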
+
+/* local alloc metadata change + main bitmap updates */
+#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
+				  + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
+
+/* used when we don't need an allocation change for a dir extend. One
+ * for the dinode, one for the new block. */
+#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
+
+/* file update (nlink, etc) + dir entry block */
+#define OCFS2_LINK_CREDITS  (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
+ * dir inode link */
+#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
+			      + OCFS2_LINK_CREDITS)
+
+/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+ * inode alloc group descriptor */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+
+/* dinode update, old dir dinode update, new dir dinode update, old
+ * dir dir entry, new dir dir entry, dir entry update for renaming
+ * directory + target unlink */
+#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
+			     + OCFS2_UNLINK_CREDITS)
+
+static inline int ocfs2_calc_extend_credits(struct super_block *sb,
+					    struct ocfs2_dinode *fe,
+					    u32 bits_wanted)
+{
+	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+
+	/* bitmap dinode, group desc. + relinked group. */
+	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
+
+	/* we might need to shift tree depth so let's assume an
+	 * absolute worst case of complete fragmentation.  Even with
+	 * that, we only need one update for the dinode, and then
+	 * however many metadata chunks needed * a remaining suballoc
+	 * alloc. */
+	sysfile_bitmap_blocks = 1 +
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+
+	/* this does not include *new* metadata blocks, which are
+	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * prev. last_eb_blk + blocks along edge of tree.
+	 * calc_symlink_credits passes because we just need 1
+	 * credit for the dinode there. */
+	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+
+	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+}
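+
+/* As a worked example (with hypothetical values): for a dinode with
+ * l_tree_depth == 1 and ocfs2_extend_meta_needed() returning 2, the
+ * above yields 3 + (1 + 2 * 2) + (1 + 1 + 1) = 11 credits. */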
+
+static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
+{
+	int blocks = OCFS2_MKNOD_CREDITS;
+
+	/* links can be longer than one block so we may update many
+	 * within our single allocated extent. */
+	blocks += ocfs2_clusters_to_blocks(sb, 1);
+
+	return blocks;
+}
+
+static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
+						 unsigned int cpg)
+{
+	int blocks;
+	int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
+	/* parent inode update + new block group header + bitmap inode update
+	   + bitmap blocks affected */
+	blocks = 1 + 1 + 1 + bitmap_blocks;
+	return blocks;
+}
+
+static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
+						unsigned int clusters_to_del,
+						struct ocfs2_dinode *fe,
+						struct ocfs2_extent_list *last_el)
+{
+	/* for dinode + all headers in this pass + update to next leaf */
+	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
+	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	int credits = 1 + tree_depth + 1;
+	int i;
+
+	i = next_free - 1;
+	BUG_ON(i < 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (tree_depth && next_free == 1 &&
+	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+		credits += 1 + tree_depth;
+
+	/* update to the truncate log. */
+	credits += OCFS2_TRUNCATE_LOG_UPDATE;
+
+	return credits;
+}
+
+#endif /* OCFS2_JOURNAL_H */

+ 983 - 0
fs/ocfs2/localalloc.c

@@ -0,0 +1,983 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
+
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits);
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
+
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh);
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh);
+
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac);
+
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode);
+
+/*
+ * Determine how large our local alloc window should be, in bits.
+ *
+ * These values (and the behavior in ocfs2_alloc_should_use_local) have
+ * been chosen so that most allocations, including new block groups go
+ * through local alloc.
+ */
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+{
+	BUG_ON(osb->s_clustersize_bits < 12);
+
+	return 2048 >> (osb->s_clustersize_bits - 12);
+}
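+
+/* E.g. with 4K clusters (s_clustersize_bits == 12) the window is 2048
+ * bits and with 8K clusters it is 1024 bits -- either way it covers a
+ * constant 8 megabytes of disk space. */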
+
+/*
+ * Tell us whether a given allocation should use the local alloc
+ * file. Otherwise, it has to go to the main bitmap.
+ */
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
+{
+	int la_bits = ocfs2_local_alloc_window_bits(osb);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+		return 0;
+
+	/* la_bits should be at least twice the size (in clusters) of
+	 * a new block group. We want to be sure block group
+	 * allocations go through the local alloc, so allow an
+	 * allocation to take up to half the bitmap. */
+	if (bits > (la_bits / 2))
+		return 0;
+
+	return 1;
+}
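+
+/* E.g. with 4K clusters la_bits is 2048, so requests of up to 1024
+ * clusters (4 megabytes) are served from the local alloc window. */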
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb)
+{
+	int status = 0;
+	struct ocfs2_dinode *alloc = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	u32 num_used;
+	struct inode *inode = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	/* read the alloc off disk */
+	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (!(le32_to_cpu(alloc->i_flags) &
+	    (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
+		mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if ((la->la_size == 0) ||
+	    (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
+		mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
+		     le16_to_cpu(la->la_size));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* do a little verification. */
+	num_used = ocfs2_local_alloc_count_bits(alloc);
+
+	/* hopefully the local alloc has always been recovered before
+	 * we load it. */
+	if (num_used
+	    || alloc->id1.bitmap1.i_used
+	    || alloc->id1.bitmap1.i_total
+	    || la->la_bm_off)
+		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+		     "found = %u, set = %u, taken = %u, off = %u\n",
+		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
+		     le32_to_cpu(alloc->id1.bitmap1.i_total),
+		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+
+	osb->local_alloc_bh = alloc_bh;
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+
+bail:
+	if (status < 0 && alloc_bh)
+		brelse(alloc_bh);
+	if (inode)
+		iput(inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Return any unused bits to the bitmap and write out a clean
+ * local_alloc.
+ *
+ * This uses the local alloc buffer cached off the osb
+ * (osb->local_alloc_bh); be warned that it *will* be brelse'd and
+ * NULL'd out.
+ */
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *local_alloc_inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_dinode *alloc = NULL;
+
+	mlog_entry_void();
+
+	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
+		goto bail;
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->local_alloc_state = OCFS2_LA_DISABLED;
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* WINDOW_MOVE_CREDITS is a bit heavy... */
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		handle = NULL;
+		goto bail;
+	}
+
+	bh = osb->local_alloc_bh;
+	alloc = (struct ocfs2_dinode *) bh->b_data;
+
+	alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	brelse(bh);
+	osb->local_alloc_bh = NULL;
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	kfree(alloc_copy);
+
+	mlog_exit_void();
+}
+
+/*
+ * We want to free the bitmap bits outside of any recovery context as
+ * we'll need a cluster lock to do so, but we must clear the local
+ * alloc before giving up the recovered node's journal. To solve this,
+ * we kmalloc a copy of the local alloc before it's changed, for the
+ * caller to process with ocfs2_complete_local_alloc_recovery.
+ */
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int slot_num,
+				     struct ocfs2_dinode **alloc_copy)
+{
+	int status = 0;
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *alloc;
+
+	mlog_entry("(slot_num = %d)\n", slot_num);
+
+	*alloc_copy = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode->i_sem);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
+	if (!(*alloc_copy)) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_write_block(osb, alloc_bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if ((status < 0) && (*alloc_copy)) {
+		kfree(*alloc_copy);
+		*alloc_copy = NULL;
+	}
+
+	if (alloc_bh)
+		brelse(alloc_bh);
+
+	if (inode) {
+		up(&inode->i_sem);
+		iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Step 2: By now, we've completed the journal recovery, we've stamped
+ * a clean local alloc on disk and dropped the node out of the
+ * recovery map. Dlm locks will no longer stall, so lets clear out the
+ * main bitmap.
+ */
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we want the bitmap change to be recorded on disk asap */
+	ocfs2_handle_set_sync(handle, 1);
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Make sure we've got at least bits_wanted contiguous bits in the
+ * local alloc. You lose them when you drop i_sem.
+ *
+ * We will add ourselves to the transaction passed in, but may start
+ * our own in order to shift windows.
+ */
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac)
+{
+	int status;
+	struct ocfs2_dinode *alloc;
+	struct inode *local_alloc_inode;
+	unsigned int free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(!passed_handle);
+	BUG_ON(!ac);
+	BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
+		mlog(0, "Asking for more than my max window size!\n");
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
+	    ocfs2_local_alloc_count_bits(alloc)) {
+		ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
+			    "%u used bits, but a count shows %u",
+			    le64_to_cpu(alloc->i_blkno),
+			    le32_to_cpu(alloc->id1.bitmap1.i_used),
+			    ocfs2_local_alloc_count_bits(alloc));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(alloc->id1.bitmap1.i_used);
+	if (bits_wanted > free_bits) {
+		/* uhoh, window change time. */
+		status =
+			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	ac->ac_inode = igrab(local_alloc_inode);
+	get_bh(osb->local_alloc_bh);
+	ac->ac_bh = osb->local_alloc_bh;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
+	status = 0;
+bail:
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits)
+{
+	int status, start;
+	struct inode *local_alloc_inode;
+	u32 bits_wanted;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		/* TODO: Shouldn't we just BUG here? */
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bitmap = la->la_bitmap;
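+	/* Translate the window-relative offset into a cluster number in
+	 * the global bitmap -- the window begins at cluster la_bm_off. */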
+	*bit_off = le32_to_cpu(la->la_bm_off) + start;
+	/* local alloc is always contiguous by nature -- we never
+	 * delete bits from it! */
+	*num_bits = bits_wanted;
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (bits_wanted--)
+		ocfs2_set_bit(start++, bitmap);
+
+	alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
+				le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
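+/*
+ * Count the bits set in the local alloc bitmap by summing the
+ * population count of each byte; used to cross-check i_used.
+ */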
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
+{
+	int i;
+	u8 *buffer;
+	u32 count = 0;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry_void();
+
+	buffer = la->la_bitmap;
+	for (i = 0; i < le16_to_cpu(la->la_size); i++)
+		count += hweight8(buffer[i]);
+
+	mlog_exit(count);
+	return count;
+}
+
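+/*
+ * Linear scan of the window bitmap for a run of 'numbits' contiguous
+ * zero bits. Returns the bit offset at which the run starts, or -1 if
+ * no large enough run exists.
+ */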
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits)
+{
+	int numfound, bitoff, left, startoff, lastzero;
+	void *bitmap = NULL;
+
+	mlog_entry("(numbits wanted = %u)\n", numbits);
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "No bits in my window!\n");
+		bitoff = -1;
+		goto bail;
+	}
+
+	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
+
+	numfound = bitoff = startoff = 0;
+	lastzero = -1;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff))
+	       != -1) {
+		if (bitoff == left) {
+			/* mlog(0, "bitoff (%d) == left", bitoff); */
+			break;
+		}
+		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
+		   "numfound = %d\n", bitoff, startoff, numfound);*/
+
+		/* Ok, we found a zero bit... is it contiguous with the
+		 * current run, or do we start over? */
+		if (bitoff == startoff) {
+			/* we found a zero */
+			numfound++;
+			startoff++;
+		} else {
+			/* got a zero after some ones */
+			numfound = 1;
+			startoff = bitoff+1;
+		}
+		/* we got everything we needed */
+		if (numfound == numbits) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
+	     numfound);
+
+	if (numfound == numbits)
+		bitoff = startoff - numfound;
+	else
+		bitoff = -1;
+
+bail:
+	mlog_exit(bitoff);
+	return bitoff;
+}
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
+{
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+	int i;
+	mlog_entry_void();
+
+	alloc->id1.bitmap1.i_total = 0;
+	alloc->id1.bitmap1.i_used = 0;
+	la->la_bm_off = 0;
+	for (i = 0; i < le16_to_cpu(la->la_size); i++)
+		la->la_bitmap[i] = 0;
+
+	mlog_exit_void();
+}
+
+#if 0
+/* turn this on and uncomment below to aid debugging window shifts. */
+static void ocfs2_verify_zero_bits(unsigned long *bitmap,
+				   unsigned int start,
+				   unsigned int count)
+{
+	unsigned int tmp = count;
+	while (tmp--) {
+		if (ocfs2_test_bit(start + tmp, bitmap)) {
+			printk("ocfs2_verify_zero_bits: start = %u, count = "
+			       "%u\n", start, count);
+			printk("ocfs2_verify_zero_bits: bit %u is set!",
+			       start + tmp);
+			BUG();
+		}
+	}
+}
+#endif
+
+/*
+ * sync the local alloc to main bitmap.
+ *
+ * assumes you've already locked the main bitmap -- the bitmap inode
+ * passed is used for caching.
+ */
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh)
+{
+	int status = 0;
+	int bit_off, left, count, start;
+	u64 la_start_blk;
+	u64 blkno;
+	void *bitmap;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry("total = %u, COUNT = %u, used = %u\n",
+		   le32_to_cpu(alloc->id1.bitmap1.i_total),
+		   ocfs2_local_alloc_count_bits(alloc),
+		   le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "nothing to sync!\n");
+		goto bail;
+	}
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
+	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
+		mlog(0, "all bits were taken!\n");
+		goto bail;
+	}
+
+	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(la->la_bm_off));
+	bitmap = la->la_bitmap;
+	start = count = bit_off = 0;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+
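+	/* Zero bits in the window are clusters which were reserved from
+	 * the global bitmap but never handed out, so each contiguous
+	 * run of zero bits gets freed back to the main bitmap. */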
+	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
+	       != -1) {
+		if ((bit_off < left) && (bit_off == start)) {
+			count++;
+			start++;
+			continue;
+		}
+		if (count) {
+			blkno = la_start_blk +
+				ocfs2_clusters_to_blocks(osb->sb,
+							 start - count);
+
+			mlog(0, "freeing %u bits starting at local "
+			     "alloc bit %u (la_start_blk = %"MLFu64", "
+			     "blkno = %"MLFu64")\n", count, start - count,
+			     la_start_blk, blkno);
+
+			status = ocfs2_free_clusters(handle, main_bm_inode,
+						     main_bm_bh, blkno, count);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		if (bit_off >= left)
+			break;
+		count = 1;
+		start = bit_off + 1;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh)
+{
+	int status;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+
+	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	*bitmap_inode = (*ac)->ac_inode;
+	igrab(*bitmap_inode);
+	*bitmap_bh = (*ac)->ac_bh;
+	get_bh(*bitmap_bh);
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Claim clusters for a new window. The main bitmap must already be
+ * locked -- ocfs2_local_alloc_reserve_for_window() takes that lock
+ * for us.
+ */
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	u32 cluster_off, cluster_count;
+	struct ocfs2_dinode *alloc = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (alloc->id1.bitmap1.i_total)
+		mlog(0, "asking me to alloc a new window over a non-empty "
+		     "one\n");
+
+	mlog(0, "Allocating %u clusters for a new window.\n",
+	     ocfs2_local_alloc_window_bits(osb));
+	/* we used the generic suballoc reserve function, but we set
+	 * everything up nicely, so there's no reason why we can't use
+	 * the more specific cluster api to claim bits. */
+	status = ocfs2_claim_clusters(osb, handle, ac,
+				      ocfs2_local_alloc_window_bits(osb),
+				      &cluster_off, &cluster_count);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	la->la_bm_off = cpu_to_le32(cluster_off);
+	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
+	/* just in case... In the future when we find space ourselves,
+	 * we don't have to get all contiguous -- but we'll have to
+	 * set all previously used bits in bitmap and update
+	 * la_bits_set before setting the bits in the main bitmap. */
+	alloc->id1.bitmap1.i_used = 0;
+	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
+	       le16_to_cpu(la->la_size));
+
+	mlog(0, "New window allocated:\n");
+	mlog(0, "window la_bm_off = %u\n",
+	     le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off));
+	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Note that we do *NOT* lock the local alloc inode here as
+ * it's been locked already for us. */
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode)
+{
+	int status = 0;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_alloc_context *ac = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* This will lock the main bitmap for us. */
+	status = ocfs2_local_alloc_reserve_for_window(osb,
+						      handle,
+						      &ac,
+						      &main_bm_inode,
+						      &main_bm_bh);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	/* We want to clear the local alloc before doing anything
+	 * else, so that if we error later during this operation,
+	 * local alloc shutdown won't try to double free main bitmap
+	 * bits. Make a copy so the sync function knows which bits to
+	 * free. */
+	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_local_alloc_new_window(osb, handle, ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_inc(&osb->alloc_stats.moves);
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	kfree(alloc_copy);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	mlog_exit(status);
+	return status;
+}
+

+ 56 - 0
fs/ocfs2/localalloc.h

@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_LOCALALLOC_H
+#define OCFS2_LOCALALLOC_H
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb);
+
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int node_num,
+				     struct ocfs2_dinode **alloc_copy);
+
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc);
+
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
+				 u64 bits);
+
+struct ocfs2_alloc_context;
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac);
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits);
+
+#endif /* OCFS2_LOCALALLOC_H */

+ 102 - 0
fs/ocfs2/mmap.c

@@ -0,0 +1,102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mmap.c
+ *
+ * Code to deal with the mess that is clustered mmap.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/signal.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int *type)
+{
+	struct inode *inode = area->vm_file->f_dentry->d_inode;
+	struct page *page = NOPAGE_SIGBUS;
+	sigset_t blocked, oldset;
+	int ret;
+
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+
+	/* The best way to deal with signals in this path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(&blocked);
+
+	/* sigprocmask() should technically never fail here, but
+	 * check the return value anyway. */
+	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	page = filemap_nopage(area, address, type);
+
+	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+out:
+	mlog_exit_ptr(page);
+	return page;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage = ocfs2_nopage,
+};
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	/* We don't want to support shared writable mappings yet. */
+	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
+	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
+		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
+		/* This is -EINVAL because generic_file_readonly_mmap
+		 * returns it in a similar situation. */
+		return -EINVAL;
+	}
+
+	update_atime(inode);
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+

+ 6 - 0
fs/ocfs2/mmap.h

@@ -0,0 +1,6 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif  /* OCFS2_MMAP_H */

+ 2264 - 0
fs/ocfs2/namei.c

@@ -0,0 +1,2264 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise Pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
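+/* Directory search readahead (as in ext3): read ahead NAMEI_RA_BLOCKS
+ * blocks at a time, keeping up to NAMEI_RA_CHUNKS chunks in flight. */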
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir);
+
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh);
+
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh);
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+				  struct dentry *dentry,
+				  struct inode *inode, u64 blkno,
+				  struct buffer_head *parent_fe_bh,
+				  struct buffer_head *insert_bh)
+{
+	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+				 dentry->d_name.name, dentry->d_name.len,
+				 inode, blkno, parent_fe_bh, insert_bh);
+}
+
+/* An orphan dir name is an 8 byte value, printed as a hex string */
+#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+
+static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	int status;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct inode *inode = NULL;
+	struct dentry *ret;
+	struct ocfs2_dir_entry *dirent;
+	struct ocfs2_inode_info *oi;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
+		ret = ERR_PTR(-ENAMETOOLONG);
+		goto bail;
+	}
+
+	mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
+	     dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0)
+		goto bail_add;
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		ret = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	oi = OCFS2_I(inode);
+	/* Clear any orphaned state... If we were able to look up the
+	 * inode from a directory, it certainly can't be orphaned. We
+	 * might have the bad state from a node which intended to
+	 * orphan this inode but crashed before it could commit the
+	 * unlink. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&oi->ip_lock);
+
+bail_add:
+
+	dentry->d_op = &ocfs2_dentry_ops;
+	ret = d_splice_alias(inode, dentry);
+
+bail_unlock:
+	/* Don't drop the cluster lock until *after* the d_add --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the vote thread and have a stale dentry. */
+	ocfs2_meta_unlock(dir, 0);
+
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit_ptr(ret);
+
+	return ret;
+}
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+	status = ocfs2_journal_access(handle, inode, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
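+	/* Hand-roll the "." and ".." entries: "." takes a minimal
+	 * rec_len while ".." consumes the remainder of the block. */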
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode,
+		       dev_t dev)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dirfe;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	/* get our super block */
+	osb = OCFS2_SB(dir->i_sb);
+
+	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+		mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
+		     OCFS2_I(dir)->ip_blkno, dir->i_nlink);
+		status = -EMLINK;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* are we making a directory? If so, reserve a cluster for its
+	 * 1st extent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(mode)) {
+		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
+					    new_fe_bh, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+
+		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		le16_add_cpu(&dirfe->i_links_count, 1);
+		status = ocfs2_journal_dirty(handle, parent_fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		dir->i_nlink++;
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+leave:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+
+	if (de_bh)
+		brelse(de_bh);
+
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	if ((status < 0) && inode)
+		iput(inode);
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_extent_list *fel;
+	u64 fe_blkno = 0;
+	u16 suballoc_bit;
+	struct inode *inode = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	*new_fe_bh = NULL;
+	*ret_inode = NULL;
+
+	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+				       &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		/* new_inode() returns NULL on failure, not an ERR_PTR */
+		status = -ENOMEM;
+		mlog(ML_ERROR, "new_inode failed!\n");
+		goto leave;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
+	OCFS2_I(inode)->ip_blkno = fe_blkno;
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_mode = mode;
+	spin_lock(&osb->osb_lock);
+	inode->i_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
+	if (!*new_fe_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+
+	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
+	memset(fe, 0, osb->sb->s_blocksize);
+
+	fe->i_generation = cpu_to_le32(inode->i_generation);
+	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
+	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
+	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+	fe->i_uid = cpu_to_le32(current->fsuid);
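+	/* Inherit the gid (and, for directories, the setgid bit) from
+	 * a setgid parent, per the usual Unix semantics. */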
+	if (dir->i_mode & S_ISGID) {
+		fe->i_gid = cpu_to_le32(dir->i_gid);
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		fe->i_gid = cpu_to_le32(current->fsgid);
+	fe->i_mode = cpu_to_le16(mode);
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
+
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	fe->i_last_eb_blk = 0;
+	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
+	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_atime = fe->i_ctime = fe->i_mtime =
+		cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
+		cpu_to_le32(CURRENT_TIME.tv_nsec);
+	fe->i_dtime = 0;
+
+	fel = &fe->id2.i_list;
+	fel->l_tree_depth = 0;
+	fel->l_next_free_rec = 0;
+	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+
+	status = ocfs2_journal_dirty(handle, *new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
+		     "i_blkno=%"MLFu64", i_ino=%lu\n",
+		     (unsigned long long) (*new_fe_bh)->b_blocknr,
+		     fe->i_blkno, inode->i_ino);
+		BUG();
+	}
+
+	ocfs2_inode_set_new(osb, inode);
+	status = ocfs2_create_new_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0; /* error in ocfs2_create_new_inode_locks is not
+		     * critical */
+
+	*ret_inode = inode;
+leave:
+	if (status < 0) {
+		if (*new_fe_bh) {
+			brelse(*new_fe_bh);
+			*new_fe_bh = NULL;
+		}
+		if (inode)
+			iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mkdir(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (S_ISDIR(inode->i_mode)) {
+		err = -EPERM;
+		goto bail;
+	}
+
+	if (inode->i_nlink >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		err = -ENOMEM;
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					dentry->d_name.len);
+	if (err)
+		goto bail;
+
+	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					   dentry->d_name.name,
+					   dentry->d_name.len, &de_bh);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_journal_access(handle, inode, fe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	inode->i_nlink++;
+	inode->i_ctime = CURRENT_TIME;
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	err = ocfs2_journal_dirty(handle, fe_bh);
+	if (err < 0) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_add_entry(handle, dentry, inode,
+			      OCFS2_I(inode)->ip_blkno,
+			      parent_fe_bh, de_bh);
+	if (err) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	atomic_inc(&inode->i_count);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (de_bh)
+		brelse(de_bh);
+	if (fe_bh)
+		brelse(fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	unsigned int saved_nlink = 0;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	u64 blkno;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_node_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+	struct buffer_head *dirent_bh = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	BUG_ON(dentry->d_parent->d_inode != dir);
+
+	mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	if (inode == osb->root_inode) {
+		mlog(0, "Cannot delete the root directory\n");
+		status = -EPERM;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (OCFS2_I(inode)->ip_blkno != blkno) {
+		status = -ENOENT;
+
+		mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
+		     "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
+		     OCFS2_I(inode)->ip_flags);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
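+	/* A directory may only be unlinked once it is empty: nothing
+	 * but "." and ".." remain, which also implies i_nlink == 2. */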
+	if (S_ISDIR(inode->i_mode)) {
+		if (!ocfs2_empty_dir(inode)) {
+			status = -ENOTEMPTY;
+			goto leave;
+		} else if (inode->i_nlink != 2) {
+			status = -ENOTEMPTY;
+			goto leave;
+		}
+	}
+
+	/* There are still a few steps left until we can consider the
+	 * unlink to have succeeded. Save off nlink here before
+	 * modification so we can set it back in case we hit an issue
+	 * before commit. */
+	saved_nlink = inode->i_nlink;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 0;
+	else
+		inode->i_nlink--;
+
+	status = ocfs2_request_unlink_vote(inode, dentry,
+					   (unsigned int) inode->i_nlink);
+	if (status < 0) {
+		/* This vote should succeed under all normal
+		 * circumstances. */
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!inode->i_nlink) {
+		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
+						  orphan_name,
+						  &orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (!inode->i_nlink) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+					  orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* delete the name from the parent dir */
+	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We can set nlink on the dinode now. clear the saved version
+	 * so that it doesn't get set later. */
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	saved_nlink = 0;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		dir->i_nlink--;
+		status = ocfs2_mark_inode_dirty(handle, dir,
+						parent_node_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			dir->i_nlink++;
+		}
+	}
+
+leave:
+	if (status < 0 && saved_nlink)
+		inode->i_nlink = saved_nlink;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (fe_bh)
+		brelse(fe_bh);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	if (parent_node_bh)
+		brelse(parent_node_bh);
+
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * The only place this should be used is rename!
+ * if they have the same id, then the 1st one is the only one locked.
+ */
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2)
+{
+	int status;
+	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
+	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
+	struct buffer_head **tmpbh;
+	struct inode *tmpinode;
+
+	mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
+		   oi1->ip_blkno, oi2->ip_blkno);
+
+	BUG_ON(!handle);
+
+	if (*bh1)
+		*bh1 = NULL;
+	if (*bh2)
+		*bh2 = NULL;
+
+	/* we always want to lock the one with the lower lockid first. */
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		if (oi1->ip_blkno < oi2->ip_blkno) {
+			/* switch id1 and id2 around */
+			mlog(0, "switching them around...\n");
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2 */
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* lock id1 */
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
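+/*
+ * In the first block of a directory, ".." immediately follows ".", so
+ * the parent inode number is found by skipping over the first entry's
+ * rec_len.
+ */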
+#define PARENT_INO(buffer) \
+	((struct ocfs2_dir_entry *) \
+	 ((char *)buffer + \
+	  le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
+							       // and new_dentry
+	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
+	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
+						    // this is the 1st dirent bh
+	nlink_t old_dir_nlink = old_dir->i_nlink;
+	nlink_t new_dir_nlink = new_dir->i_nlink;
+	unsigned int links_count;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+		   old_dir, old_dentry, new_dir, new_dentry,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	if (atomic_read(&old_dentry->d_count) > 2) {
+		shrink_dcache_parent(old_dentry);
+		if (atomic_read(&old_dentry->d_count) > 2) {
+			status = -EBUSY;
+			goto bail;
+		}
+	}
+
+	/* Assume a directory hierarchy like so:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* if old and new are the same, this'll just do one lock. */
+	status = ocfs2_double_lock(osb, handle,
+				  &old_dir_bh, old_dir,
+				  &new_dir_bh, new_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		/* Directories actually require metadata updates to
+		 * the directory info so we can't get away with not
+		 * doing node locking on it. */
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = -EIO;
+		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
+		if (!old_inode_de_bh)
+			goto bail;
+
+		status = -EIO;
+		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
+		    OCFS2_I(old_dir)->ip_blkno)
+			goto bail;
+		status = -EMLINK;
+		if (!new_inode && new_dir != old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+			goto bail;
+	} else {
+		/* Ah, the simple case - we're a file so just send a
+		 * message. */
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = -ENOENT;
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh)
+		goto bail;
+
+	/*
+	 *  This inode number check is _not_ here to catch IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+		goto bail;
+
+	/* check if the target already exists (in which case we need
+	 * to delete it) */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!new_de && new_inode)
+		mlog(ML_ERROR, "inode %lu does not exist in its parent "
+		     "directory!\n", new_inode->i_ino);
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (new_de) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
+			     "disagree. ip_flags = %x\n",
+			     OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
+			     OCFS2_I(new_inode)->ip_flags);
+			goto bail;
+		}
+
+		status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			links_count = 0;
+		else
+			links_count = (unsigned int) (new_inode->i_nlink - 1);
+
+		status = ocfs2_request_unlink_vote(new_inode, new_dentry,
+						   links_count);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
+
+		mlog(0, "aha rename over existing... new_de=%p "
+		     "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
+		     new_de, newfe_blkno, newfe_bh, newfe_bh ?
+		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+
+		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
+			status = ocfs2_prepare_orphan_dir(osb, handle,
+							  new_inode,
+							  orphan_name,
+							  &orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+	} else {
+		BUG_ON(new_dentry->d_parent->d_inode != new_dir);
+
+		status = ocfs2_check_dir_for_entry(new_dir,
+						   new_dentry->d_name.name,
+						   new_dentry->d_name.len);
+		if (status)
+			goto bail;
+
+		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
+						      new_dentry->d_name.name,
+						      new_dentry->d_name.len,
+						      &insert_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_de) {
+		if (S_ISDIR(new_inode->i_mode)) {
+			if (!ocfs2_empty_dir(new_inode) ||
+			    new_inode->i_nlink != 2) {
+				status = -ENOTEMPTY;
+				goto bail;
+			}
+		}
+		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode) ||
+		    (newfe->i_links_count == cpu_to_le16(1))){
+			status = ocfs2_orphan_add(osb, handle, new_inode,
+						  newfe, orphan_name,
+						  orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		/* change the dirent to point to the correct inode */
+		status = ocfs2_journal_access(handle, new_dir, new_de_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
+		new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		status = ocfs2_journal_dirty(handle, new_de_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			newfe->i_links_count = 0;
+		else
+			le16_add_cpu(&newfe->i_links_count, -1);
+
+		status = ocfs2_journal_dirty(handle, newfe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	} else {
+		/* if the name was not found in new_dir, add it now */
+		status = ocfs2_add_entry(handle, new_dentry, old_inode,
+					 OCFS2_I(old_inode)->ip_blkno,
+					 new_dir_bh, insert_entry_bh);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_inode);
+
+	/* now that the name has been added to new_dir, remove the old name */
+	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_inode) {
+		new_inode->i_nlink--;
+		new_inode->i_ctime = CURRENT_TIME;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+	if (old_inode_de_bh) {
+		status = ocfs2_journal_access(handle, old_inode,
+					     old_inode_de_bh,
+					     OCFS2_JOURNAL_ACCESS_WRITE);
+		PARENT_INO(old_inode_de_bh->b_data) =
+			cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
+		status = ocfs2_journal_dirty(handle, old_inode_de_bh);
+		old_dir->i_nlink--;
+		if (new_inode) {
+			new_inode->i_nlink--;
+		} else {
+			new_dir->i_nlink++;
+			mark_inode_dirty(new_dir);
+		}
+	}
+	mark_inode_dirty(old_dir);
+	if (new_inode)
+		mark_inode_dirty(new_inode);
+
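+	/* If the rename changed either parent's link count, push the
+	 * new value out to the on-disk dinode as well. */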
+	if (old_dir != new_dir)
+		if (new_dir_nlink != new_dir->i_nlink) {
+			if (!new_dir_bh) {
+				mlog(ML_ERROR, "need to change nlink for new "
+				     "dir %"MLFu64" from %d to %d but bh is "
+				     "NULL\n", OCFS2_I(new_dir)->ip_blkno,
+				     (int)new_dir_nlink, new_dir->i_nlink);
+			} else {
+				struct ocfs2_dinode *fe;
+				status = ocfs2_journal_access(handle,
+							      new_dir,
+							      new_dir_bh,
+							      OCFS2_JOURNAL_ACCESS_WRITE);
+				fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
+				fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
+				status = ocfs2_journal_dirty(handle, new_dir_bh);
+			}
+		}
+
+	if (old_dir_nlink != old_dir->i_nlink) {
+		if (!old_dir_bh) {
+			mlog(ML_ERROR, "need to change nlink for old dir "
+			     "%"MLFu64" from %d to %d but bh is NULL!\n",
+			     OCFS2_I(old_dir)->ip_blkno,
+			     (int)old_dir_nlink,
+			     old_dir->i_nlink);
+		} else {
+			struct ocfs2_dinode *fe;
+			status = ocfs2_journal_access(handle, old_dir,
+						      old_dir_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
+			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+			status = ocfs2_journal_dirty(handle, old_dir_bh);
+		}
+	}
+
+	status = 0;
+bail:
+	if (rename_lock)
+		ocfs2_rename_unlock(osb);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (new_inode)
+		sync_mapping_buffers(old_inode->i_mapping);
+
+	if (new_inode)
+		iput(new_inode);
+	if (newfe_bh)
+		brelse(newfe_bh);
+	if (old_dir_bh)
+		brelse(old_dir_bh);
+	if (new_dir_bh)
+		brelse(new_dir_bh);
+	if (new_de_bh)
+		brelse(new_de_bh);
+	if (old_de_bh)
+		brelse(old_de_bh);
+	if (old_inode_de_bh)
+		brelse(old_inode_de_bh);
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+	if (insert_entry_bh)
+		brelse(insert_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * we expect i_size = strlen(symname). Copy symname into the file
+ * data, including the null terminator.
+ */
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname)
+{
+	struct buffer_head **bhs = NULL;
+	const char *c;
+	struct super_block *sb = osb->sb;
+	u64 p_blkno;
+	int p_blocks;
+	int virtual, blocks, status, i, bytes_left;
+
+	bytes_left = i_size_read(inode) + 1;
+	/* we can't trust i_blocks because we're actually going to
+	 * write i_size + 1 bytes. */
+	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
+		       inode->i_blocks, i_size_read(inode), blocks);
+
+	/* Sanity check -- make sure we're going to fit. */
+	if (bytes_left >
+	    ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
+					     &p_blocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* links can never be larger than one cluster so we know this
+	 * is all going to be contiguous, but do a sanity check
+	 * anyway. */
+	if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
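+	/* Copy the target into the file one block at a time; the
+	 * allocation is contiguous, so p_blkno is simply incremented. */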
+	virtual = 0;
+	while (bytes_left > 0) {
+		c = &symname[virtual * sb->s_blocksize];
+
+		bhs[virtual] = sb_getblk(sb, p_blkno);
+		if (!bhs[virtual]) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+
+		status = ocfs2_journal_access(handle, inode, bhs[virtual],
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
+
+		memcpy(bhs[virtual]->b_data, c,
+		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
+		       bytes_left);
+
+		status = ocfs2_journal_dirty(handle, bhs[virtual]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		virtual++;
+		p_blkno++;
+		bytes_left -= sb->s_blocksize;
+	}
+
+	status = 0;
+bail:
+
+	if (bhs) {
+		for (i = 0; i < blocks; i++)
+			if (bhs[i])
+				brelse(bhs[i]);
+		kfree(bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	int status, l, credits;
+	u64 newsize;
+	struct ocfs2_super *osb = NULL;
+	struct inode *inode = NULL;
+	struct super_block *sb;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_dinode *dirfe;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
+		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
+
+	sb = dir->i_sb;
+	osb = OCFS2_SB(sb);
+
+	l = strlen(symname) + 1;
+
+	credits = ocfs2_calc_symlink_credits(sb);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* lock the parent directory */
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto bail;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto bail;
+
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_mknod_locked(osb, dir, dentry,
+				    S_IFLNK | S_IRWXUGO, 0,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
+	inode->i_rdev = 0;
+	newsize = l - 1;
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		inode->i_op = &ocfs2_symlink_inode_operations;
+		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+						    handle, data_ac, NULL,
+						    NULL);
+		if (status < 0) {
+			if (status != -ENOSPC && status != -EINTR) {
+				mlog(ML_ERROR, "Failed to extend file to "
+					       "%"MLFu64"\n",
+				     newsize);
+				mlog_errno(status);
+				status = -ENOSPC;
+			}
+			goto bail;
+		}
+		i_size_write(inode, newsize);
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+	} else {
+		inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		memcpy((char *) fe->id2.i_symlink, symname, l);
+		i_size_write(inode, newsize);
+		inode->i_blocks = 0;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_fast_symlink(inode)) {
+		status = ocfs2_create_symlink_data(osb, handle, inode,
+						   symname);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+	if (de_bh)
+		brelse(de_bh);
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if ((status < 0) && inode)
+		iput(inode);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+int ocfs2_check_dir_entry(struct inode *dir,
+			  struct ocfs2_dir_entry *de,
+			  struct buffer_head *bh,
+			  unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->rec_len);
+
+	if (rlen < OCFS2_DIR_REC_LEN(1))
+		error_msg = "rec_len is smaller than minimal";
+	else if (rlen % 4 != 0)
+		error_msg = "rec_len % 4 != 0";
+	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+		error_msg = "rec_len is too small for name_len";
+	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+		error_msg = "directory entry across blocks";
+
+	if (error_msg != NULL)
+		mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
+		     "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
+		     OCFS2_I(dir)->ip_blkno, error_msg, offset,
+		     le64_to_cpu(de->inode), rlen, de->name_len);
+	return error_msg == NULL ? 1 : 0;
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * If you pass me insert_bh, I'll skip the search of the other dir
+ * blocks and put the record in there.
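+ *
+ * An in-use record with surplus space gets split: e.g. a record with
+ * a 3-char name and rec_len 64 is trimmed to OCFS2_DIR_REC_LEN(3) ==
+ * 16 and the trailing 48 bytes become the new entry's record.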
+ */
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh)
+{
+	unsigned long offset;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de, *de1;
+	struct super_block *sb;
+	int retval, status;
+
+	mlog_entry_void();
+
+	sb = dir->i_sb;
+
+	if (!namelen)
+		return -EINVAL;
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
+	while (1) {
+		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
+		/* These checks should've already been done by the
+		 * prepare function, but we leave them here as a
+		 * sanity check anyway. */
+		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+			retval = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			retval = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			status = ocfs2_journal_access(handle, dir, insert_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			/* By now the buffer is marked for journaling */
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				de1 = (struct ocfs2_dir_entry *)((char *) de +
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de1->rec_len =
+					cpu_to_le16(le16_to_cpu(de->rec_len) -
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+				de = de1;
+			}
+			de->file_type = OCFS2_FT_UNKNOWN;
+			if (blkno) {
+				de->inode = cpu_to_le64(blkno);
+				ocfs2_set_de_type(de, inode->i_mode);
+			} else
+				de->inode = 0;
+			de->name_len = namelen;
+			memcpy(de->name, name, namelen);
+
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, insert_bh);
+			retval = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	/* when you think about it, the assert above should prevent us
+	 * from ever getting here. */
+	retval = -ENOSPC;
+bail:
+
+	mlog_exit(retval);
+	return retval;
+}
+
+
+/*
+ * ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
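+ * (e.g. deleting B from [A rec_len=16][B rec_len=48] grows A's
+ * rec_len to 64); when the victim is the first record in the block,
+ * its inode is simply set to 0 instead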
+ */
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh)
+{
+	struct ocfs2_dir_entry *de, *pde;
+	int i, status = -ENOENT;
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+
+	i = 0;
+	pde = NULL;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+			status = -EIO;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (de == de_del)  {
+			status = ocfs2_journal_access(handle, dir, bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			if (pde)
+				pde->rec_len =
+					cpu_to_le16(le16_to_cpu(pde->rec_len) +
+						    le16_to_cpu(de->rec_len));
+			else
+				de->inode = 0;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, bh);
+			goto bail;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir)
+{
+	struct ocfs2_dir_entry *de;
+	char *dlimit, *de_buf;
+	int de_len;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	de_buf = bh->b_data;
+	dlimit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		de = (struct ocfs2_dir_entry *) de_buf;
+
+		if (de_buf + namelen <= dlimit &&
+		    ocfs2_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+				ret = -1;
+				goto bail;
+			}
+			*res_dir = de;
+			ret = 1;
+			goto bail;
+		}
+
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0) {
+			ret = -1;
+			goto bail;
+		}
+
+		de_buf += de_len;
+		offset += de_len;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+
+	mlog_entry_void();
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	start = OCFS2_I(dir)->ip_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+
+				/* XXX: questionable readahead stuff here */
+				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh_use[ra_max] = bh;
+#if 0		/* ??? */
+				if (bh)
+					ll_rw_block(READ, 1, &bh);
+#endif
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			brelse(bh);
+			goto next;
+		}
+		i = ocfs2_search_dirblock(bh, dir, name, namelen,
+					  block << sb->s_blocksize_bits,
+					  res_dir);
+		if (i == 1) {
+			OCFS2_I(dir)->ip_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
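+/* Orphan directory entries are named by the inode's block number,
+ * zero-padded to 16 hex characters; e.g. block 0x1234 becomes
+ * "0000000000001234". */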
+static int ocfs2_blkno_stringify(u64 blkno, char *name)
+{
+	int status, namelen;
+
+	mlog_entry_void();
+
+	namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
+			   blkno);
+	if (namelen <= 0) {
+		if (namelen)
+			status = namelen;
+		else
+			status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	if (namelen != OCFS2_ORPHAN_NAMELEN) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
+	     namelen);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_handle_add_inode(handle, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+					      orphan_dir_bh, name,
+					      OCFS2_ORPHAN_NAMELEN, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	struct ocfs2_dinode *orphan_fe;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_read_block(osb,
+				  OCFS2_I(orphan_dir_inode)->ip_blkno,
+				  &orphan_dir_bh, OCFS2_BH_CACHED,
+				  orphan_dir_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* we're a cluster filesystem, so nlink can change on disk
+	 * underneath us... */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, 1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+				   OCFS2_ORPHAN_NAMELEN, inode,
+				   OCFS2_I(inode)->ip_blkno,
+				   orphan_dir_bh, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+
+	/* Record which orphan dir our inode now resides
+	 * in. delete_inode will use this to determine which orphan
+	 * dir to lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
+	     OCFS2_I(inode)->ip_blkno, osb->slot_num);
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh)
+{
+	char name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct ocfs2_dinode *orphan_fe;
+	int status = 0;
+	struct buffer_head *target_de_bh = NULL;
+	struct ocfs2_dir_entry *target_de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
+	     name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
+
+	/* find its spot in the orphan directory */
+	target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
+					orphan_dir_inode, &target_de);
+	if (!target_de_bh) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* remove it from the orphan directory */
+	status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
+				    target_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the i_nlink dance! :) */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, -1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (target_de_bh)
+		brelse(target_de_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+struct inode_operations ocfs2_dir_iops = {
+	.create		= ocfs2_create,
+	.lookup		= ocfs2_lookup,
+	.link		= ocfs2_link,
+	.unlink		= ocfs2_unlink,
+	.rmdir		= ocfs2_unlink,
+	.symlink	= ocfs2_symlink,
+	.mkdir		= ocfs2_mkdir,
+	.mknod		= ocfs2_mknod,
+	.rename		= ocfs2_rename,
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};

+ 58 - 0
fs/ocfs2/namei.h

@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_NAMEI_H
+#define OCFS2_NAMEI_H
+
+extern struct inode_operations ocfs2_dir_iops;
+
+struct dentry *ocfs2_get_parent(struct dentry *child);
+
+int ocfs2_check_dir_entry(struct inode *dir,
+			  struct ocfs2_dir_entry *de,
+			  struct buffer_head *bh,
+			  unsigned long offset);
+struct buffer_head *ocfs2_find_entry(const char *name,
+				     int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir);
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh);
+
+static inline int ocfs2_match(int len,
+			      const char * const name,
+			      struct ocfs2_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+#endif /* OCFS2_NAMEI_H */

+ 109 - 0
fs/ocfs2/ocfs1_fs_compat.h

@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs1_fs_compat.h
+ *
+ * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
+ * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
+ * This allows an OCFS1 volume to see the partition and cleanly fail to
+ * mount it.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS1_FS_COMPAT_H
+#define _OCFS1_FS_COMPAT_H
+
+#define OCFS1_MAX_VOL_SIGNATURE_LEN          128
+#define OCFS1_MAX_MOUNT_POINT_LEN            128
+#define OCFS1_MAX_VOL_ID_LENGTH               16
+#define OCFS1_MAX_VOL_LABEL_LEN               64
+#define OCFS1_MAX_CLUSTER_NAME_LEN            64
+
+#define OCFS1_MAJOR_VERSION              (2)
+#define OCFS1_MINOR_VERSION              (0)
+#define OCFS1_VOLUME_SIGNATURE		 "OracleCFS"
+
+/*
+ * OCFS1 superblock.  Lives at sector 0.
+ */
+struct ocfs1_vol_disk_hdr
+{
+/*00*/	__u32 minor_version;
+	__u32 major_version;
+/*08*/	__u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
+/*88*/	__u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
+/*108*/	__u64 serial_num;
+/*110*/	__u64 device_size;
+	__u64 start_off;
+/*120*/	__u64 bitmap_off;
+	__u64 publ_off;
+/*130*/	__u64 vote_off;
+	__u64 root_bitmap_off;
+/*140*/	__u64 data_start_off;
+	__u64 root_bitmap_size;
+/*150*/	__u64 root_off;
+	__u64 root_size;
+/*160*/	__u64 cluster_size;
+	__u64 num_nodes;
+/*170*/	__u64 num_clusters;
+	__u64 dir_node_size;
+/*180*/	__u64 file_node_size;
+	__u64 internal_off;
+/*190*/	__u64 node_cfg_off;
+	__u64 node_cfg_size;
+/*1A0*/	__u64 new_cfg_off;
+	__u32 prot_bits;
+	__s32 excl_mount;
+/*1B0*/
+};
+
+
+struct ocfs1_disk_lock
+{
+/*00*/	__u32 curr_master;
+	__u8 file_lock;
+	__u8 compat_pad[3];  /* Not in original definition.  Used to
+				make the already existing alignment
+				explicit */
+	__u64 last_write_time;
+/*10*/	__u64 last_read_time;
+	__u32 writer_node_num;
+	__u32 reader_node_num;
+/*20*/	__u64 oin_node_map;
+	__u64 dlock_seq_num;
+/*30*/
+};
+
+/*
+ * OCFS1 volume label.  Lives at sector 1.
+ */
+struct ocfs1_vol_label
+{
+/*00*/	struct ocfs1_disk_lock disk_lock;
+/*30*/	__u8 label[OCFS1_MAX_VOL_LABEL_LEN];
+/*70*/	__u16 label_len;
+/*72*/	__u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
+/*82*/	__u16 vol_id_len;
+/*84*/	__u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
+/*A4*/	__u16 cluster_name_len;
+/*A6*/
+};
+
+
+#endif /* _OCFS1_FS_COMPAT_H */
+

+ 464 - 0
fs/ocfs2/ocfs2.h

@@ -0,0 +1,464 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2.h
+ *
+ * Defines macros and structures used in OCFS2
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_H
+#define OCFS2_H
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "ocfs2_fs.h"
+#include "endian.h"
+#include "ocfs2_lockid.h"
+
+struct ocfs2_extent_map {
+	u32		em_clusters;
+	struct rb_root	em_extents;
+};
+
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * amount of inlined blocks to be stored on an array and grow the
+ * structure into a rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+
+struct ocfs2_caching_info {
+	unsigned int		ci_num_cached;
+	union {
+		sector_t	ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+		struct rb_root	ci_tree;
+	} ci_cache;
+};
+
+/* this limits us to 256 nodes
+ * if we need more, we can do a kmalloc for the map */
+#define OCFS2_NODE_MAP_MAX_NODES    256
+struct ocfs2_node_map {
+	u16 num_nodes;
+	unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
+};
+		for (i = 0; i < blocks; i++)
+enum ocfs2_ast_action {
+	OCFS2_AST_INVALID = 0,
+	OCFS2_AST_ATTACH,
+	OCFS2_AST_CONVERT,
+	OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+	OCFS2_UNLOCK_INVALID = 0,
+	OCFS2_UNLOCK_CANCEL_CONVERT,
+	OCFS2_UNLOCK_DROP_LOCK,
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define OCFS2_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					       * downconvert*/
+#define OCFS2_LOCK_LOCAL         (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING    (0x00000020)
+#define OCFS2_LOCK_INITIALIZED   (0x00000040) /* track initialization
+					       * for shutdown paths */
+#define OCFS2_LOCK_FREEING       (0x00000080) /* help dlmglue track
+					       * when to skip queueing
+					       * a lock because it's
+					       * about to be
+					       * dropped. */
+#define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+
+struct ocfs2_lock_res_ops;
+
+typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+
+struct ocfs2_lock_res {
+	void                    *l_priv;
+	struct ocfs2_lock_res_ops *l_ops;
+	spinlock_t               l_lock;
+
+	struct list_head         l_blocked_list;
+	struct list_head         l_mask_waiters;
+
+	enum ocfs2_lock_type     l_type;
+	unsigned long		 l_flags;
+	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	/* used from AST/BAST funcs. */
+	enum ocfs2_ast_action    l_action;
+	enum ocfs2_unlock_action l_unlock_action;
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct list_head         l_debug_list;
+};
+
+struct ocfs2_dlm_debug {
+	struct kref d_refcnt;
+	struct dentry *d_locking_state;
+	struct list_head d_lockres_tracking;
+};
+
+enum ocfs2_vol_state
+{
+	VOLUME_INIT = 0,
+	VOLUME_MOUNTED,
+	VOLUME_DISMOUNTED,
+	VOLUME_DISABLED
+};
+
+struct ocfs2_alloc_stats
+{
+	atomic_t moves;
+	atomic_t local_data;
+	atomic_t bitmap_data;
+	atomic_t bg_allocs;
+	atomic_t bg_extends;
+};
+
+enum ocfs2_local_alloc_state
+{
+	OCFS2_LA_UNUSED = 0,
+	OCFS2_LA_ENABLED,
+	OCFS2_LA_DISABLED
+};
+
+enum ocfs2_mount_options
+{
+	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
+	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
+	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
+	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
+#endif
+};
+
+#define OCFS2_OSB_SOFT_RO	0x0001
+#define OCFS2_OSB_HARD_RO	0x0002
+#define OCFS2_OSB_ERROR_FS	0x0004
+
+struct ocfs2_journal;
+struct ocfs2_journal_handle;
+struct ocfs2_super
+{
+	u32 osb_id;		/* id used by the proc interface */
+	struct task_struct *commit_task;
+	struct super_block *sb;
+	struct inode *root_inode;
+	struct inode *sys_root_inode;
+	struct inode *system_inodes[NUM_SYSTEM_INODES];
+
+	struct ocfs2_slot_info *slot_info;
+
+	spinlock_t node_map_lock;
+	struct ocfs2_node_map mounted_map;
+	struct ocfs2_node_map recovery_map;
+	struct ocfs2_node_map umount_map;
+
+	u32 num_clusters;
+	u64 root_blkno;
+	u64 system_dir_blkno;
+	u64 bitmap_blkno;
+	u32 bitmap_cpg;
+	u8 *uuid;
+	char *uuid_str;
+	u8 *vol_label;
+	u64 first_cluster_group_blkno;
+	u32 fs_generation;
+
+	u32 s_feature_compat;
+	u32 s_feature_incompat;
+	u32 s_feature_ro_compat;
+
+	/* Protects s_next_generation, osb_flags. Could protect more on
+	 * osb as it's very short lived. */
+	spinlock_t osb_lock;
+	u32 s_next_generation;
+	unsigned long osb_flags;
+
+	unsigned long s_mount_opt;
+
+	u16 max_slots;
+	u16 num_nodes;
+	s16 node_num;
+	s16 slot_num;
+	int s_sectsize_bits;
+	int s_clustersize;
+	int s_clustersize_bits;
+	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
+
+	atomic_t vol_state;
+	struct semaphore recovery_lock;
+	struct task_struct *recovery_thread_task;
+	int disable_recovery;
+	wait_queue_head_t checkpoint_event;
+	atomic_t needs_checkpoint;
+	struct ocfs2_journal *journal;
+
+	enum ocfs2_local_alloc_state local_alloc_state;
+	struct buffer_head *local_alloc_bh;
+
+	/* Next two fields are for local node slot recovery during
+	 * mount. */
+	int dirty;
+	struct ocfs2_dinode *local_alloc_copy;
+
+	struct ocfs2_alloc_stats alloc_stats;
+	char dev_str[20];		/* "major,minor" of the device */
+
+	struct dlm_ctxt *dlm;
+	struct ocfs2_lock_res osb_super_lockres;
+	struct ocfs2_lock_res osb_rename_lockres;
+	struct dlm_eviction_cb osb_eviction_cb;
+	struct ocfs2_dlm_debug *osb_dlm_debug;
+
+	struct dentry *osb_debug_root;
+
+	wait_queue_head_t recovery_event;
+
+	spinlock_t vote_task_lock;
+	struct task_struct *vote_task;
+	wait_queue_head_t vote_event;
+	unsigned long vote_wake_sequence;
+	unsigned long vote_work_sequence;
+
+	struct list_head blocked_lock_list;
+	unsigned long blocked_lock_count;
+
+	struct list_head vote_list;
+	int vote_count;
+
+	u32 net_key;
+	spinlock_t net_response_lock;
+	unsigned int net_response_ids;
+	struct list_head net_response_list;
+
+	struct o2hb_callback_func osb_hb_up;
+	struct o2hb_callback_func osb_hb_down;
+
+	struct list_head	osb_net_handlers;
+
+	wait_queue_head_t		osb_mount_event;
+
+	/* Truncate log info */
+	struct inode			*osb_tl_inode;
+	struct buffer_head		*osb_tl_bh;
+	struct work_struct		osb_truncate_log_wq;
+};
+
+#define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
+#define OCFS2_MAX_OSB_ID             65536
+
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
+		return 0;
+	return 1;
+}
+
+/* set / clear functions because cluster events can make these happen
+ * in parallel so we want the transitions to be atomic. this also
+ * means that any future flags osb_flags must be protected by spinlock
+ * too! */
+static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
+				      unsigned long flag)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags |= flag;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
+				     int hard)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
+	if (hard)
+		osb->osb_flags |= OCFS2_OSB_HARD_RO;
+	else
+		osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+#define OCFS2_IS_VALID_DINODE(ptr)					\
+	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
+	typeof(__di) ____di = (__di);					\
+	ocfs2_error((__sb), 						\
+		"Dinode # %"MLFu64" has bad signature %.*s",		\
+		(____di)->i_blkno, 7,					\
+		(____di)->i_signature);					\
+} while (0)
+
+#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
+	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
+	typeof(__eb) ____eb = (__eb);					\
+	ocfs2_error((__sb), 						\
+		"Extent Block # %"MLFu64" has bad signature %.*s",	\
+		(____eb)->h_blkno, 7,					\
+		(____eb)->h_signature);					\
+} while (0)
+
+#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
+	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
+	typeof(__gd) ____gd = (__gd);					\
+	ocfs2_error((__sb),						\
+		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
+		(____gd)->bg_blkno, 7,					\
+		(____gd)->bg_signature);				\
+} while (0)
+
+static inline unsigned long ino_from_blkno(struct super_block *sb,
+					   u64 blkno)
+{
+	return (unsigned long)(blkno & (u64)ULONG_MAX);
+}
+
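+/* For the conversion helpers below: e.g. with 4k clusters
+ * (s_clustersize_bits == 12) and 1k blocks (s_blocksize_bits == 10),
+ * c_to_b_bits is 2, so one cluster spans four blocks. */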
+static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
+					   u32 clusters)
+{
+	int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u64)clusters << c_to_b_bits;
+}
+
+static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
+					   u64 blocks)
+{
+	int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u32)(blocks >> b_to_c_bits);
+}
+
+static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
+						    u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	bytes += OCFS2_SB(sb)->s_clustersize - 1;
+	/* OCFS2 just cannot have enough clusters to overflow this */
+	clusters = (unsigned int)(bytes >> cl_bits);
+
+	return clusters;
+}
+
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
+static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
+					  u32 clusters)
+{
+	return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
+						u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	clusters = ocfs2_clusters_for_bytes(sb, bytes);
+	return (u64)clusters << cl_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
+{
+	u64 blocks;
+
+	blocks = ocfs2_blocks_for_bytes(sb, bytes);
+	return blocks << sb->s_blocksize_bits;
+}
+
+static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
+{
+	return (unsigned long)((bytes + 511) >> 9);
+}
+
+#define ocfs2_set_bit ext2_set_bit
+#define ocfs2_clear_bit ext2_clear_bit
+#define ocfs2_test_bit ext2_test_bit
+#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#endif  /* OCFS2_H */
+

+ 638 - 0
fs/ocfs2/ocfs2_fs.h

@@ -0,0 +1,638 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_fs.h
+ *
+ * On-disk structures for OCFS2.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS2_FS_H
+#define _OCFS2_FS_H
+
+/* Version */
+#define OCFS2_MAJOR_REV_LEVEL		0
+#define OCFS2_MINOR_REV_LEVEL          	90
+
+/*
+ * An OCFS2 volume starts this way:
+ * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
+ * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
+ * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
+ *
+ * All other structures are found from the superblock information.
+ *
+ * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors.  eg, for a
+ * blocksize of 2K, it is 4096 bytes into disk.
+ */
+#define OCFS2_SUPER_BLOCK_BLKNO		2
+
+/*
+ * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
+ * grow if needed.
+ */
+#define OCFS2_MIN_CLUSTERSIZE		4096
+#define OCFS2_MAX_CLUSTERSIZE		1048576
+
+/*
+ * Blocks cannot be bigger than clusters, so the maximum blocksize is the
+ * minimum cluster size.
+ */
+#define OCFS2_MIN_BLOCKSIZE		512
+#define OCFS2_MAX_BLOCKSIZE		OCFS2_MIN_CLUSTERSIZE
+
+/* Filesystem magic number */
+#define OCFS2_SUPER_MAGIC		0x7461636f
+
+/* Object signatures */
+#define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
+#define OCFS2_INODE_SIGNATURE		"INODE01"
+#define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+
+/* Compatibility flags */
+#define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_compat & (mask) )
+#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
+#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_incompat & (mask) )
+#define OCFS2_SET_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat |= (mask)
+#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
+#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat |= (mask)
+#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat &= ~(mask)
+#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
+#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
+
+#define OCFS2_FEATURE_COMPAT_SUPP	0
+#define OCFS2_FEATURE_INCOMPAT_SUPP	0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	0
+
+/*
+ * Heartbeat-only devices are missing journals and other files.  The
+ * filesystem driver can't load them, but the library can.  Never put
+ * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
+ */
+#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV	0x0002
+
+
+/*
+ * Flags on ocfs2_dinode.i_flags
+ */
+#define OCFS2_VALID_FL		(0x00000001)	/* Inode is valid */
+#define OCFS2_UNUSED2_FL	(0x00000002)
+#define OCFS2_ORPHANED_FL	(0x00000004)	/* On the orphan list */
+#define OCFS2_UNUSED3_FL	(0x00000008)
+/* System inode flags */
+#define OCFS2_SYSTEM_FL		(0x00000010)	/* System inode */
+#define OCFS2_SUPER_BLOCK_FL	(0x00000020)	/* Super block */
+#define OCFS2_LOCAL_ALLOC_FL	(0x00000040)	/* Slot local alloc bitmap */
+#define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
+#define OCFS2_JOURNAL_FL	(0x00000100)	/* Slot local journal */
+#define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
+#define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
+#define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
+
+/*
+ * superblock s_state flags
+ */
+#define OCFS2_ERROR_FS		(0x00000001)	/* FS saw errors */
+
+/* Limit of space in ocfs2_dir_entry */
+#define OCFS2_MAX_FILENAME_LEN		255
+
+/* Maximum slots on an ocfs2 file system */
+#define OCFS2_MAX_SLOTS			255
+
+/* Slot map indicator for an empty slot */
+#define OCFS2_INVALID_SLOT		-1
+
+#define OCFS2_VOL_UUID_LEN		16
+#define OCFS2_MAX_VOL_LABEL_LEN		64
+
+/* Journal limits (in bytes) */
+#define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
+#define OCFS2_MAX_JOURNAL_SIZE		(500 * 1024 * 1024)
+
+struct ocfs2_system_inode_info {
+	char	*si_name;
+	int	si_iflags;
+	int	si_mode;
+};
+
+/* System file index */
+enum {
+	BAD_BLOCK_SYSTEM_INODE = 0,
+	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+	SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
+	HEARTBEAT_SYSTEM_INODE,
+	GLOBAL_BITMAP_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	ORPHAN_DIR_SYSTEM_INODE,
+	EXTENT_ALLOC_SYSTEM_INODE,
+	INODE_ALLOC_SYSTEM_INODE,
+	JOURNAL_SYSTEM_INODE,
+	LOCAL_ALLOC_SYSTEM_INODE,
+	TRUNCATE_LOG_SYSTEM_INODE,
+	NUM_SYSTEM_INODES
+};
+
+static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
+	/* Global system inodes (single copy) */
+	/* The first two are only used from userspace mkfs/tunefs */
+	[BAD_BLOCK_SYSTEM_INODE]		= { "bad_blocks", 0, S_IFREG | 0644 },
+	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	= { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+
+	/* These are used by the running filesystem */
+	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
+	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
+	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+
+	/* Slot-specific system inodes (one copy per slot) */
+	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
+	[EXTENT_ALLOC_SYSTEM_INODE]		= { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
+	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+};
+
+/* Parameter passed from mount.ocfs2 to module */
+#define OCFS2_HB_NONE			"heartbeat=none"
+#define OCFS2_HB_LOCAL			"heartbeat=local"
+
+/*
+ * OCFS2 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define OCFS2_FT_UNKNOWN	0
+#define OCFS2_FT_REG_FILE	1
+#define OCFS2_FT_DIR		2
+#define OCFS2_FT_CHRDEV		3
+#define OCFS2_FT_BLKDEV		4
+#define OCFS2_FT_FIFO		5
+#define OCFS2_FT_SOCK		6
+#define OCFS2_FT_SYMLINK	7
+
+#define OCFS2_FT_MAX		8
+
+/*
+ * OCFS2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define OCFS2_DIR_PAD			4
+#define OCFS2_DIR_ROUND			(OCFS2_DIR_PAD - 1)
+#define OCFS2_DIR_MEMBER_LEN 		offsetof(struct ocfs2_dir_entry, name)
+#define OCFS2_DIR_REC_LEN(name_len)	(((name_len) + OCFS2_DIR_MEMBER_LEN + \
+                                          OCFS2_DIR_ROUND) & \
+					 ~OCFS2_DIR_ROUND)
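+/* e.g. with the 12-byte fixed header of struct ocfs2_dir_entry,
+ * OCFS2_DIR_REC_LEN(1) == (1 + 12 + 3) & ~3 == 16 */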
+
+#define OCFS2_LINK_MAX		32000
+
+#define S_SHIFT			12
+static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
+	[S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
+};
+
+
+/*
+ * Convenience casts
+ */
+#define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
+
+/*
+ * On disk extent record for OCFS2
+ * It describes a range of clusters on disk.
+ */
+struct ocfs2_extent_rec {
+/*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
+	__le32 e_clusters;	/* Clusters covered by this extent */
+	__le64 e_blkno;		/* Physical disk offset, in blocks */
+/*10*/
+};
+
+struct ocfs2_chain_rec {
+	__le32 c_free;	/* Number of free bits in this chain. */
+	__le32 c_total;	/* Number of total bits in this chain */
+	__le64 c_blkno;	/* Physical disk offset (blocks) of 1st group */
+};
+
+struct ocfs2_truncate_rec {
+	__le32 t_start;		/* 1st cluster in this log */
+	__le32 t_clusters;	/* Number of total clusters covered */
+};
+
+/*
+ * On disk extent list for OCFS2 (node in the tree).  Note that this
+ * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
+ * offsets are relative to ocfs2_dinode.id2.i_list or
+ * ocfs2_extent_block.h_list, respectively.
+ */
+struct ocfs2_extent_list {
+/*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
+					   point.  0 means data extents
+					   hang directly off this
+					   header (a leaf) */
+	__le16 l_count;			/* Number of extent records */
+	__le16 l_next_free_rec;		/* Next unused extent slot */
+	__le16 l_reserved1;
+	__le64 l_reserved2;		/* Pad to
+					   sizeof(ocfs2_extent_rec) */
+/*10*/	struct ocfs2_extent_rec l_recs[0];	/* Extent records */
+};
+
+/*
+ * On disk allocation chain list for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_chain.
+ */
+struct ocfs2_chain_list {
+/*00*/	__le16 cl_cpg;			/* Clusters per Block Group */
+	__le16 cl_bpc;			/* Bits per cluster */
+	__le16 cl_count;		/* Total chains in this list */
+	__le16 cl_next_free_rec;	/* Next unused chain slot */
+	__le64 cl_reserved1;
+/*10*/	struct ocfs2_chain_rec cl_recs[0];	/* Chain records */
+};
+
+/*
+ * On disk deallocation log for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_dealloc.
+ */
+struct ocfs2_truncate_log {
+/*00*/	__le16 tl_count;		/* Total records in this log */
+	__le16 tl_used;			/* Number of records in use */
+	__le32 tl_reserved1;
+/*08*/	struct ocfs2_truncate_rec tl_recs[0];	/* Truncate records */
+};
+
+/*
+ * On disk extent block (indirect block) for OCFS2
+ */
+struct ocfs2_extent_block
+{
+/*00*/	__u8 h_signature[8];		/* Signature for verification */
+	__le64 h_reserved1;
+/*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
+					   extent_header belongs to */
+	__le16 h_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 h_fs_generation;		/* Must match super block */
+	__le64 h_blkno;			/* Offset on disk, in blocks */
+/*20*/	__le64 h_reserved3;
+	__le64 h_next_leaf_blk;		/* Offset on disk, in blocks,
+					   of next leaf header pointing
+					   to data */
+/*30*/	struct ocfs2_extent_list h_list;	/* Extent record list */
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk superblock for OCFS2
+ * Note that it is contained inside an ocfs2_dinode, so all offsets
+ * are relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_super_block {
+/*00*/	__le16 s_major_rev_level;
+	__le16 s_minor_rev_level;
+	__le16 s_mnt_count;
+	__le16 s_max_mnt_count;
+	__le16 s_state;			/* File system state */
+	__le16 s_errors;			/* Behaviour when detecting errors */
+	__le32 s_checkinterval;		/* Max time between checks */
+/*10*/	__le64 s_lastcheck;		/* Time of last check */
+	__le32 s_creator_os;		/* OS */
+	__le32 s_feature_compat;		/* Compatible feature set */
+/*20*/	__le32 s_feature_incompat;	/* Incompatible feature set */
+	__le32 s_feature_ro_compat;	/* Readonly-compatible feature set */
+	__le64 s_root_blkno;		/* Offset, in blocks, of root directory
+					   dinode */
+/*30*/	__le64 s_system_dir_blkno;	/* Offset, in blocks, of system
+					   directory dinode */
+	__le32 s_blocksize_bits;		/* Blocksize for this fs */
+	__le32 s_clustersize_bits;	/* Clustersize for this fs */
+/*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
+					   before tunefs required */
+	__le16 s_reserved1;
+	__le32 s_reserved2;
+	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
+					 * group header */
+/*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
+/*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
+/*A0*/
+};
+
+/*
+ * Local allocation bitmap for OCFS2 slots
+ * Note that it exists inside an ocfs2_dinode, so all offsets are
+ * relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_local_alloc
+{
+/*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
+	__le16 la_size;		/* Size of included bitmap, in bytes */
+	__le16 la_reserved1;
+	__le64 la_reserved2;
+/*10*/	__u8   la_bitmap[0];
+};
+
+/*
+ * On disk inode for OCFS2
+ */
+struct ocfs2_dinode {
+/*00*/	__u8 i_signature[8];		/* Signature for validation */
+	__le32 i_generation;		/* Generation number */
+	__le16 i_suballoc_slot;		/* Slot suballocator this inode
+					   belongs to */
+	__le16 i_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+/*10*/	__le32 i_reserved0;
+	__le32 i_clusters;		/* Cluster count */
+	__le32 i_uid;			/* Owner UID */
+	__le32 i_gid;			/* Owning GID */
+/*20*/	__le64 i_size;			/* Size in bytes */
+	__le16 i_mode;			/* File mode */
+	__le16 i_links_count;		/* Links count */
+	__le32 i_flags;			/* File flags */
+/*30*/	__le64 i_atime;			/* Access time */
+	__le64 i_ctime;			/* Creation time */
+/*40*/	__le64 i_mtime;			/* Modification time */
+	__le64 i_dtime;			/* Deletion time */
+/*50*/	__le64 i_blkno;			/* Offset on disk, in blocks */
+	__le64 i_last_eb_blk;		/* Pointer to last extent
+					   block */
+/*60*/	__le32 i_fs_generation;		/* Generation per fs-instance */
+	__le32 i_atime_nsec;
+	__le32 i_ctime_nsec;
+	__le32 i_mtime_nsec;
+/*70*/	__le64 i_reserved1[9];
+/*B8*/	union {
+		__le64 i_pad1;		/* Generic way to refer to this
+					   64bit union */
+		struct {
+			__le64 i_rdev;	/* Device number */
+		} dev1;
+		struct {		/* Info for bitmap system
+					   inodes */
+			__le32 i_used;	/* Bits (ie, clusters) used  */
+			__le32 i_total;	/* Total bits (clusters)
+					   available */
+		} bitmap1;
+		struct {		/* Info for journal system
+					   inodes */
+			__le32 ij_flags;	/* Mounted, version, etc. */
+			__le32 ij_pad;
+		} journal1;
+	} id1;				/* Inode type dependent 1 */
+/*C0*/	union {
+		struct ocfs2_super_block	i_super;
+		struct ocfs2_local_alloc	i_lab;
+		struct ocfs2_chain_list		i_chain;
+		struct ocfs2_extent_list	i_list;
+		struct ocfs2_truncate_log	i_dealloc;
+		__u8               		i_symlink[0];
+	} id2;
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On-disk directory entry structure for OCFS2
+ *
+ * Packed as this structure could be accessed unaligned on 64-bit platforms
+ */
+struct ocfs2_dir_entry {
+/*00*/	__le64   inode;                  /* Inode number */
+	__le16   rec_len;                /* Directory entry length */
+	__u8    name_len;               /* Name length */
+	__u8    file_type;
+/*0C*/	char    name[OCFS2_MAX_FILENAME_LEN];   /* File name */
+/* Actual on-disk length specified by rec_len */
+} __attribute__ ((packed));
+
+/*
+ * On disk allocator group structure for OCFS2
+ */
+struct ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__le16   bg_size;                /* Size of included bitmap in
+					   bytes. */
+	__le16   bg_bits;                /* Bits represented by this
+					   group. */
+	__le16	bg_free_bits_count;     /* Free bits count */
+	__le16   bg_chain;               /* What chain I am in. */
+/*10*/	__le32   bg_generation;
+	__le32	bg_reserved1;
+	__le64   bg_next_group;          /* Next group in my list, in
+					   blocks */
+/*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
+					   blocks */
+	__le64   bg_blkno;               /* Offset on disk, in blocks */
+/*30*/	__le64   bg_reserved2[2];
+/*40*/	__u8    bg_bitmap[0];
+};
+
+#ifdef __KERNEL__
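+/* The fast symlink area is whatever remains of the dinode block after
+ * the id2 union (offset 0xC0); e.g. a 4k block leaves
+ * 4096 - 192 == 3904 inline characters. */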
+static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
+{
+	return  sb->s_blocksize -
+		 offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
+{
+	u16 size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#else
+static inline int ocfs2_fast_symlink_chars(int blocksize)
+{
+	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline int ocfs2_extent_recs_per_eb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_local_alloc_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#endif  /* __KERNEL__ */
+
+
+static inline int ocfs2_system_inode_is_global(int type)
+{
+	return ((type >= 0) &&
+		(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
+}
+
+static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
+						  int type, int slot)
+{
+	int chars;
+
+        /*
+         * Global system inodes can only have one copy.  Everything
+         * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
+         * list has a copy per slot.
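+         *
+         * For example, type ORPHAN_DIR_SYSTEM_INODE with slot 3
+         * expands "orphan_dir:%04d" into "orphan_dir:0003".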
+         */
+	if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name);
+	else
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name,
+				 slot);
+
+	return chars;
+}
+
+static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
+				    umode_t mode)
+{
+	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+#endif  /* _OCFS2_FS_H */
+

+ 73 - 0
fs/ocfs2/ocfs2_lockid.h

@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockid.h
+ *
+ * Defines OCFS2 lockid bits.
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKID_H
+#define OCFS2_LOCKID_H
+
+/* lock ids are made up in the following manner:
+ * name[0]     --> type
+ * name[1-6]   --> 6 pad characters, reserved for now
+ * name[7-22]  --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex 8 chars
+ * name[31]    --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN  32
+#define OCFS2_LOCK_ID_PAD "000000"
+
+enum ocfs2_lock_type {
+	OCFS2_LOCK_TYPE_META = 0,
+	OCFS2_LOCK_TYPE_DATA,
+	OCFS2_LOCK_TYPE_SUPER,
+	OCFS2_LOCK_TYPE_RENAME,
+	OCFS2_LOCK_TYPE_RW,
+	OCFS2_NUM_LOCK_TYPES
+};
+
+static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
+{
+	char c;
+	switch (type) {
+		case OCFS2_LOCK_TYPE_META:
+			c = 'M';
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			c = 'D';
+			break;
+		case OCFS2_LOCK_TYPE_SUPER:
+			c = 'S';
+			break;
+		case OCFS2_LOCK_TYPE_RENAME:
+			c = 'R';
+			break;
+		case OCFS2_LOCK_TYPE_RW:
+			c = 'W';
+			break;
+		default:
+			c = '\0';
+	}
+
+	return c;
+}
+
+#endif  /* OCFS2_LOCKID_H */

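The name layout documented at the top of ocfs2_lockid.h can be reproduced with a single snprintf(). A hedged sketch follows; the format string is inferred from the layout comment (the kernel's actual formatting lives elsewhere, in dlmglue.c), while the two macros mirror the definitions above:

#include <stdio.h>

#define OCFS2_LOCK_ID_MAX_LEN	32	/* mirrors ocfs2_lockid.h */
#define OCFS2_LOCK_ID_PAD	"000000"

/* Build a lock name: type char, 6 pad chars, 16 hex digits of block
 * number, 8 hex digits of generation, then the trailing NUL. */
static void build_lock_name(char *name, char type,
			    unsigned long long blkno, unsigned int gen)
{
	snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
		 type, OCFS2_LOCK_ID_PAD, blkno, gen);
}

int main(void)
{
	char name[OCFS2_LOCK_ID_MAX_LEN];

	build_lock_name(name, 'M', 0x1234ULL, 0xdeadbeef);
	printf("%s\n", name);	/* 'M', 18 zeros, "1234", "deadbeef" */
	return 0;
}

The 1 + 6 + 16 + 8 characters plus the NUL account for exactly OCFS2_LOCK_ID_MAX_LEN bytes, which is why the comment reserves name[31] for '\0'.
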
+ 303 - 0
fs/ocfs2/slot_map.c

@@ -0,0 +1,303 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ *
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global);
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num);
+
+/* Use the slot information we've collected to create a map of mounted
+ * nodes. Should be holding an EX lock on the super block; this assumes
+ * the slot info is up to date. Note that we call this *after* we find
+ * a slot, so our own node should be set in the map too... */
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
+{
+	int i;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	spin_lock(&si->si_lock);
+
+	for (i = 0; i < si->si_size; i++)
+		if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
+			ocfs2_node_map_set_bit(osb, &osb->mounted_map,
+					      si->si_global_node_nums[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* copy the slot information on disk into our in-memory slot_info struct. */
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+	int i;
+	__le16 *disk_info;
+
+	/* we don't read the slot block here as ocfs2_super_lock
+	 * should've made sure we have the most recent copy. */
+	spin_lock(&si->si_lock);
+	disk_info = (__le16 *) si->si_bh->b_data;
+
+	for (i = 0; i < si->si_size; i++)
+		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* post our slot info into its destination bh and write it
+ * out. */
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si)
+{
+	int status, i;
+	__le16 *disk_info = (__le16 *) si->si_bh->b_data;
+
+	spin_lock(&si->si_lock);
+	for (i = 0; i < si->si_size; i++)
+		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/* try to find global node in the slot info. Returns
+ * OCFS2_INVALID_SLOT if nothing is found. */
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global)
+{
+	int i;
+	s16 ret = OCFS2_INVALID_SLOT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (global == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
+{
+	int i;
+	s16 ret = OCFS2_INVALID_SLOT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global)
+{
+	s16 ret;
+
+	spin_lock(&si->si_lock);
+	ret = __ocfs2_node_num_to_slot(si, global);
+	spin_unlock(&si->si_lock);
+	return ret;
+}
+
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	BUG_ON(slot_num >= si->si_num_slots);
+	BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
+	       (node_num >= O2NM_MAX_NODES));
+
+	si->si_global_node_nums[slot_num] = node_num;
+}
+
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num)
+{
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
+	spin_unlock(&si->si_lock);
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+	int status, i;
+	u64 blkno;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_slot_info *si;
+
+	si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+	if (!si) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock_init(&si->si_lock);
+	si->si_num_slots = osb->max_slots;
+	si->si_size = OCFS2_MAX_SLOTS;
+
+	for(i = 0; i < si->si_num_slots; i++)
+		si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+
+	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	si->si_inode = inode;
+	si->si_bh = bh;
+	osb->slot_info = si;
+bail:
+	if (status < 0 && si)
+		ocfs2_free_slot_info(si);
+
+	return status;
+}
+
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+	if (si->si_inode)
+		iput(si->si_inode);
+	if (si->si_bh)
+		brelse(si->si_bh);
+	kfree(si);
+}
+
+int ocfs2_find_slot(struct ocfs2_super *osb)
+{
+	int status;
+	s16 slot;
+	struct ocfs2_slot_info *si;
+
+	mlog_entry_void();
+
+	si = osb->slot_info;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn the user */
+	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+	if (slot == OCFS2_INVALID_SLOT) {
+		/* if no slot yet, then just take 1st available
+		 * one. */
+		slot = __ocfs2_find_empty_slot(si);
+		if (slot == OCFS2_INVALID_SLOT) {
+			spin_unlock(&si->si_lock);
+			mlog(ML_ERROR, "no free slots available!\n");
+			status = -EINVAL;
+			goto bail;
+		}
+	} else
+		mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
+		     slot);
+
+	__ocfs2_fill_slot(si, slot, osb->node_num);
+	osb->slot_num = slot;
+	spin_unlock(&si->si_lock);
+
+	mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_put_slot(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (!si)
+		return;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+	osb->slot_num = OCFS2_INVALID_SLOT;
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	osb->slot_info = NULL;
+	ocfs2_free_slot_info(si);
+}
+

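The on-disk slot map that ocfs2_update_slot_info() and ocfs2_update_disk_slots() shuttle back and forth is nothing more than an array of little-endian 16-bit node numbers indexed by slot. A toy userspace decoder, assuming -1 is the value behind OCFS2_INVALID_SLOT (an assumption here; the kernel uses le16_to_cpu() rather than open-coded byte math):

#include <stdio.h>
#include <stdint.h>

#define DEMO_SLOTS	4	/* stand-in; the real cap is OCFS2_MAX_SLOTS */
#define INVALID_SLOT	(-1)	/* assumed value of OCFS2_INVALID_SLOT */

/* Decode one little-endian 16-bit entry from a raw slot map block,
 * the same layout ocfs2_update_slot_info() reads. */
static int16_t slot_entry(const uint8_t *block, int slot)
{
	return (int16_t)(block[2 * slot] | (block[2 * slot + 1] << 8));
}

int main(void)
{
	uint8_t block[2 * DEMO_SLOTS] = { 0xff, 0xff, 0x03, 0x00,
					  0xff, 0xff, 0xff, 0xff };
	for (int i = 0; i < DEMO_SLOTS; i++) {
		int16_t node = slot_entry(block, i);
		if (node == INVALID_SLOT)
			printf("slot %d: empty\n", i);
		else
			printf("slot %d: node %d\n", i, node);
	}
	return 0;
}

Here slot 1 is owned by node 3 and every other slot is free, which is the state a single-node mount would leave behind.
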
+ 66 - 0
fs/ocfs2/slot_map.h

@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.h
+ *
+ * Routines and structures for managing the OCFS2 node slot map.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+struct ocfs2_slot_info {
+	spinlock_t si_lock;
+
+	struct inode *si_inode;
+	struct buffer_head *si_bh;
+	unsigned int si_num_slots;
+	unsigned int si_size;
+	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+};
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb);
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+
+int ocfs2_find_slot(struct ocfs2_super *osb);
+void ocfs2_put_slot(struct ocfs2_super *osb);
+
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si);
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global);
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num);
+
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
+
+static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
+				      int slot_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	assert_spin_locked(&si->si_lock);
+
+	return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
+}
+
+#endif

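Putting the two files together, mount-time slot claiming boils down to: reuse our slot if the map still lists our node, otherwise take the first empty one. A userspace model of that policy (stand-in types; the kernel version does this under si->si_lock and then writes the map back with ocfs2_update_disk_slots()):

#include <stdio.h>

#define INVALID (-1)

/* Toy model of the slot-claiming policy in ocfs2_find_slot(): reuse
 * our old slot if the map still lists us, otherwise grab the first
 * empty slot. */
static int find_slot(int *map, int nslots, int me)
{
	int i, empty = INVALID;

	for (i = 0; i < nslots; i++) {
		if (map[i] == me)
			return i;		/* already ours: reuse it */
		if (empty == INVALID && map[i] == INVALID)
			empty = i;		/* remember first free slot */
	}
	if (empty != INVALID)
		map[empty] = me;		/* claim it */
	return empty;				/* INVALID if map is full */
}

int main(void)
{
	int map[4] = { 7, INVALID, INVALID, 2 };

	printf("%d\n", find_slot(map, 4, 2));	/* 3: node 2 reuses slot 3 */
	printf("%d\n", find_slot(map, 4, 9));	/* 1: first empty slot */
	return 0;
}

One difference from this sketch: when the map is full, ocfs2_find_slot() fails the mount with -EINVAL instead of handing back an invalid slot.
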
+ 1651 - 0
fs/ocfs2/suballoc.c

@@ -0,0 +1,1651 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ * Inspired by ext3 block groups.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh);
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac);
+
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found);
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found);
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno);
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr);
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found);
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits);
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits);
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain);
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted);
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count);
+static inline u64 ocfs2_which_suballoc_group(u64 block,
+					     unsigned int bit);
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off);
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster);
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off);
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+	if (ac->ac_inode)
+		iput(ac->ac_inode);
+	if (ac->ac_bh)
+		brelse(ac->ac_bh);
+	kfree(ac);
+}
+
+static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
+{
+	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
+}
+
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl)
+{
+	int status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+
+	mlog_entry_void();
+
+	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
+		ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
+			    "!= b_blocknr (%llu)", group_blkno,
+			    (unsigned long long) bg_bh->b_blocknr);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      bg_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	memset(bg, 0, sb->s_blocksize);
+	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
+	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	bg->bg_chain = cpu_to_le16(my_chain);
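+	/* c_blkno and bg_next_group are both on-disk little-endian
+	 * values, so this copy needs no byte-swapping */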
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
+	bg->bg_blkno = cpu_to_le64(group_blkno);
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
+	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* There is no need to zero out or otherwise initialize the
+	 * other blocks in a group - All valid FS metadata in a block
+	 * group stores the superblock fs_generation value at
+	 * allocation time. */
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_count)) {
+		if (le32_to_cpu(cl->cl_recs[best].c_total) >
+		    le32_to_cpu(cl->cl_recs[curr].c_total))
+			best = curr;
+		curr++;
+	}
+	return best;
+}
+
+/*
+ * We expect the block group allocator to already be locked.
+ */
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh)
+{
+	int status, credits;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_alloc_context *ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+
+	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	cl = &fe->id2.i_chain;
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					le16_to_cpu(cl->cl_cpg),
+					&ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	credits = ocfs2_calc_group_alloc_credits(osb->sb,
+						 le16_to_cpu(cl->cl_cpg));
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_claim_clusters(osb,
+				      handle,
+				      ac,
+				      le16_to_cpu(cl->cl_cpg),
+				      &bit_off,
+				      &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
+	     alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
+
+	status = ocfs2_block_group_fill(handle,
+					alloc_inode,
+					bg_bh,
+					bg_blkno,
+					alloc_rec,
+					cl);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs2_journal_access(handle, alloc_inode,
+				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
+		     le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
+	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
+	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
+					le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
+	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
+	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
+					     le32_to_cpu(fe->i_clusters)));
+	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
+	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
+	alloc_inode->i_blocks =
+		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac)
+{
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct ocfs2_dinode *fe;
+	u32 free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_add_inode(handle, alloc_inode);
+	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
+		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
+			    "# %"MLFu64, le64_to_cpu(fe->i_blkno));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
+		le32_to_cpu(fe->id1.bitmap1.i_used);
+
+	if (bits_wanted > free_bits) {
+		/* cluster bitmap never grows */
+		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
+			     bits_wanted, free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		atomic_inc(&osb->alloc_stats.bg_extends);
+
+		/* You should never ask for this much metadata */
+		BUG_ON(bits_wanted >
+		       (le32_to_cpu(fe->id1.bitmap1.i_total)
+			- le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+
+	get_bh(bh);
+	ac->ac_bh = bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_META;
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  0);
+#else
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+#endif
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, (*ac));
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = 1;
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_INODE;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  INODE_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* The local alloc code has to do the same thing, so rather than
+ * duplicate it, we share this helper. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac)
+{
+	int status;
+
+	ac->ac_inode = ocfs2_get_system_file_inode(osb,
+						   GLOBAL_BITMAP_SYSTEM_INODE,
+						   OCFS2_INVALID_SLOT);
+	if (!ac->ac_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+	ac->ac_which = OCFS2_AC_USE_MAIN;
+	ac->ac_group_search = ocfs2_cluster_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, ac);
+	if (status < 0 && status != -ENOSPC)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/* Callers don't need to care which bitmap (local alloc or main) to
+ * use so we figure it out for them, but unfortunately this clutters
+ * things a bit. */
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	int status;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_handle = handle;
+
+	status = -ENOSPC;
+	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+		status = ocfs2_reserve_local_alloc_bits(osb,
+							handle,
+							bits_wanted,
+							*ac);
+		if ((status < 0) && (status != -ENOSPC)) {
+			mlog_errno(status);
+			goto bail;
+		} else if (status == -ENOSPC) {
+			/* ocfs2_reserve_local_alloc_bits() will return
+			 * -ENOSPC with the local alloc inode still
+			 * locked, so we can change this safely here. */
+			mlog(0, "Disabling local alloc\n");
+			/* We set to OCFS2_LA_DISABLED so that umount
+			 * can clean up what's left of the local
+			 * allocation */
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+	}
+
+	if (status == -ENOSPC) {
+		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * More or less lifted from ext3. I'll leave their description below:
+ *
+ * "For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes."
+ *
+ * Note: OCFS2 already does this differently for metadata vs data
+ * allocations, as those bitmaps are separate and undo access is never
+ * called on a metadata group descriptor.
+ */
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr)
+{
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
+		return 0;
+	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+		return 1;
+
+	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+}
+
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found)
+{
+	void *bitmap;
+	u16 best_offset, best_size;
+	int offset, start, found, status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
+		return -EIO;
+	}
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+
+	while((offset = ocfs2_find_next_zero_bit(bitmap,
+						 le16_to_cpu(bg->bg_bits),
+						 start)) != -1) {
+		if (offset == le16_to_cpu(bg->bg_bits))
+			break;
+
+		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
+			/* We found a zero, but we can't use it as it
+			 * hasn't been put to disk yet! */
+			found = 0;
+			start = offset + 1;
+		} else if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+		/* we got everything we needed */
+		if (found == bits_wanted) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	/* XXX: I think the first clause is equivalent to the second
+	 * 	- jlbec */
+	if (found == bits_wanted) {
+		*bit_off = start - found;
+		*bits_found = found;
+	} else if (best_size) {
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		/* No error log here -- see the comment above
+		 * ocfs2_test_bg_bit_allocatable */
+	}
+
+	return status;
+}
+
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits)
+{
+	int status;
+	void *bitmap = bg->bg_bitmap;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+	     num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+
+	while(num_bits--)
+		ocfs2_set_bit(bit_off++, bitmap);
+
+	status = ocfs2_journal_dirty(handle,
+				     group_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* find the one with the most empty bits */
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	BUG_ON(!cl->cl_next_free_rec);
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
+		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
+		    le32_to_cpu(cl->cl_recs[best].c_free))
+			best = curr;
+		curr++;
+	}
+
+	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
+	return best;
+}
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain)
+{
+	int status;
+	/* there is a really tiny chance the journal calls could fail,
+	 * but we wouldn't want inconsistent blocks in *any* case. */
+	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
+		status = -EIO;
+		goto out;
+	}
+
+	mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
+	     "top, prev = %"MLFu64"\n",
+	     fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
+
+	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
+	bg_ptr = le64_to_cpu(bg->bg_next_group);
+	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
+
+	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	prev_bg->bg_next_group = bg->bg_next_group;
+
+	status = ocfs2_journal_dirty(handle, prev_bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = 0;
+out_rollback:
+	if (status < 0) {
+		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
+		bg->bg_next_group = cpu_to_le64(bg_ptr);
+		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	}
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted)
+{
+	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
+}
+
+/* return 0 on success, -ENOSPC to keep searching and any other < 0
+ * value on error. */
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found)
+{
+	int search = -ENOSPC;
+	int ret;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	u16 tmp_off, tmp_found;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count) {
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							&tmp_off, &tmp_found);
+		if (ret)
+			return ret;
+
+		/* ocfs2_block_group_find_clear_bits() might
+		 * return success, but we still want to return
+		 * -ENOSPC unless it found the minimum number
+		 * of bits. */
+		if (min_bits <= tmp_found) {
+			*bit_off = tmp_off;
+			*bits_found = tmp_found;
+			search = 0; /* success */
+		}
+	}
+
+	return search;
+}
+
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found)
+{
+	int ret = -ENOSPC;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+
+	BUG_ON(min_bits != 1);
+	BUG_ON(ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count)
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							bit_off, bits_found);
+
+	return ret;
+}
+
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno)
+{
+	int status;
+	u16 chain, tmp_bits;
+	u32 tmp_used;
+	u64 next_group;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+	struct ocfs2_group_desc *bg;
+
+	chain = ac->ac_chain;
+	mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
+	     bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
+
+	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
+				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = -ENOSPC;
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	while ((status = ac->ac_group_search(alloc_inode, group_bh,
+					     bits_wanted, min_bits, bit_off,
+					     &tmp_bits)) == -ENOSPC) {
+		if (!bg->bg_next_group)
+			break;
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = le64_to_cpu(bg->bg_next_group);
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+					  next_group, &group_bh,
+					  OCFS2_BH_CACHED, alloc_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		bg = (struct ocfs2_group_desc *) group_bh->b_data;
+		if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+			OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+			status = -EIO;
+			goto bail;
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
+	     tmp_bits, bg->bg_blkno);
+
+	*num_bits = tmp_bits;
+
+	BUG_ON(*num_bits == 0);
+
+	/*
+	 * Keep track of the previous block descriptor read. When we
+	 * find a target, if we have read more than X descriptors and
+	 * the target is reasonably empty, relink it to the top of its
+	 * chain.
+	 *
+	 * We've read no extra blocks and only send one more to the
+	 * transaction, yet the next node to search has a much easier
+	 * time.
+	 *
+	 * Do this *after* figuring out how many bits we're taking out
+	 * of our target group.
+	 */
+	if (ac->ac_allow_chain_relink &&
+	    (prev_group_bh) &&
+	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+		status = ocfs2_relink_block_group(handle, alloc_inode,
+						  ac->ac_bh, group_bh,
+						  prev_group_bh, chain);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Ok, claim our bits now: set the info on dinode, chainlist
+	 * and then the group */
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      ac->ac_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+
+	status = ocfs2_journal_dirty(handle,
+				     ac->ac_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_set_bits(handle,
+					    alloc_inode,
+					    bg,
+					    group_bh,
+					    *bit_off,
+					    *num_bits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
+	     *num_bits, fe->i_blkno);
+
+	*bg_blkno = le64_to_cpu(bg->bg_blkno);
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* will give out up to bits_wanted contiguous bits. */
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno)
+{
+	int status;
+	u16 victim, i;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
+	BUG_ON(!ac->ac_bh);
+
+	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
+	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
+		ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u "
+			    "used bits but only %u total.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->id1.bitmap1.i_used),
+			    le32_to_cpu(fe->id1.bitmap1.i_total));
+		status = -EIO;
+		goto bail;
+	}
+
+	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+
+	victim = ocfs2_find_victim_chain(cl);
+	ac->ac_chain = victim;
+	ac->ac_allow_chain_relink = 1;
+
+	status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
+				    num_bits, bg_blkno);
+	if (!status)
+		goto bail;
+	if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Search of victim chain %u came up with nothing, "
+	     "trying all chains now.\n", victim);
+
+	/* If we didn't pick a good victim, then just default to
+	 * searching each chain in order. Don't allow chain relinking
+	 * because we only calculate enough journal credits for one
+	 * relink per alloc. */
+	ac->ac_allow_chain_relink = 0;
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
+		if (i == victim)
+			continue;
+		if (!cl->cl_recs[i].c_free)
+			continue;
+
+		ac->ac_chain = i;
+		status = ocfs2_search_chain(ac, bits_wanted, min_bits,
+					    bit_off, num_bits,
+					    bg_blkno);
+		if (!status)
+			break;
+		if (status < 0 && status != -ENOSPC) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 unsigned int *num_bits,
+			 u64 *blkno_start)
+{
+	int status;
+	u64 bg_blkno;
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
+	BUG_ON(ac->ac_handle != handle);
+
+	status = ocfs2_claim_suballoc_bits(osb,
+					   ac,
+					   bits_wanted,
+					   1,
+					   suballoc_bit_start,
+					   num_bits,
+					   &bg_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
+	ac->ac_bits_given += (*num_bits);
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno)
+{
+	int status;
+	unsigned int num_bits;
+	u64 bg_blkno;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+	BUG_ON(ac->ac_handle != handle);
+
+	status = ocfs2_claim_suballoc_bits(osb,
+					   ac,
+					   1,
+					   1,
+					   suballoc_bit,
+					   &num_bits,
+					   &bg_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	BUG_ON(num_bits != 1);
+
+	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+	ac->ac_bits_given++;
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* translate a group descriptor's blkno and its bitmap offset into a
+ * disk cluster offset. */
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 cluster = 0;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg_blkno != osb->first_cluster_group_blkno)
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
+	cluster += (u32) bg_bit_off;
+	return cluster;
+}
+
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 group_no;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	group_no = cluster / osb->bitmap_cpg;
+	if (!group_no)
+		return osb->first_cluster_group_blkno;
+	return ocfs2_clusters_to_blocks(inode->i_sb,
+					group_no * osb->bitmap_cpg);
+}
+
+/* given the block number of a cluster start, calculate which cluster
+ * group and descriptor bitmap offset that corresponds to. */
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	*bg_blkno = ocfs2_which_cluster_group(inode,
+					      data_cluster);
+
+	if (*bg_blkno == osb->first_cluster_group_blkno)
+		*bg_bit_off = (u16) data_cluster;
+	else
+		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
+							     data_blkno - *bg_blkno);
+}
+
+/*
+ * min_bits - the minimum contiguous chunk from this total allocation
+ * that we can handle. Set to what we asked for originally for a fully
+ * contiguous allocation; set to '1' to indicate we can deal with
+ * extents of any size.
+ */
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	int status;
+	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	u64 bg_blkno;
+	u16 bg_bit_off;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
+	       && ac->ac_which != OCFS2_AC_USE_MAIN);
+	BUG_ON(ac->ac_handle != handle);
+
+	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		status = ocfs2_claim_local_alloc_bits(osb,
+						      handle,
+						      ac,
+						      bits_wanted,
+						      cluster_start,
+						      num_clusters);
+		if (!status)
+			atomic_inc(&osb->alloc_stats.local_data);
+	} else {
+		if (min_clusters > (osb->bitmap_cpg - 1)) {
+			/* The only paths asking for contiguousness
+			 * should know about this already. */
+			mlog(ML_ERROR, "minimum allocation requested exceeds "
+				       "group bitmap size!");
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* clamp the current request down to a realistic size. */
+		if (bits_wanted > (osb->bitmap_cpg - 1))
+			bits_wanted = osb->bitmap_cpg - 1;
+
+		status = ocfs2_claim_suballoc_bits(osb,
+						   ac,
+						   bits_wanted,
+						   min_clusters,
+						   &bg_bit_off,
+						   num_clusters,
+						   &bg_blkno);
+		if (!status) {
+			*cluster_start =
+				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
+								 bg_blkno,
+								 bg_bit_off);
+			atomic_inc(&osb->alloc_stats.bitmap_data);
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ac->ac_bits_given += *num_clusters;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits)
+{
+	int status;
+	unsigned int tmp;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	struct ocfs2_group_desc *undo_bg = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
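+	/* for the cluster bitmap, freed bits are re-set in the
+	 * last-committed copy below, so they stay unallocatable until
+	 * this transaction commits; see the comment above
+	 * ocfs2_test_bg_bit_allocatable() */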
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+
+	tmp = num_bits;
+	while(tmp--) {
+		ocfs2_clear_bit((bit_off + tmp),
+				(unsigned long *) bg->bg_bitmap);
+		if (ocfs2_is_cluster_bitmap(alloc_inode))
+			ocfs2_set_bit(bit_off + tmp,
+				      (unsigned long *) undo_bg->bg_bitmap);
+	}
+	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+
+	status = ocfs2_journal_dirty(handle, group_bh);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/*
+ * expects the suballoc inode to already be locked.
+ */
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count)
+{
+	int status = 0;
+	u32 tmp_used;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
+
+	mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
+	     ", starting at %u\n",
+	     OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
+	     start_bit);
+
+	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
+				  alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+
+	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
+					      group, group_bh,
+					      start_bit, count);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
+		     count);
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
+
+	status = ocfs2_journal_dirty(handle, alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (group_bh)
+		brelse(group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+	u64 group = block - (u64) bit;
+
+	return group;
+}
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di)
+{
+	u64 blk = le64_to_cpu(di->i_blkno);
+	u16 bit = le16_to_cpu(di->i_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
+					inode_alloc_bh, bit, bg_blkno, 1);
+}
+
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb)
+{
+	u64 blk = le64_to_cpu(eb->h_blkno);
+	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
+					bit, bg_blkno, 1);
+}
+
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+		       struct inode *bitmap_inode,
+		       struct buffer_head *bitmap_bh,
+		       u64 start_blk,
+		       unsigned int num_clusters)
+{
+	int status;
+	u16 bg_start_bit;
+	u64 bg_blkno;
+	struct ocfs2_dinode *fe;
+
+	/* You can't ever have a contiguous set of clusters
+	 * bigger than a block group bitmap so we never have to worry
+	 * about looping on them. */
+
+	mlog_entry_void();
+
+	/* This check is expensive. We can safely remove it once this
+	 * code has been tested really well. */
+	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+			ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
+						 start_blk)));
+
+	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
+
+	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
+				     &bg_start_bit);
+
+	mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
+	     num_clusters, start_blk);
+	mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
+	     bg_blkno, bg_start_bit);
+
+	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					  bg_start_bit, bg_blkno,
+					  num_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");
+	printk("bg_signature:       %s\n", bg->bg_signature);
+	printk("bg_size:            %u\n", bg->bg_size);
+	printk("bg_bits:            %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+	printk("bg_chain:           %u\n", bg->bg_chain);
+	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
+	printk("bg_next_group:      %"MLFu64"\n", bg->bg_next_group);
+	printk("bg_parent_dinode:   %"MLFu64"\n", bg->bg_parent_dinode);
+	printk("bg_blkno:           %"MLFu64"\n", bg->bg_blkno);
+}
+
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
+{
+	int i;
+
+	printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
+	printk("i_signature:                  %s\n", fe->i_signature);
+	printk("i_size:                       %"MLFu64"\n", fe->i_size);
+	printk("i_clusters:                   %u\n", fe->i_clusters);
+	printk("i_generation:                 %u\n",
+	       le32_to_cpu(fe->i_generation));
+	printk("id1.bitmap1.i_used:           %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_used));
+	printk("id1.bitmap1.i_total:          %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_total));
+	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
+	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
+	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n",
+	       fe->id2.i_chain.cl_next_free_rec);
+	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_blkno);
+	}
+}

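The helpers near the bottom of suballoc.c reduce to simple arithmetic: a suballocated block's group descriptor sits exactly suballoc-bit blocks before it, and cluster groups repeat every bitmap_cpg clusters after a special first group. A standalone sketch with made-up geometry (the blocks-per-cluster ratio, bitmap_cpg, and the first group's block number are illustrative assumptions, not real superblock values):

#include <stdio.h>
#include <stdint.h>

/* Assumed geometry; the real values come from the superblock and
 * osb->bitmap_cpg. */
#define BLOCKS_PER_CLUSTER	4
#define BITMAP_CPG		32256
#define FIRST_GROUP_BLKNO	8

/* mirrors ocfs2_which_suballoc_group(): a suballocated block claimed
 * with suballoc bit 'bit' lives in the group whose descriptor block
 * is exactly 'bit' blocks before it. */
static uint64_t which_suballoc_group(uint64_t block, unsigned int bit)
{
	return block - (uint64_t)bit;
}

/* mirrors ocfs2_which_cluster_group(): cluster groups are bitmap_cpg
 * clusters apart, except group 0, which starts at a fixed block. */
static uint64_t which_cluster_group(uint32_t cluster)
{
	uint32_t group_no = cluster / BITMAP_CPG;

	if (!group_no)
		return FIRST_GROUP_BLKNO;
	return (uint64_t)group_no * BITMAP_CPG * BLOCKS_PER_CLUSTER;
}

int main(void)
{
	/* an inode at block 5000 claimed with suballoc bit 120 */
	printf("group descriptor at block %llu\n",
	       (unsigned long long)which_suballoc_group(5000, 120));
	/* cluster 40000 falls in the second cluster group */
	printf("cluster group at block %llu\n",
	       (unsigned long long)which_cluster_group(40000));
	return 0;
}

This is also why ocfs2_free_dinode() and ocfs2_free_extent_block() above need only the block number and suballoc bit to find the right group before clearing bits.
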
+ 132 - 0
fs/ocfs2/suballoc.h

@@ -0,0 +1,132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+typedef int (group_search_t)(struct inode *,		/* allocator inode */
+			     struct buffer_head *,	/* group descriptor bh */
+			     u32,			/* bits_wanted */
+			     u32,			/* min_bits */
+			     u16 *,			/* bit_off */
+			     u16 *);			/* bits_found */
+
+struct ocfs2_alloc_context {
+	struct inode *ac_inode;    /* which bitmap are we allocating from? */
+	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_bits_wanted;
+	u32    ac_bits_given;
+#define OCFS2_AC_USE_LOCAL 1
+#define OCFS2_AC_USE_MAIN  2
+#define OCFS2_AC_USE_INODE 3
+#define OCFS2_AC_USE_META  4
+	u32    ac_which;
+	struct ocfs2_journal_handle *ac_handle;
+
+	/* these are used by the chain search */
+	u16    ac_chain;
+	int    ac_allow_chain_relink;
+	group_search_t *ac_group_search;
+};
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
+static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
+{
+	return ac->ac_bits_wanted - ac->ac_bits_given;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac);
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 u32 *num_bits,
+			 u64 *blkno_start);
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno);
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters);
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di);
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb);
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters);
+
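Taken together, the prototypes above imply a two-phase discipline: a reserve
call picks a suitable allocator inode and guarantees that enough bits exist,
a claim call actually consumes bits inside the journaled transaction, and the
context is released afterwards. A minimal sketch of that flow, assuming an
osb and journal handle are already in hand; wanted and min_wanted are
illustrative names, and all error handling beyond the bail label is elided:

	struct ocfs2_alloc_context *ac = NULL;
	u32 cluster_start, num_clusters;
	int status;

	/* Phase 1: reserve -- pick an allocator, ensure capacity. */
	status = ocfs2_reserve_clusters(osb, handle, wanted, &ac);
	if (status < 0)
		goto bail;

	/* Phase 2: claim -- consume bits under the transaction. */
	status = ocfs2_claim_clusters(osb, handle, ac, min_wanted,
				      &cluster_start, &num_clusters);
bail:
	if (ac)
		ocfs2_free_alloc_context(ac);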
+static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
+					  u64 bg_blkno)
+{
+	/* This works for every block group descriptor: only the first
+	 * group descriptor of the cluster bitmap is placed
+	 * specially. */
+
+	if (bg_blkno == osb->first_cluster_group_blkno)
+		return 0;
+
+	/* the rest of the block groups are located at the beginning
+	 * of their 1st cluster, so a direct translation just
+	 * works. */
+	return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
+}
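A quick worked example of the direct translation above, with illustrative
geometry not taken from this patch: at 4K blocks and 32K clusters there are
8 blocks per cluster, so a non-first group descriptor sitting at block 2048
lives at cluster 2048 / 8 = 256, which is exactly what
ocfs2_blocks_to_clusters() returns; only the descriptor at
first_cluster_group_blkno needs the special case to cluster 0.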
+
+static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
+}
+
+/* This is for the local alloc ONLY. Other callers should use the
+ * task-specific APIs above. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac);
+
+#endif /* _CHAINALLOC_H_ */

Some files were not shown because too many files changed in this diff