
Merge http://oss.oracle.com/git/ocfs2

Linus Torvalds 19 years ago
parent
commit 29552b1462
100 changed files with 42277 additions and 19 deletions
  1. Documentation/filesystems/00-INDEX (+6 -0)
  2. Documentation/filesystems/configfs/configfs.txt (+434 -0)
  3. Documentation/filesystems/configfs/configfs_example.c (+474 -0)
  4. Documentation/filesystems/dlmfs.txt (+130 -0)
  5. Documentation/filesystems/ocfs2.txt (+55 -0)
  6. MAINTAINERS (+14 -0)
  7. drivers/block/loop.c (+18 -5)
  8. drivers/block/rd.c (+2 -2)
  9. fs/Kconfig (+55 -11)
  10. fs/Makefile (+2 -0)
  11. fs/configfs/Makefile (+7 -0)
  12. fs/configfs/configfs_internal.h (+142 -0)
  13. fs/configfs/dir.c (+1102 -0)
  14. fs/configfs/file.c (+360 -0)
  15. fs/configfs/inode.c (+162 -0)
  16. fs/configfs/item.c (+227 -0)
  17. fs/configfs/mount.c (+159 -0)
  18. fs/configfs/symlink.c (+281 -0)
  19. fs/mpage.c (+1 -1)
  20. fs/ocfs2/Makefile (+33 -0)
  21. fs/ocfs2/alloc.c (+2040 -0)
  22. fs/ocfs2/alloc.h (+82 -0)
  23. fs/ocfs2/aops.c (+643 -0)
  24. fs/ocfs2/aops.h (+41 -0)
  25. fs/ocfs2/buffer_head_io.c (+232 -0)
  26. fs/ocfs2/buffer_head_io.h (+73 -0)
  27. fs/ocfs2/cluster/Makefile (+4 -0)
  28. fs/ocfs2/cluster/endian.h (+30 -0)
  29. fs/ocfs2/cluster/heartbeat.c (+1797 -0)
  30. fs/ocfs2/cluster/heartbeat.h (+82 -0)
  31. fs/ocfs2/cluster/masklog.c (+166 -0)
  32. fs/ocfs2/cluster/masklog.h (+275 -0)
  33. fs/ocfs2/cluster/nodemanager.c (+791 -0)
  34. fs/ocfs2/cluster/nodemanager.h (+64 -0)
  35. fs/ocfs2/cluster/ocfs2_heartbeat.h (+37 -0)
  36. fs/ocfs2/cluster/ocfs2_nodemanager.h (+39 -0)
  37. fs/ocfs2/cluster/quorum.c (+315 -0)
  38. fs/ocfs2/cluster/quorum.h (+36 -0)
  39. fs/ocfs2/cluster/sys.c (+124 -0)
  40. fs/ocfs2/cluster/sys.h (+33 -0)
  41. fs/ocfs2/cluster/tcp.c (+1829 -0)
  42. fs/ocfs2/cluster/tcp.h (+113 -0)
  43. fs/ocfs2/cluster/tcp_internal.h (+174 -0)
  44. fs/ocfs2/cluster/ver.c (+42 -0)
  45. fs/ocfs2/cluster/ver.h (+31 -0)
  46. fs/ocfs2/dcache.c (+91 -0)
  47. fs/ocfs2/dcache.h (+31 -0)
  48. fs/ocfs2/dir.c (+618 -0)
  49. fs/ocfs2/dir.h (+54 -0)
  50. fs/ocfs2/dlm/Makefile (+8 -0)
  51. fs/ocfs2/dlm/dlmapi.h (+214 -0)
  52. fs/ocfs2/dlm/dlmast.c (+466 -0)
  53. fs/ocfs2/dlm/dlmcommon.h (+884 -0)
  54. fs/ocfs2/dlm/dlmconvert.c (+530 -0)
  55. fs/ocfs2/dlm/dlmconvert.h (+35 -0)
  56. fs/ocfs2/dlm/dlmdebug.c (+246 -0)
  57. fs/ocfs2/dlm/dlmdebug.h (+30 -0)
  58. fs/ocfs2/dlm/dlmdomain.c (+1469 -0)
  59. fs/ocfs2/dlm/dlmdomain.h (+36 -0)
  60. fs/ocfs2/dlm/dlmfs.c (+640 -0)
  61. fs/ocfs2/dlm/dlmfsver.c (+42 -0)
  62. fs/ocfs2/dlm/dlmfsver.h (+31 -0)
  63. fs/ocfs2/dlm/dlmlock.c (+676 -0)
  64. fs/ocfs2/dlm/dlmmaster.c (+2664 -0)
  65. fs/ocfs2/dlm/dlmrecovery.c (+2132 -0)
  66. fs/ocfs2/dlm/dlmthread.c (+692 -0)
  67. fs/ocfs2/dlm/dlmunlock.c (+672 -0)
  68. fs/ocfs2/dlm/dlmver.c (+42 -0)
  69. fs/ocfs2/dlm/dlmver.h (+31 -0)
  70. fs/ocfs2/dlm/userdlm.c (+658 -0)
  71. fs/ocfs2/dlm/userdlm.h (+111 -0)
  72. fs/ocfs2/dlmglue.c (+2904 -0)
  73. fs/ocfs2/dlmglue.h (+111 -0)
  74. fs/ocfs2/endian.h (+45 -0)
  75. fs/ocfs2/export.c (+248 -0)
  76. fs/ocfs2/export.h (+31 -0)
  77. fs/ocfs2/extent_map.c (+994 -0)
  78. fs/ocfs2/extent_map.h (+46 -0)
  79. fs/ocfs2/file.c (+1237 -0)
  80. fs/ocfs2/file.h (+57 -0)
  81. fs/ocfs2/heartbeat.c (+378 -0)
  82. fs/ocfs2/heartbeat.h (+67 -0)
  83. fs/ocfs2/inode.c (+1140 -0)
  84. fs/ocfs2/inode.h (+145 -0)
  85. fs/ocfs2/journal.c (+1652 -0)
  86. fs/ocfs2/journal.h (+457 -0)
  87. fs/ocfs2/localalloc.c (+983 -0)
  88. fs/ocfs2/localalloc.h (+56 -0)
  89. fs/ocfs2/mmap.c (+102 -0)
  90. fs/ocfs2/mmap.h (+6 -0)
  91. fs/ocfs2/namei.c (+2264 -0)
  92. fs/ocfs2/namei.h (+58 -0)
  93. fs/ocfs2/ocfs1_fs_compat.h (+109 -0)
  94. fs/ocfs2/ocfs2.h (+464 -0)
  95. fs/ocfs2/ocfs2_fs.h (+638 -0)
  96. fs/ocfs2/ocfs2_lockid.h (+73 -0)
  97. fs/ocfs2/slot_map.c (+303 -0)
  98. fs/ocfs2/slot_map.h (+66 -0)
  99. fs/ocfs2/suballoc.c (+1651 -0)
  100. fs/ocfs2/suballoc.h (+132 -0)

+ 6 - 0
Documentation/filesystems/00-INDEX

@@ -12,10 +12,14 @@ cifs.txt
 	- description of the CIFS filesystem
 coda.txt
 	- description of the CODA filesystem.
+configfs/
+	- directory containing configfs documentation and example code.
 cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
 	- directory containing devfs documentation.
+dlmfs.txt
+	- info on the userspace interface to the OCFS2 DLM.
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 hpfs.txt
@@ -30,6 +34,8 @@ ntfs.txt
 	- info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
 	- info on Linux's /proc filesystem.
+ocfs2.txt
+	- info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
 	- Description of the ROMFS filesystem.
 smbfs.txt

+ 434 - 0
Documentation/filesystems/configfs/configfs.txt

@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+	Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these tasks.
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, and the subsystem
+appears as a directory at the top of the configfs filesystem.  A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+	struct config_item {
+		char                    *ci_name;
+		char                    ci_namebuf[UOBJ_NAME_LEN];
+		struct kref             ci_kref;
+		struct list_head        ci_entry;
+		struct config_item      *ci_parent;
+		struct config_group     *ci_group;
+		struct config_item_type *ci_type;
+		struct dentry           *ci_dentry;
+	};
+
+	void config_item_init(struct config_item *);
+	void config_item_init_type_name(struct config_item *,
+					const char *name,
+					struct config_item_type *type);
+	struct config_item *config_item_get(struct config_item *);
+	void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
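+
+As a rough sketch (the some_item pointer and the do_something() helper
+here are hypothetical), a typical use looks like:
+
+	struct config_item *item;
+
+	item = config_item_get(some_item);	/* take a reference */
+	if (item) {
+		do_something(item);		/* use the item */
+		config_item_put(item);		/* drop the reference */
+	}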
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+	struct configfs_item_operations {
+		void (*release)(struct config_item *);
+		ssize_t (*show_attribute)(struct config_item *,
+					  struct configfs_attribute *,
+					  char *);
+		ssize_t (*store_attribute)(struct config_item *,
+					   struct configfs_attribute *,
+					   const char *, size_t);
+		int (*allow_link)(struct config_item *src,
+				  struct config_item *target);
+		int (*drop_link)(struct config_item *src,
+				 struct config_item *target);
+	};
+
+	struct config_item_type {
+		struct module                           *ct_owner;
+		struct configfs_item_operations         *ct_item_ops;
+		struct configfs_group_operations        *ct_group_ops;
+		struct configfs_attribute               **ct_attrs;
+	};
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
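+
+For instance, a dynamically allocated item embedded in a hypothetical
+container struct mything might release itself along these lines:
+
+	static void mything_release(struct config_item *item)
+	{
+		/* frees the container, and with it the config_item */
+		kfree(container_of(item, struct mything, item));
+	}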
+
+[struct configfs_attribute]
+
+	struct configfs_attribute {
+		char                    *ca_name;
+		struct module           *ca_owner;
+		mode_t                  ca_mode;
+	};
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
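+
+A sketch of a read-only attribute and the array that exposes it (all
+names here are hypothetical):
+
+	static struct configfs_attribute mything_attr_status = {
+		.ca_owner = THIS_MODULE,
+		.ca_name  = "status",
+		.ca_mode  = S_IRUGO,
+	};
+
+	static struct configfs_attribute *mything_attrs[] = {
+		&mything_attr_status,
+		NULL,
+	};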
+
+[struct config_group]
+
+A config_item cannot live in a vacuum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+	struct config_group {
+		struct config_item		cg_item;
+		struct list_head		cg_children;
+		struct configfs_subsystem 	*cg_subsys;
+		struct config_group		**default_groups;
+	};
+
+	void config_group_init(struct config_group *group);
+	void config_group_init_type_name(struct config_group *group,
+					 const char *name,
+					 struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+	struct configfs_group_operations {
+		struct config_item *(*make_item)(struct config_group *group,
+						 const char *name);
+		struct config_group *(*make_group)(struct config_group *group,
+						   const char *name);
+		int (*commit_item)(struct config_item *item);
+		void (*drop_item)(struct config_group *group,
+				  struct config_item *item);
+	};
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called
+from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
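+
+A minimal make_item() sketch, assuming a hypothetical container
+struct mything and its config_item_type mything_type:
+
+	static struct config_item *mything_make_item(struct config_group *group,
+						     const char *name)
+	{
+		struct mything *thing;
+
+		thing = kmalloc(sizeof(*thing), GFP_KERNEL);
+		if (!thing)
+			return NULL;
+		memset(thing, 0, sizeof(*thing));
+
+		/* the item now holds the initial reference */
+		config_item_init_type_name(&thing->item, name, &mything_type);
+		return &thing->item;
+	}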
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, there is no need for a separate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
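+
+The matching drop_item() for that hypothetical subsystem is just a
+reference drop; release() will free the container when the count
+reaches zero:
+
+	static void mything_drop_item(struct config_group *group,
+				      struct config_item *item)
+	{
+		config_item_put(item);	/* drop the allocation reference */
+	}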
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+	struct configfs_subsystem {
+		struct config_group	su_group;
+		struct semaphore	su_sem;
+	};
+
+	int configfs_register_subsystem(struct configfs_subsystem *subsys);
+	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+	A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+	When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
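+
+As a sketch, a statically defined subsystem (here the hypothetical
+mything_subsys, with ci_namebuf and ci_type already filled in) would
+be registered like this:
+
+	static int __init mything_init(void)
+	{
+		config_group_init(&mything_subsys.su_group);
+		init_MUTEX(&mything_subsys.su_sem);
+		return configfs_register_subsystem(&mything_subsys);
+	}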
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.   Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
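+
+A navigation sketch, where subsys and group are assumed to be the
+subsystem and one of its groups:
+
+	struct config_item *item;
+
+	down(&subsys->su_sem);
+	list_for_each_entry(item, &group->cg_children, ci_entry) {
+		/* examine item; never touch the filesystem parts */
+	}
+	up(&subsys->su_sem);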
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to target item, it returns 0.  A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
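+
+An allow_link() sketch that only accepts targets of its own type
+(mything_type is hypothetical):
+
+	static int mything_allow_link(struct config_item *src,
+				      struct config_item *target)
+	{
+		/* refuse to link to items of a foreign type */
+		if (target->ci_type != &mything_type)
+			return -EPERM;
+		return 0;
+	}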
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent) results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem that the parent group is going away,
+it also means that every default group child of that parent group is
+going away as well.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
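+
+A sketch of wiring one default group into a parent before the parent
+is registered or returned from make_group() (both groups here are
+hypothetical and initialized with the group _init() functions):
+
+	static struct config_group *mything_default_groups[] = {
+		&mything_state_group,
+		NULL,
+	};
+
+	mything_group.default_groups = mything_default_groups;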
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem receives the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shutdown, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+

+ 474 - 0
Documentation/filesystems/configfs/configfs_example.c

@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+	struct configfs_subsystem subsys;
+	int showme;
+	int storeme;
+};
+
+struct childless_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct childless *, char *);
+	ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+	return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	ssize_t pos;
+
+	pos = sprintf(page, "%d\n", childless->showme);
+	childless->showme++;
+
+	return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+	.show	= childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= childless_storeme_read,
+	.store	= childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+	.attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+	.show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+	&childless_attr_showme.attr,
+	&childless_attr_storeme.attr,
+	&childless_attr_description.attr,
+	NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+				   struct configfs_attribute *attr,
+				   char *page)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = 0;
+
+	if (childless_attr->show)
+		ret = childless_attr->show(childless, page);
+	return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    const char *page, size_t count)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (childless_attr->store)
+		ret = childless_attr->store(childless, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+	.show_attribute		= childless_attr_show,
+	.store_attribute	= childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+	.ct_item_ops	= &childless_item_ops,
+	.ct_attrs	= childless_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "01-childless",
+				.ci_type = &childless_type,
+			},
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+	struct config_item item;
+	int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "storeme",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+	&simple_child_attr_storeme,
+	NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	ssize_t count;
+	struct simple_child *simple_child = to_simple_child(item);
+
+	count = sprintf(page, "%d\n", simple_child->storeme);
+
+	return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+	kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+	.release		= simple_child_release,
+	.show_attribute		= simple_child_attr_show,
+	.store_attribute	= simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+	.ct_item_ops	= &simple_child_item_ops,
+	.ct_attrs	= simple_child_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+	struct simple_child *simple_child;
+
+	simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+	if (!simple_child)
+		return NULL;
+
+	memset(simple_child, 0, sizeof(struct simple_child));
+
+	config_item_init_type_name(&simple_child->item, name,
+				   &simple_child_type);
+
+	simple_child->storeme = 0;
+
+	return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+	&simple_children_attr_description,
+	NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+			   		 struct configfs_attribute *attr,
+			   		 char *page)
+{
+	return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+	.show_attribute	= simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+	.make_item	= simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+	.ct_item_ops	= &simple_children_item_ops,
+	.ct_group_ops	= &simple_children_group_ops,
+	.ct_attrs	= simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "02-simple-children",
+			.ci_type = &simple_children_type,
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+	struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+	struct simple_children *simple_children;
+
+	simple_children = kmalloc(sizeof(struct simple_children),
+				  GFP_KERNEL);
+	if (!simple_children)
+		return NULL;
+
+	memset(simple_children, 0, sizeof(struct simple_children));
+
+	config_group_init_type_name(&simple_children->group, name,
+				    &simple_children_type);
+
+	return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+	&group_children_attr_description,
+	NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+			   		struct configfs_attribute *attr,
+			   		char *page)
+{
+	return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+	.show_attribute	= group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+	.make_group	= group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+	.ct_item_ops	= &group_children_item_ops,
+	.ct_group_ops	= &group_children_group_ops,
+	.ct_attrs	= group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "03-group-children",
+			.ci_type = &group_children_type,
+		},
+	},
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+	&childless_subsys.subsys,
+	&simple_children_subsys,
+	&group_children_subsys,
+	NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+	int ret;
+	int i;
+	struct configfs_subsystem *subsys;
+
+	for (i = 0; example_subsys[i]; i++) {
+		subsys = example_subsys[i];
+
+		config_group_init(&subsys->su_group);
+		init_MUTEX(&subsys->su_sem);
+		ret = configfs_register_subsystem(subsys);
+		if (ret) {
+			printk(KERN_ERR "Error %d while registering subsystem %s\n",
+			       ret,
+			       subsys->su_group.cg_item.ci_namebuf);
+			goto out_unregister;
+		}
+	}
+
+	return 0;
+
+out_unregister:
+	/* unwind only the subsystems that registered successfully */
+	for (i--; i >= 0; i--) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+
+	return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+	int i;
+
+	for (i = 0; example_subsys[i]; i++) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");

+ 130 - 0
Documentation/filesystems/dlmfs.txt

@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy-to-set-up, easy-to-use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory).
+
+rmdir(2) signals dlmfs to leave the domain.
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory.  Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+Use open(2) with O_CREAT to ensure the resource inode is created -
+dlmfs does not automatically create inodes for existing lock resources.
+
+Open Flag     Lock Request Type
+---------     -----------------
+O_RDONLY      Shared Read
+O_RDWR        Exclusive
+
+Open Flag     Resulting Locking Behavior
+---------     --------------------------
+O_NONBLOCK    Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource, then open(2) will fail with errno set to
+ETXTBSY.
+
+close(2) drops the lock associated with your fd.
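+
+A trylock sketch in C (the /dlm mount point, domain, and lock names
+here are all assumed):
+
+	int fd = open("/dlm/mydomain/mylock",
+		      O_RDWR | O_CREAT | O_NONBLOCK, 0600);
+	if (fd < 0 && errno == ETXTBSY) {
+		/* valid trylock, but the resource was busy */
+	} else if (fd >= 0) {
+		/* Exclusive lock granted; the LVB is available via fd */
+		close(fd);	/* drops the lock */
+	}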
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes that obtain Shared
+Read or higher level locks on the resource.
+
+See Also
+========
+For more information on the VMS distributed locking API, see:
+
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf

+ 55 - 0
Documentation/filesystems/ocfs2.txt

@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+	- sparse files
+	- extended attributes
+	- shared writeable mmap
+	- loopback is supported, but data written will not
+	  be cluster coherent.
+	- quotas
+	- cluster aware flock
+	- Directory change notification (F_NOTIFY)
+	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	- POSIX ACLs
+	- readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1		This enables/disables barriers: barrier=0 disables
+			them, barrier=1 enables them.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
+errors=panic		Panic and halt the machine if an error occurs.
+intr		(*)	Allow signals to interrupt cluster operations.
+nointr			Do not allow signals to interrupt cluster
+			operations.

+ 14 - 0
MAINTAINERS

@@ -554,6 +554,11 @@ W:	http://us1.samba.org/samba/Linux_CIFS_client.html
 T:	git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:	Supported
 
+CONFIGFS
+P:	Joel Becker
+M:	joel.becker@oracle.com
+S:	Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:	Jeff Garzik
 M:	jgarzik@pobox.com
@@ -1898,6 +1903,15 @@ M:	ajoshi@shell.unixbox.com
 L:	linux-nvidia@lists.surfsouth.com
 S:	Maintained
 
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:	Mark Fasheh
+M:	mark.fasheh@oracle.com
+P:	Kurt Hackel
+M:	kurt.hackel@oracle.com
+L:	ocfs2-devel@oss.oracle.com
+W:	http://oss.oracle.com/projects/ocfs2/
+S:	Supported
+
 OLYMPIC NETWORK DRIVER
 P:	Peter De Shrijver
 M:	p2@ace.ulyssis.student.kuleuven.ac.be

+ 18 - 5
drivers/block/loop.c

@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
-	int len, ret = 0;
+	int len, ret;
 
 	down(&mapping->host->i_sem);
 	index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		page = grab_cache_page(mapping, index);
 		if (unlikely(!page))
 			goto fail;
-		if (unlikely(aops->prepare_write(file, page, offset,
-				offset + size)))
+		ret = aops->prepare_write(file, page, offset,
+					  offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		flush_dcache_page(page);
-		if (unlikely(aops->commit_write(file, page, offset,
-				offset + size)))
+		ret = aops->commit_write(file, page, offset,
+					 offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		if (unlikely(transfer_result))
 			goto unlock;
 		bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	ret = 0;
 out:
 	up(&mapping->host->i_sem);
 	return ret;

+ 2 - 2
drivers/block/rd.c

@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 
 /*
  * ->writepage to the the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it.  We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
  * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
 		make_page_uptodate(page);
 	SetPageDirty(page);
 	if (wbc->for_reclaim)
-		return WRITEPAGE_ACTIVATE;
+		return AOP_WRITEPAGE_ACTIVATE;
 	unlock_page(page);
 	return 0;
 }

+ 55 - 11
fs/Kconfig

@@ -70,6 +70,7 @@ config FS_XIP
 
 config EXT3_FS
 	tristate "Ext3 journalling file system support"
+	select JBD
 	help
 	  This is the journaling version of the Second extended file system
 	  (often called ext3), the de facto standard Linux file system
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
 	  extended attributes for file security labels, say N.
 
 config JBD
-# CONFIG_JBD could be its own option (even modular), but until there are
-# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
-# dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
 	tristate
-	default EXT3_FS
 	help
 	  This is a generic journaling layer for block devices.  It is
-	  currently used by the ext3 file system, but it could also be used to
-	  add journal support to other file systems or block devices such as
-	  RAID or LVM.
+	  currently used by the ext3 and OCFS2 file systems, but it could
+	  also be used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
 
-	  If you are using the ext3 file system, you need to say Y here. If
-	  you are not using ext3 then you will probably want to say N.
+	  If you are using the ext3 or OCFS2 file systems, you need to
+	  say Y here. If you are not using ext3 or OCFS2 then you will
+	  probably want to say N.
 
 	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 into the kernel, you cannot
-	  compile this code as a module.
+	  called jbd.  If you are compiling ext3 or OCFS2 into the kernel,
+	  you cannot compile this code as a module.
 
 config JBD_DEBUG
 	bool "JBD (ext3) debugging support"
@@ -326,6 +324,38 @@ config FS_POSIX_ACL
 
 source "fs/xfs/Kconfig"
 
+config OCFS2_FS
+	tristate "OCFS2 file system support (EXPERIMENTAL)"
+	depends on NET && EXPERIMENTAL
+	select CONFIGFS_FS
+	select JBD
+	select CRC32
+	select INET
+	help
+	  OCFS2 is a general purpose extent based shared disk cluster file
+	  system with many similarities to ext3. It supports 64 bit inode
+	  numbers, and has automatically extending metadata groups which may
+	  also make it attractive for non-clustered use.
+
+	  You'll want to install the ocfs2-tools package in order to at least
+	  get "mount.ocfs2".
+
+	  Project web page:    http://oss.oracle.com/projects/ocfs2
+	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+	  Note: Features which OCFS2 does not support yet:
+	          - extended attributes
+	          - shared writeable mmap
+	          - loopback is supported, but data written will not
+	            be cluster coherent.
+	          - quotas
+	          - cluster aware flock
+	          - Directory change notification (F_NOTIFY)
+	          - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	          - POSIX ACLs
+	          - readpages / writepages (not user visible)
+
 config MINIX_FS
 	tristate "Minix fs support"
 	help
@@ -841,6 +871,20 @@ config RELAYFS_FS
 
 	  If unsure, say N.
 
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"

+ 2 - 0
fs/Makefile

@@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
+obj-$(CONFIG_OCFS2_FS)		+= ocfs2/

+ 7 - 0
fs/configfs/Makefile

@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs.o
+
+configfs-objs	:= inode.o file.o dir.o symlink.o mount.o item.o

+ 142 - 0
fs/configfs/configfs_internal.h

@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+
+struct configfs_dirent {
+	atomic_t		s_count;
+	struct list_head	s_sibling;
+	struct list_head	s_children;
+	struct list_head	s_links;
+	void 			* s_element;
+	int			s_type;
+	umode_t			s_mode;
+	struct dentry		* s_dentry;
+};
+
+#define CONFIGFS_ROOT		0x0001
+#define CONFIGFS_DIR		0x0002
+#define CONFIGFS_ITEM_ATTR 	0x0004
+#define CONFIGFS_ITEM_LINK 	0x0020
+#define CONFIGFS_USET_DIR	0x0040
+#define CONFIGFS_USET_DEFAULT	0x0080
+#define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+
+extern struct vfsmount * configfs_mount;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(mode_t mode);
+extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+				struct dentry *, void *, umode_t, int);
+
+extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+
+extern int configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern struct super_block * configfs_sb;
+extern struct file_operations configfs_dir_operations;
+extern struct file_operations configfs_file_operations;
+extern struct file_operations bin_fops;
+extern struct inode_operations configfs_dir_inode_operations;
+extern struct inode_operations configfs_symlink_inode_operations;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+	struct list_head sl_list;
+	struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+				struct dentry *parent,
+				struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+	struct config_item * item = NULL;
+
+	spin_lock(&dcache_lock);
+	if (!d_unhashed(dentry)) {
+		struct configfs_dirent * sd = dentry->d_fsdata;
+		if (sd->s_type & CONFIGFS_ITEM_LINK) {
+			struct configfs_symlink * sl = sd->s_element;
+			item = config_item_get(sl->sl_target);
+		} else
+			item = config_item_get(sd->s_element);
+	}
+	spin_unlock(&dcache_lock);
+
+	return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+	if (!(sd->s_type & CONFIGFS_ROOT))
+		kfree(sd);
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+	if (sd) {
+		WARN_ON(!atomic_read(&sd->s_count));
+		atomic_inc(&sd->s_count);
+	}
+	return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+	WARN_ON(!atomic_read(&sd->s_count));
+	if (atomic_dec_and_test(&sd->s_count))
+		release_configfs_dirent(sd);
+}
+

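A note on the lifetime rules this header sets up: a configfs_dirent is born with s_count == 1, each dentry pointing at it via d_fsdata takes another reference, and the statically allocated CONFIGFS_ROOT entry is deliberately never freed. A minimal sketch of the intended pairing, assuming the caller already holds a live reference (configfs_get() WARNs if s_count has already hit zero); the helper name is hypothetical:

/* Illustration only: borrow and return a configfs_dirent reference. */
static void example_borrow_dirent(struct configfs_dirent *sd)
{
	struct configfs_dirent *ref;

	ref = configfs_get(sd);		/* s_count: n -> n + 1 */

	/* ... safely inspect ref->s_element and ref->s_type ... */

	configfs_put(ref);		/* s_count: n + 1 -> n; kfree()d
					 * at zero, except CONFIGFS_ROOT */
}
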
+ 1102 - 0
fs/configfs/dir.c

@@ -0,0 +1,1102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+
+static void configfs_d_iput(struct dentry * dentry,
+			    struct inode * inode)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+
+	if (sd) {
+		BUG_ON(sd->s_dentry != dentry);
+		sd->s_dentry = NULL;
+		configfs_put(sd);
+	}
+	iput(inode);
+}
+
+/*
+ * We _must_ delete our dentries on last dput, as the chain-to-parent
+ * behavior is required to clear the parents of default_groups.
+ */
+static int configfs_d_delete(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations configfs_dentry_ops = {
+	.d_iput		= configfs_d_iput,
+	/* simple_delete_dentry() isn't exported */
+	.d_delete	= configfs_d_delete,
+};
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+						void * element)
+{
+	struct configfs_dirent * sd;
+
+	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return NULL;
+
+	memset(sd, 0, sizeof(*sd));
+	atomic_set(&sd->s_count, 1);
+	INIT_LIST_HEAD(&sd->s_links);
+	INIT_LIST_HEAD(&sd->s_children);
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	sd->s_element = element;
+
+	return sd;
+}
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+			 struct dentry * dentry, void * element,
+			 umode_t mode, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = configfs_new_dirent(parent_sd, element);
+	if (!sd)
+		return -ENOMEM;
+
+	sd->s_mode = mode;
+	sd->s_type = type;
+	sd->s_dentry = dentry;
+	if (dentry) {
+		dentry->d_fsdata = configfs_get(sd);
+		dentry->d_op = &configfs_dentry_ops;
+	}
+
+	return 0;
+}
+
+static int init_dir(struct inode * inode)
+{
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inode->i_nlink++;
+	return 0;
+}
+
+static int init_file(struct inode * inode)
+{
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &configfs_file_operations;
+	return 0;
+}
+
+static int init_symlink(struct inode * inode)
+{
+	inode->i_op = &configfs_symlink_inode_operations;
+	return 0;
+}
+
+static int create_dir(struct config_item * k, struct dentry * p,
+		      struct dentry * d)
+{
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+
+	error = configfs_create(d, mode, init_dir);
+	if (!error) {
+		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+					   CONFIGFS_DIR);
+		if (!error) {
+			p->d_inode->i_nlink++;
+			(d)->d_op = &configfs_dentry_ops;
+		}
+	}
+	return error;
+}
+
+
+/**
+ *	configfs_create_dir - create a directory for a config_item.
+ *	@item:		config_item we're creating the directory for.
+ *	@dentry:	config_item's dentry.
+ */
+
+static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+{
+	struct dentry * parent;
+	int error = 0;
+
+	BUG_ON(!item);
+
+	if (item->ci_parent)
+		parent = item->ci_parent->ci_dentry;
+	else if (configfs_mount && configfs_mount->mnt_sb)
+		parent = configfs_mount->mnt_sb->s_root;
+	else
+		return -EFAULT;
+
+	error = create_dir(item,parent,dentry);
+	if (!error)
+		item->ci_dentry = dentry;
+	return error;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+			 struct dentry *parent,
+			 struct dentry *dentry)
+{
+	int err = 0;
+	umode_t mode = S_IFLNK | S_IRWXUGO;
+
+	err = configfs_create(dentry, mode, init_symlink);
+	if (!err) {
+		err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
+					 mode, CONFIGFS_ITEM_LINK);
+		if (!err)
+			dentry->d_op = &configfs_dentry_ops;
+	}
+	return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+	struct dentry * parent = dget(d->d_parent);
+	struct configfs_dirent * sd;
+
+	sd = d->d_fsdata;
+ 	list_del_init(&sd->s_sibling);
+	configfs_put(sd);
+	if (d->d_inode)
+		simple_rmdir(parent->d_inode,d);
+
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+		 atomic_read(&d->d_count));
+
+	dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove a config_item's directory.
+ * @item:	config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling it separately.
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+
+	if (!dentry)
+		return;
+
+	remove_dir(dentry);
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+
+/* Attach the attribute's configfs_dirent to the dentry corresponding
+ * to the attribute file.
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+	struct configfs_attribute * attr = sd->s_element;
+	int error;
+
+	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+	if (error)
+		return error;
+
+	dentry->d_op = &configfs_dentry_ops;
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
+	d_rehash(dentry);
+
+	return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       struct nameidata *nd)
+{
+	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+	struct configfs_dirent * sd;
+	int found = 0;
+	int err = 0;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED) {
+			const unsigned char * name = configfs_get_name(sd);
+
+			if (strcmp(name, dentry->d_name.name))
+				continue;
+
+			found = 1;
+			err = configfs_attach_attr(sd, dentry);
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 * If it doesn't exist and it isn't a NOT_PINNED item,
+		 * it must be negative.
+		 */
+		return simple_lookup(dir, dentry, nd);
+	}
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * on all children that are candidates for default detach.  If the
+ * result is clean, then configfs_detach_group() will handle dropping
+ * i_sem.  If there is an error, the caller will clean up the i_sem
+ * holders via configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+	int ret;
+
+	ret = -EBUSY;
+	if (!list_empty(&parent_sd->s_links))
+		goto out;
+
+	ret = 0;
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED)
+			continue;
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			down(&sd->s_dentry->d_inode->i_sem);
+			/* Mark that we've taken i_sem */
+			sd->s_type |= CONFIGFS_USET_DROPPING;
+
+			ret = configfs_detach_prep(sd->s_dentry);
+			if (!ret)
+			       	continue;
+		} else
+			ret = -ENOTEMPTY;
+
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			configfs_detach_rollback(sd->s_dentry);
+
+			if (sd->s_type & CONFIGFS_USET_DROPPING) {
+				sd->s_type &= ~CONFIGFS_USET_DROPPING;
+				up(&sd->s_dentry->d_inode->i_sem);
+			}
+		}
+	}
+}
+
+static void detach_attrs(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+	struct configfs_dirent * parent_sd;
+	struct configfs_dirent * sd, * tmp;
+
+	if (!dentry)
+		return;
+
+	pr_debug("configfs %s: dropping attrs for dir\n",
+		 dentry->d_name.name);
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		list_del_init(&sd->s_sibling);
+		configfs_drop_dentry(sd, dentry);
+		configfs_put(sd);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct configfs_attribute *attr;
+	int error = 0;
+	int i;
+
+	if (!t)
+		return -EINVAL;
+	if (t->ct_attrs) {
+		for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+			if ((error = configfs_create_file(item, attr)))
+				break;
+		}
+	}
+
+	if (error)
+		detach_attrs(item);
+
+	return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+	struct dentry * dentry = dget(group->cg_item.ci_dentry);
+	struct dentry *child;
+	struct configfs_dirent *parent_sd;
+	struct configfs_dirent *sd, *tmp;
+
+	if (!dentry)
+		return;
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    !(sd->s_type & CONFIGFS_USET_DEFAULT))
+			continue;
+
+		child = sd->s_dentry;
+
+		configfs_detach_group(sd->s_element);
+		child->d_inode->i_flags |= S_DEAD;
+
+		/*
+		 * From rmdir/unregister, a configfs_detach_prep() pass
+		 * has taken our i_sem for us.  Drop it.
+		 * From mkdir/register cleanup, there is no sem held.
+		 */
+		if (sd->s_type & CONFIGFS_USET_DROPPING)
+			up(&child->d_inode->i_sem);
+
+		d_delete(child);
+		dput(child);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attaches it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+				struct config_group *group)
+{
+	int ret;
+	struct qstr name;
+	struct configfs_dirent *sd;
+	/* We trust the caller holds a reference to parent */
+	struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	ret = -ENOMEM;
+	child = d_alloc(parent, &name);
+	if (child) {
+		d_add(child, NULL);
+
+		ret = configfs_attach_group(&parent_group->cg_item,
+					    &group->cg_item, child);
+		if (!ret) {
+			sd = child->d_fsdata;
+			sd->s_type |= CONFIGFS_USET_DEFAULT;
+		} else {
+			d_delete(child);
+			dput(child);
+		}
+	}
+
+	return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+	struct config_group *new_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	int ret = 0;
+	int i;
+
+	if (group && group->default_groups) {
+		/* FYI, we're faking mkdir here
+		 * I'm not sure we need this semaphore, as we're called
+		 * from our parent's mkdir.  That holds our parent's
+		 * i_sem, so afaik lookup cannot continue through our
+		 * parent to find us, let alone mess with our tree.
+		 * That said, taking our i_sem is closer to mkdir
+		 * emulation, and shouldn't hurt. */
+		down(&dentry->d_inode->i_sem);
+
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+
+			ret = create_default_group(group, new_group);
+			if (ret)
+				break;
+		}
+
+		up(&dentry->d_inode->i_sem);
+	}
+
+	if (ret)
+		detach_groups(group);
+
+	return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_sem is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+	struct config_group *group;
+
+	group = item->ci_group;
+	if (group) {
+		list_del_init(&item->ci_entry);
+
+		item->ci_group = NULL;
+		item->ci_parent = NULL;
+		config_item_put(item);
+
+		config_group_put(group);
+	}
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+	/* Parent seems redundant with group, but it makes certain
+	 * traversals much nicer. */
+	item->ci_parent = parent_item;
+	item->ci_group = config_group_get(to_config_group(parent_item));
+	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+
+	config_item_get(item);
+}
+
+static void unlink_group(struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			unlink_group(new_group);
+		}
+	}
+
+	group->cg_subsys = NULL;
+	unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+	struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+
+	link_obj(&parent_group->cg_item, &group->cg_item);
+
+	if (parent_group->cg_subsys)
+		subsys = parent_group->cg_subsys;
+	else if (configfs_is_root(&parent_group->cg_item))
+		subsys = to_configfs_subsystem(group);
+	else
+		BUG();
+	group->cg_subsys = subsys;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			link_group(group, new_group);
+		}
+	}
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+				struct config_item *item,
+				struct dentry *dentry)
+{
+	int ret;
+
+	ret = configfs_create_dir(item, dentry);
+	if (!ret) {
+		ret = populate_attrs(item);
+		if (ret) {
+			configfs_remove_dir(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_item(struct config_item *item)
+{
+	detach_attrs(item);
+	configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry)
+{
+	int ret;
+	struct configfs_dirent *sd;
+
+	ret = configfs_attach_item(parent_item, item, dentry);
+	if (!ret) {
+		sd = dentry->d_fsdata;
+		sd->s_type |= CONFIGFS_USET_DIR;
+
+		ret = populate_groups(to_config_group(item));
+		if (ret) {
+			configfs_detach_item(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_group(struct config_item *item)
+{
+	detach_groups(to_config_group(item));
+	configfs_detach_item(item);
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group()
+ * This function assumes that reference is held on item
+ * and that item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+			     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->drop_item)
+		type->ct_group_ops->drop_item(to_config_group(parent_item),
+						item);
+	else
+		config_item_put(item);
+}
+
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int ret;
+	struct config_group *group;
+	struct config_item *item;
+	struct config_item *parent_item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct config_item_type *type;
+	struct module *owner;
+	char *name;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_parent->d_fsdata;
+	if (!(sd->s_type & CONFIGFS_USET_DIR))
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!type || !type->ct_group_ops ||
+	    (!type->ct_group_ops->make_group &&
+	     !type->ct_group_ops->make_item)) {
+		config_item_put(parent_item);
+		return -EPERM;  /* What lack-of-mkdir returns */
+	}
+
+	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+	down(&subsys->su_sem);
+	group = NULL;
+	item = NULL;
+	if (type->ct_group_ops->make_group) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (group) {
+			link_group(to_config_group(parent_item), group);
+			item = &group->cg_item;
+		}
+	} else {
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (item)
+			link_obj(parent_item, item);
+	}
+	up(&subsys->su_sem);
+
+	kfree(name);
+	if (!item) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+
+	ret = -EINVAL;
+	type = item->ci_type;
+	if (type) {
+		owner = type->ct_owner;
+		if (try_module_get(owner)) {
+			if (group) {
+				ret = configfs_attach_group(parent_item,
+							    item,
+							    dentry);
+			} else {
+				ret = configfs_attach_item(parent_item,
+							   item,
+							   dentry);
+			}
+
+			if (ret) {
+				down(&subsys->su_sem);
+				if (group)
+					unlink_group(group);
+				else
+					unlink_obj(item);
+				client_drop_item(parent_item, item);
+				up(&subsys->su_sem);
+
+				config_item_put(parent_item);
+				module_put(owner);
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct config_item *parent_item;
+	struct config_item *item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct module *owner = NULL;
+	int ret;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_fsdata;
+	if (sd->s_type & CONFIGFS_USET_DEFAULT)
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!parent_item->ci_type) {
+		config_item_put(parent_item);
+		return -EINVAL;
+	}
+
+	ret = configfs_detach_prep(dentry);
+	if (ret) {
+		configfs_detach_rollback(dentry);
+		config_item_put(parent_item);
+		return ret;
+	}
+
+	item = configfs_get_config_item(dentry);
+
+	/* Drop reference from above, item already holds one. */
+	config_item_put(parent_item);
+
+	if (item->ci_type)
+		owner = item->ci_type->ct_owner;
+
+	if (sd->s_type & CONFIGFS_USET_DIR) {
+		configfs_detach_group(item);
+
+		down(&subsys->su_sem);
+		unlink_group(to_config_group(item));
+	} else {
+		configfs_detach_item(item);
+
+		down(&subsys->su_sem);
+		unlink_obj(item);
+	}
+
+	client_drop_item(parent_item, item);
+	up(&subsys->su_sem);
+
+	/* Drop our reference from above */
+	config_item_put(item);
+
+	module_put(owner);
+
+	return 0;
+}
+
+struct inode_operations configfs_dir_inode_operations = {
+	.mkdir		= configfs_mkdir,
+	.rmdir		= configfs_rmdir,
+	.symlink	= configfs_symlink,
+	.unlink		= configfs_unlink,
+	.lookup		= configfs_lookup,
+};
+
+#if 0
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+	int error = 0;
+	struct dentry * new_dentry, * parent;
+
+	if (!strcmp(config_item_name(item), new_name))
+		return -EINVAL;
+
+	if (!item->parent)
+		return -EINVAL;
+
+	down_write(&configfs_rename_sem);
+	parent = item->parent->dentry;
+
+	down(&parent->d_inode->i_sem);
+
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+	if (!IS_ERR(new_dentry)) {
+  		if (!new_dentry->d_inode) {
+			error = config_item_set_name(item, "%s", new_name);
+			if (!error) {
+				d_add(new_dentry, NULL);
+				d_move(item->dentry, new_dentry);
+			}
+			else
+				d_delete(new_dentry);
+		} else
+			error = -EEXIST;
+		dput(new_dentry);
+	}
+	up(&parent->d_inode->i_sem);
+	up_write(&configfs_rename_sem);
+
+	return error;
+}
+#endif
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+
+	down(&dentry->d_inode->i_sem);
+	file->private_data = configfs_new_dirent(parent_sd, NULL);
+	up(&dentry->d_inode->i_sem);
+
+	return file->private_data ? 0 : -ENOMEM;
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * cursor = file->private_data;
+
+	down(&dentry->d_inode->i_sem);
+	list_del_init(&cursor->s_sibling);
+	up(&dentry->d_inode->i_sem);
+
+	release_configfs_dirent(cursor);
+
+	return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+	return (sd->s_mode >> 12) & 15;
+}
+
+static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *cursor = filp->private_data;
+	struct list_head *p, *q = &cursor->s_sibling;
+	ino_t ino;
+	int i = filp->f_pos;
+
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			if (filp->f_pos == 2) {
+				list_del(q);
+				list_add(q, &parent_sd->s_children);
+			}
+			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
+				struct configfs_dirent *next;
+				const char * name;
+				int len;
+
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (!next->s_element)
+					continue;
+
+				name = configfs_get_name(next);
+				len = strlen(name);
+				if (next->s_dentry)
+					ino = next->s_dentry->d_inode->i_ino;
+				else
+					ino = iunique(configfs_sb, 2);
+
+				if (filldir(dirent, name, len, filp->f_pos, ino,
+						 dt_type(next)) < 0)
+					return 0;
+
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+	}
+	return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+{
+	struct dentry * dentry = file->f_dentry;
+
+	down(&dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			struct configfs_dirent *cursor = file->private_data;
+			struct list_head *p;
+			loff_t n = file->f_pos - 2;
+
+			list_del(&cursor->s_sibling);
+			p = sd->s_children.next;
+			while (n && p != &sd->s_children) {
+				struct configfs_dirent *next;
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (next->s_element)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->s_sibling, p);
+		}
+	}
+	up(&dentry->d_inode->i_sem);
+	return offset;
+}
+
+struct file_operations configfs_dir_operations = {
+	.open		= configfs_dir_open,
+	.release	= configfs_dir_close,
+	.llseek		= configfs_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+	int err;
+	struct config_group *group = &subsys->su_group;
+	struct qstr name;
+	struct dentry *dentry;
+	struct configfs_dirent *sd;
+
+	err = configfs_pin_fs();
+	if (err)
+		return err;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	sd = configfs_sb->s_root->d_fsdata;
+	link_group(to_config_group(sd->s_element), group);
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	err = -ENOMEM;
+	dentry = d_alloc(configfs_sb->s_root, &name);
+	if (!dentry)
+		goto out_release;
+
+	d_add(dentry, NULL);
+
+	err = configfs_attach_group(sd->s_element, &group->cg_item,
+				    dentry);
+	if (!err)
+		dentry = NULL;
+	else
+		d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	if (dentry) {
+	    dput(dentry);
+out_release:
+	    unlink_group(group);
+	    configfs_release_fs();
+	}
+
+	return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+
+	if (dentry->d_parent != configfs_sb->s_root) {
+		printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+		return;
+	}
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+	down(&dentry->d_inode->i_sem);
+	if (configfs_detach_prep(dentry)) {
+		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+	}
+	configfs_detach_group(&group->cg_item);
+	dentry->d_inode->i_flags |= S_DEAD;
+	up(&dentry->d_inode->i_sem);
+
+	d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	dput(dentry);
+
+	unlink_group(group);
+	configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);

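For orientation, the client side of the mkdir path above is worth sketching: configfs_mkdir() looks up ct_group_ops->make_item on the parent's type, links the result, and attaches it to the dcache, while configfs_register_subsystem() plants the top-level group. A hedged, minimal sketch under the API in this commit — all example_* identifiers are hypothetical, and the configfs_example.c shipped with this change is the authoritative version:

#include <linux/configfs.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <asm/semaphore.h>

static void example_item_release(struct config_item *item)
{
	kfree(item);			/* matches kmalloc() in make_item */
}

static struct configfs_item_operations example_item_ops = {
	.release	= example_item_release,
};

static struct config_item_type example_item_type = {
	.ct_item_ops	= &example_item_ops,
	.ct_owner	= THIS_MODULE,
};

/* Called from configfs_mkdir() with subsys->su_sem held. */
static struct config_item *example_make_item(struct config_group *group,
					     const char *name)
{
	struct config_item *item;

	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item)
		return NULL;
	memset(item, 0, sizeof(*item));

	config_item_init_type_name(item, name, &example_item_type);
	return item;
}

static struct configfs_group_operations example_group_ops = {
	.make_item	= example_make_item,
};

static struct config_item_type example_group_type = {
	.ct_group_ops	= &example_group_ops,
	.ct_owner	= THIS_MODULE,
};

static struct configfs_subsystem example_subsys = {
	.su_group = {
		.cg_item = {
			.ci_namebuf	= "example",
			.ci_type	= &example_group_type,
		},
	},
};

static int __init example_init(void)
{
	config_group_init(&example_subsys.su_group);
	init_MUTEX(&example_subsys.su_sem);
	return configfs_register_subsystem(&example_subsys);
}

After registration, `mkdir /config/example/foo` reaches example_make_item(), and rmdir(2) unwinds through configfs_rmdir() to the release() above.
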
+ 360 - 0
fs/configfs/file.c

@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+
+struct configfs_buffer {
+	size_t			count;
+	loff_t			pos;
+	char			* page;
+	struct configfs_item_operations	* ops;
+	struct semaphore	sem;
+	int			needs_read_fill;
+};
+
+
+/**
+ *	fill_read_buffer - allocate and fill buffer from item.
+ *	@dentry:	dentry pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Allocate @buffer->page, if it hasn't been already, then call the
+ *	config_item's show() method to fill the buffer with this attribute's
+ *	data.
+ *	This is called only once, on the file's first read.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+	int ret = 0;
+	ssize_t count;
+
+	if (!buffer->page)
+		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	count = ops->show_attribute(item,attr,buffer->page);
+	buffer->needs_read_fill = 0;
+	BUG_ON(count > (ssize_t)PAGE_SIZE);
+	if (count >= 0)
+		buffer->count = count;
+	else
+		ret = count;
+	return ret;
+}
+
+
+/**
+ *	flush_read_buffer - push buffer to userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	user-passed buffer.
+ *	@count:		number of bytes requested.
+ *	@ppos:		file position.
+ *
+ *	Copy the buffer we filled in fill_read_buffer() to userspace.
+ *	This is done at the reader's leisure, copying and advancing
+ *	the amount they specify each time.
+ *	This may be called continuously until the buffer is empty.
+ */
+static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
+			     size_t count, loff_t * ppos)
+{
+	int error;
+
+	if (*ppos > buffer->count)
+		return 0;
+
+	if (count > (buffer->count - *ppos))
+		count = buffer->count - *ppos;
+
+	error = copy_to_user(buf,buffer->page + *ppos,count);
+	if (!error)
+		*ppos += count;
+	return error ? -EFAULT : count;
+}
+
+/**
+ *	configfs_read_file - read an attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read an attribute file. The attribute descriptor
+ *	is in the file's ->d_fsdata. The target item is in the directory's
+ *	->d_fsdata.
+ *
+ *	We call fill_read_buffer() to allocate and fill the buffer from the
+ *	item's show() method exactly once (if the read is happening from
+ *	the beginning of the file). That should fill the entire buffer with
+ *	all the data the item has to offer for that attribute.
+ *	We then call flush_read_buffer() to copy the buffer to userspace
+ *	in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t retval = 0;
+
+	down(&buffer->sem);
+	if (buffer->needs_read_fill) {
+		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
+			goto out;
+	}
+	pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
+		 __FUNCTION__,count,*ppos,buffer->page);
+	retval = flush_read_buffer(buffer,buf,count,ppos);
+out:
+	up(&buffer->sem);
+	return retval;
+}
+
+
+/**
+ *	fill_write_buffer - copy buffer from userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	data from user.
+ *	@count:		number of bytes in @userbuf.
+ *
+ *	Allocate @buffer->page if it hasn't been already, then
+ *	copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+	int error;
+
+	if (!buffer->page)
+		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+	error = copy_from_user(buffer->page,buf,count);
+	buffer->needs_read_fill = 1;
+	return error ? -EFAULT : count;
+}
+
+
+/**
+ *	flush_write_buffer - push buffer to config_item.
+ *	@file:		file pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Get the correct pointers for the config_item and the attribute we're
+ *	dealing with, then call the store() method for the attribute,
+ *	passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+
+	return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *	configfs_write_file - write an attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Similar to configfs_read_file(), though working in the opposite direction.
+ *	We allocate and fill the data from the user in fill_write_buffer(),
+ *	then push it to the config_item in flush_write_buffer().
+ *	There is no easy way for us to know if userspace is only doing a partial
+ *	write, so we don't support them. We expect the entire buffer to come
+ *	on the first write.
+ *	Hint: if you're writing a value, first read the file, modify only the
+ *	value you're changing, then write the entire buffer back.
+ */
+
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+
+	down(&buffer->sem);
+	count = fill_write_buffer(buffer,buf,count);
+	if (count > 0)
+		count = flush_write_buffer(file->f_dentry,buffer,count);
+	if (count > 0)
+		*ppos += count;
+	up(&buffer->sem);
+	return count;
+}
+
+static int check_perm(struct inode * inode, struct file * file)
+{
+	struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(file->f_dentry);
+	struct configfs_buffer * buffer;
+	struct configfs_item_operations * ops = NULL;
+	int error = 0;
+
+	if (!item || !attr)
+		goto Einval;
+
+	/* Grab the module reference for this attribute if we have one */
+	if (!try_module_get(attr->ca_owner)) {
+		error = -ENODEV;
+		goto Done;
+	}
+
+	if (item->ci_type)
+		ops = item->ci_type->ct_item_ops;
+	else
+		goto Eaccess;
+
+	/* File needs write support.
+	 * The inode's perms must say it's ok,
+	 * and we must have a store method.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+
+		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+			goto Eaccess;
+
+	}
+
+	/* File needs read support.
+	 * The inode's perms must say it's ok, and there
+	 * must be a show method for it.
+	 */
+	if (file->f_mode & FMODE_READ) {
+		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+			goto Eaccess;
+	}
+
+	/* No error? Great, allocate a buffer for the file, and store it
+	 * in file->private_data for easy access.
+	 */
+	buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (buffer) {
+		memset(buffer,0,sizeof(struct configfs_buffer));
+		init_MUTEX(&buffer->sem);
+		buffer->needs_read_fill = 1;
+		buffer->ops = ops;
+		file->private_data = buffer;
+	} else
+		error = -ENOMEM;
+	goto Done;
+
+ Einval:
+	error = -EINVAL;
+	goto Done;
+ Eaccess:
+	error = -EACCES;
+	module_put(attr->ca_owner);
+ Done:
+	if (error && item)
+		config_item_put(item);
+	return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+	return check_perm(inode,filp);
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+	struct config_item * item = to_item(filp->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(filp->f_dentry);
+	struct module * owner = attr->ca_owner;
+	struct configfs_buffer * buffer = filp->private_data;
+
+	if (item)
+		config_item_put(item);
+	/* After this point, attr should not be accessed. */
+	module_put(owner);
+
+	if (buffer) {
+		if (buffer->page)
+			free_page((unsigned long)buffer->page);
+		kfree(buffer);
+	}
+	return 0;
+}
+
+struct file_operations configfs_file_operations = {
+	.read		= configfs_read_file,
+	.write		= configfs_write_file,
+	.llseek		= generic_file_llseek,
+	.open		= configfs_open_file,
+	.release	= configfs_release,
+};
+
+
+int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
+{
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
+
+	down(&dir->d_inode->i_sem);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
+	up(&dir->d_inode->i_sem);
+
+	return error;
+}
+
+
+/**
+ *	configfs_create_file - create an attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@attr:	attribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+	BUG_ON(!item || !item->ci_dentry || !attr);
+
+	return configfs_add_file(item->ci_dentry, attr,
+				 CONFIGFS_ITEM_ATTR);
+}
+

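A hedged sketch of what the show/store dispatch above expects from a client: the owning item's type provides show_attribute()/store_attribute(), and each configfs_attribute contributes one file whose ca_mode must agree with those methods (check_perm() enforces both). All example_* names are illustrative:

#include <linux/configfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stat.h>

static ssize_t example_show_attribute(struct config_item *item,
				      struct configfs_attribute *attr,
				      char *page)
{
	/* At most one page; fill_read_buffer() BUGs on more. */
	return sprintf(page, "%d\n", 42);
}

static ssize_t example_store_attribute(struct config_item *item,
				       struct configfs_attribute *attr,
				       const char *page, size_t count)
{
	/* The whole value arrives in one write(2); parse 'page' here. */
	return count;
}

static struct configfs_attribute example_attr_value = {
	.ca_owner	= THIS_MODULE,
	.ca_name	= "value",
	.ca_mode	= S_IRUGO | S_IWUSR,
};

static struct configfs_attribute *example_attrs[] = {
	&example_attr_value,
	NULL,			/* populate_attrs() stops at NULL */
};

static struct configfs_item_operations example_attr_item_ops = {
	.show_attribute		= example_show_attribute,
	.store_attribute	= example_store_attribute,
};

static struct config_item_type example_attr_type = {
	.ct_item_ops	= &example_attr_item_ops,
	.ct_attrs	= example_attrs,
	.ct_owner	= THIS_MODULE,
};

From userspace, the read-modify-write hint in configfs_write_file() then means: read the whole attribute, edit the value, lseek back to offset zero, and write the entire buffer in a single write(2).
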
+ 162 - 0
fs/configfs/inode.c

@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs/configfs.txt for more information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+extern struct super_block * configfs_sb;
+
+static struct address_space_operations configfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+static struct backing_dev_info configfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+struct inode * configfs_new_inode(mode_t mode)
+{
+	struct inode * inode = new_inode(configfs_sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = 0;
+		inode->i_gid = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->a_ops = &configfs_aops;
+		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+	}
+	return inode;
+}
+
+int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+{
+	int error = 0;
+	struct inode * inode = NULL;
+	if (dentry) {
+		if (!dentry->d_inode) {
+			if ((inode = configfs_new_inode(mode))) {
+				if (dentry->d_parent && dentry->d_parent->d_inode) {
+					struct inode *p_inode = dentry->d_parent->d_inode;
+					p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+				}
+				goto Proceed;
+			}
+			else
+				error = -ENOMEM;
+		} else
+			error = -EEXIST;
+	} else
+		error = -ENOENT;
+	goto Done;
+
+ Proceed:
+	if (init)
+		error = init(inode);
+	if (!error) {
+		d_instantiate(dentry, inode);
+		if (S_ISDIR(mode) || S_ISLNK(mode))
+			dget(dentry);  /* pin link and directory dentries in core */
+	} else
+		iput(inode);
+ Done:
+	return error;
+}
+
+/*
+ * Get the name of the element represented by the given configfs_dirent.
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+	struct attribute * attr;
+
+	if (!sd || !sd->s_element)
+		BUG();
+
+	/* These always have a dentry, so use that */
+	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+		return sd->s_dentry->d_name.name;
+
+	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+		attr = sd->s_element;
+		return attr->name;
+	}
+	return NULL;
+}
+
+
+/*
+ * Unhashes the dentry corresponding to the given configfs_dirent.
+ * Called with the parent inode's i_sem held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+	struct dentry * dentry = sd->s_dentry;
+
+	if (dentry) {
+		spin_lock(&dcache_lock);
+		if (!(d_unhashed(dentry) && dentry->d_inode)) {
+			dget_locked(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dcache_lock);
+			simple_unlink(parent->d_inode, dentry);
+		} else
+			spin_unlock(&dcache_lock);
+	}
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+	struct configfs_dirent * sd;
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+
+	down(&dir->d_inode->i_sem);
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element)
+			continue;
+		if (!strcmp(configfs_get_name(sd), name)) {
+			list_del_init(&sd->s_sibling);
+			configfs_drop_dentry(sd, dir);
+			configfs_put(sd);
+			break;
+		}
+	}
+	up(&dir->d_inode->i_sem);
+}
+
+

+ 227 - 0
fs/configfs/item.c

@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on kobject:
+ * 	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item * to_item(struct list_head * entry)
+{
+	return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Forward declaration for the kref release callback defined below. */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ */
+void config_item_init(struct config_item * item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@name:	name.
+ *
+ *	If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *	dynamically allocated string that @item->ci_name points to.
+ *	Otherwise, use the static @item->ci_namebuf array.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char * name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args,fmt);
+	need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit,GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args,fmt);
+		need = vsnprintf(name,limit,fmt,args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
+
+void config_item_init_type_name(struct config_item *item,
+				const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, name);
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, name);
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item * config_item_get(struct config_item * item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+
+/**
+ *	config_item_cleanup - free config_item resources.
+ *	@item:	item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+	struct config_item_type * t = item->ci_type;
+	struct config_group * s = item->ci_group;
+	struct config_item * parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@k:	group
+ */
+
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *	config_group_find_obj - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Lock group via @group->cg_subsys, and iterate over @group->cg_children,
+ *	looking for a matching config_item. If a matching item is found,
+ *	take a reference and return the item.
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+	struct list_head * entry;
+	struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+	list_for_each(entry,&group->cg_children) {
+		struct config_item * item = to_item(entry);
+		if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+

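One concrete consequence of config_item_set_name() above, as a hypothetical helper: short names stay in the embedded ci_namebuf, anything of CONFIGFS_ITEM_NAME_LEN or longer is kmalloc'd, and config_item_cleanup() kfree()s the dynamic copy on release:

/* Illustration only; 'item' must already be initialized. */
static int example_rename_item(struct config_item *item, int id)
{
	/* "item-7" fits ci_namebuf; a long enough formatted result
	 * takes the kmalloc() path inside config_item_set_name(). */
	return config_item_set_name(item, "item-%d", id);
}
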
+ 159 - 0
fs/configfs/mount.c

@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	configfs_sb = sb;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	if (inode) {
+		inode->i_op = &configfs_dir_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		pr_debug("configfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+		iput(inode);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.get_sb		= configfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+int configfs_pin_fs(void)
+{
+	return simple_pin_fs("configfs", &configfs_mount,
+			     &configfs_mnt_count);
+}
+
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static decl_subsys(config, NULL, NULL);
+
+static int __init configfs_init(void)
+{
+	int err;
+
+	kset_set_kset_s(&config_subsys, kernel_subsys);
+	err = subsystem_register(&config_subsys);
+	if (err)
+		return err;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+		subsystem_unregister(&config_subsys);
+	}
+
+	return err;
+}
+
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user-driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);

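The pin/release pair above is what keeps configfs_sb valid: the first configfs_pin_fs() mounts the single superblock internally, and the last configfs_release_fs() drops it. A hedged sketch of the pairing (hypothetical caller; configfs_register_subsystem() does exactly this around its dcache work):

static int example_pinned_use(void)
{
	int err = configfs_pin_fs();	/* first pin mounts internally */

	if (err)
		return err;

	/* configfs_sb and configfs_mount are safe to dereference here. */

	configfs_release_fs();		/* last release drops the mount */
	return 0;
}
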
+ 281 - 0
fs/configfs/symlink.c

@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this item name with '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+static int create_link(struct config_item *parent_item,
+ 		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		/* FIXME: needs a lock, I'd bet */
+		list_add(&sl->sl_list, &target_sd->s_links);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			list_del_init(&sl->sl_list);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+	return ret;
+}
+
+
+static int get_target(const char *symname, struct nameidata *nd,
+		      struct config_item **target)
+{
+	int ret;
+
+	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	if (!ret) {
+		if (nd->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(nd->dentry);
+			if (!*target) {
+				ret = -ENOENT;
+				path_release(nd);
+			}
+		} else
+			ret = -EPERM;
+	}
+
+	return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct nameidata nd;
+	struct config_item *parent_item;
+	struct config_item *target_item;
+	struct config_item_type *type;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (dentry->d_parent == configfs_sb->s_root)
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &nd, &target_item);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret)
+		ret = create_link(parent_item, target_item, dentry);
+
+	config_item_put(target_item);
+	path_release(&nd);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		BUG();
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	list_del_init(&sd->s_sibling);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	/* FIXME: Needs lock */
+	list_del_init(&sl->sl_list);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
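+/*
+ * Build the relative path from 'item' (the directory holding the
+ * symlink) to 'target': one "../" per level of item's depth, followed
+ * by the target's path.  For example, a link in a/b pointing at x/y
+ * comes out as "../../x/y".
+ */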
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+	return 0;
+}
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
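+/*
+ * follow_link() renders the target path into a freshly allocated
+ * zeroed page and returns that page as the cookie; put_link() frees
+ * it again.  On failure the page is freed here and the link is set
+ * to an ERR_PTR.
+ */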
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	if (page) {
+		error = configfs_getlink(dentry, (char *)page);
+		if (!error) {
+			nd_set_link(nd, (char *)page);
+			return (void *)page;
+		}
+		free_page(page);
+	}
+
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+};
+

+ 1 - 1
fs/mpage.c

@@ -721,7 +721,7 @@ retry:
 						&last_block_in_bio, &ret, wbc,
 						page->mapping->a_ops->writepage);
 			}
-			if (unlikely(ret == WRITEPAGE_ACTIVATE))
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
 				unlock_page(page);
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;

+ 33 - 0
fs/ocfs2/Makefile

@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	mmap.o 			\
+	namei.o 		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o		\
+	ver.o 			\
+	vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/

+ 2040 - 0
fs/ocfs2/alloc.c

@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       unsigned int new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno)
+{
+	return blkno == (le64_to_cpu(ext->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb,
+						  le32_to_cpu(ext->e_clusters)));
+}
+
+/*
+ * How many free extents have we got before we need more meta data?
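+ * That is, the number of unused records in the rightmost extent list
+ * (the dinode's own list while the tree has no extent blocks).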
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe)
+{
+	int retval;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		retval = -EIO;
+		goto bail;
+	}
+
+	if (fe->i_last_eb_blk) {
+		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &eb_bh, OCFS2_BH_CACHED, inode);
+		if (retval < 0) {
+			mlog_errno(retval);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	BUG_ON(el->l_tree_depth != 0);
+
+	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	mlog_exit(retval);
+	return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_extent_block *eb;
+
+	mlog_entry_void();
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs2_claim_metadata(osb,
+					      handle,
+					      meta_ac,
+					      wanted - count,
+					      &suballoc_bit_start,
+					      &num_got,
+					      &first_blkno);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = count;  i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+			status = ocfs2_journal_access(handle, inode, bhs[i],
+						      OCFS2_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = cpu_to_le64(first_blkno);
+			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+			/* we always use slot zero's suballocator */
+			eb->h_suballoc_slot = 0;
+#else
+			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+			eb->h_list.l_count =
+				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			status = ocfs2_journal_dirty(handle, bhs[i]);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			if (bhs[i])
+				brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+	}
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update its next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
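+/*
+ * A sketch for a depth-2 tree: two new extent blocks are created --
+ * new_eb_bhs[0], the new bottommost leaf, and new_eb_bhs[1], the
+ * interior block above it.  The interior block's first record points
+ * at the new leaf, and the interior block itself is linked into the
+ * next free record of 'el'.
+ */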
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int status, new_blocks, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	BUG_ON(!last_eb_bh);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (eb_bh) {
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	/* we never add a branch to a leaf. */
+	BUG_ON(!el->l_tree_depth);
+
+	new_blocks = le16_to_cpu(el->l_tree_depth);
+
+	/* allocate the number of new eb blocks we need */
+	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+			     GFP_KERNEL);
+	if (!new_eb_bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+					   meta_ac, new_eb_bhs);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+ * conversely, new_eb_bhs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		eb_el = &eb->h_list;
+
+		status = ocfs2_journal_access(handle, inode, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = cpu_to_le16(i);
+		eb_el->l_next_free_rec = cpu_to_le16(1);
+		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		next_blkno = le64_to_cpu(eb->h_blkno);
+	}
+
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in). */
+	i = le16_to_cpu(el->l_next_free_rec);
+	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+	el->l_recs[i].e_cpos = fe->i_clusters;
+	el->l_recs[i].e_clusters = 0;
+	le16_add_cpu(&el->l_next_free_rec, 1);
+
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	if (status < 0)
+		mlog_errno(status);
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (eb_bh) {
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bhs[i])
+				brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
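+/*
+ * For example, shifting a depth-0 dinode: its extent list is copied
+ * into one newly allocated extent block, the dinode's list is reset
+ * to a single record covering all i_clusters and pointing at that
+ * block, and l_tree_depth becomes 1 (the new block is then also the
+ * last_eb).
+ */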
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	struct buffer_head *new_eb_bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *eb_el;
+
+	mlog_entry_void();
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+					   &new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		status = -EIO;
+		goto bail;
+	}
+
+	eb_el = &eb->h_list;
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	fe_el = &fe->id2.i_list;
+
+	status = ocfs2_journal_access(handle, inode, new_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* copy the fe data into the new extent block */
+	eb_el->l_tree_depth = fe_el->l_tree_depth;
+	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+	}
+
+	status = ocfs2_journal_dirty(handle, new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* update fe now */
+	le16_add_cpu(&fe_el->l_tree_depth, 1);
+	fe_el->l_recs[0].e_cpos = 0;
+	fe_el->l_recs[0].e_blkno = eb->h_blkno;
+	fe_el->l_recs[0].e_clusters = fe->i_clusters;
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		fe_el->l_recs[i].e_cpos = 0;
+		fe_el->l_recs[i].e_clusters = 0;
+		fe_el->l_recs[i].e_blkno = 0;
+	}
+	fe_el->l_next_free_rec = cpu_to_le16(1);
+
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (fe_el->l_tree_depth == cpu_to_le16(1))
+		fe->i_last_eb_blk = eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	if (new_eb_bh)
+		brelse(new_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent.  Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters)
+{
+	int status, i, num_bhs = 0;
+	u64 next_blkno;
+	u16 next_free;
+	struct buffer_head **eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+	if (el->l_tree_depth) {
+		/* This is another operation where we want to be
+		 * careful about our tree updates. An error here means
+		 * none of the previous changes we made should roll
+		 * forward. As a result, we have to record the buffers
+		 * for this part of the tree in an array and reserve a
+		 * journal write to them before making any changes. */
+		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+				 GFP_KERNEL);
+		if (!eb_bhs) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		i = 0;
+		while(el->l_tree_depth) {
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			if (next_free == 0) {
+				ocfs2_error(inode->i_sb,
+					    "Dinode %"MLFu64" has a bad "
+					    "extent list",
+					    OCFS2_I(inode)->ip_blkno);
+				status = -EIO;
+				goto bail;
+			}
+			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+			BUG_ON(i >= num_bhs);
+			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+						  OCFS2_BH_CACHED, inode);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				status = -EIO;
+				goto bail;
+			}
+
+			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			el = &eb->h_list;
+			i++;
+			/* When we leave this loop, eb_bhs[num_bhs - 1] will
+			 * hold the bottom-most leaf extent block. */
+		}
+		BUG_ON(el->l_tree_depth);
+
+		el = &fe->id2.i_list;
+		/* If we have tree depth, then the fe update is
+		 * trivial, and we want to switch el out for the
+		 * bottom-most leaf in order to update it with the
+		 * actual extent data below. */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" has a bad "
+				    "extent list",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+			     new_clusters);
+		/* (num_bhs - 1) to avoid the leaf */
+		for(i = 0; i < (num_bhs - 1); i++) {
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			el = &eb->h_list;
+
+			/* finally, make our actual change to the
+			 * intermediate extent blocks. */
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+				     new_clusters);
+
+			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+			if (status < 0)
+				mlog_errno(status);
+		}
+		BUG_ON(i != (num_bhs - 1));
+		/* note that the leaf block wasn't touched in
+		 * the loop above */
+		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+		el = &eb->h_list;
+		BUG_ON(el->l_tree_depth);
+	}
+
+	/* yay, we can finally add the actual extent now! */
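+	/*
+	 * Three cases: (1) the new range is contiguous with the last
+	 * record, so just extend that record's cluster count; (2) the
+	 * last record is an empty one at EOF, so take it over; (3)
+	 * otherwise, append a fresh record.
+	 */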
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) &&
+	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+	} else if (le16_to_cpu(el->l_next_free_rec) &&
+		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+		/* having an empty extent at eof is legal. */
+		if (el->l_recs[i].e_cpos != fe->i_clusters) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" trailing extent is bad: "
+				    "cpos (%u) != number of clusters (%u)",
+				    OCFS2_I(inode)->ip_blkno,
+				    le32_to_cpu(el->l_recs[i].e_cpos),
+				    le32_to_cpu(fe->i_clusters));
+			status = -EIO;
+			goto bail;
+		}
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+	} else {
+		/* No contiguous record, or no empty record at eof, so
+		 * we add a new one. */
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+		       le16_to_cpu(el->l_count));
+		i = le16_to_cpu(el->l_next_free_rec);
+
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+		el->l_recs[i].e_cpos = fe->i_clusters;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+	}
+
+	/*
+	 * extent_map errors are not fatal, so they are ignored outside
+	 * of flushing the thing.
+	 */
+	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+					 new_clusters);
+	if (status) {
+		mlog_errno(status);
+		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+	}
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (fe->id2.i_list.l_tree_depth) {
+		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (eb_bhs) {
+		for (i = 0; i < num_bhs; i++)
+			if (eb_bhs[i])
+				brelse(eb_bhs[i]);
+		kfree(eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *target_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *target_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh)
+{
+	int status = 0, i;
+	u64 blkno;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
+
+	mlog_entry_void();
+
+	*target_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	while(le16_to_cpu(el->l_tree_depth) > 1) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+				    "extent list (next_free_rec == 0)",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (!blkno) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+				    "list where extent # %d has no physical "
+				    "block start",
+				    OCFS2_I(inode)->ip_blkno, i);
+			status = -EIO;
+			goto bail;
+		}
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &eb->h_list;
+
+		if (le16_to_cpu(el->l_next_free_rec) <
+		    le16_to_cpu(el->l_count)) {
+			if (lowest_bh)
+				brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
+		}
+	}
+
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_bh
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		status = 1;
+
+	*target_bh = lowest_bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 start_blk,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status, i, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	mlog(0, "add %u clusters starting at block %"MLFu64" to "
+		"inode %"MLFu64"\n",
+	     new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	if (el->l_tree_depth) {
+		/* jump to end of tree */
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/* Can we allocate without adding/shifting tree bits? */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) == 0
+	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+		goto out_add;
+
+	mlog(0, "ocfs2_insert_extent: couldn't do a simple add, traversing "
+	     "tree now.\n");
+
+	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+	if (shift < 0) {
+		status = shift;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		/* if we hit a leaf, we'd better be empty :) */
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+		       le16_to_cpu(el->l_count));
+		BUG_ON(bh);
+		mlog(0, "ocfs2_insert_extent: need to shift tree depth "
+		     "(current = %u)\n",
+		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+						meta_ac, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		/* Special case: we have room now if we shifted from
+		 * tree_depth 0 */
+		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+			goto out_add;
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	mlog(0, "ocfs2_insert_extent: add branch. bh = %p\n", bh);
+	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+				  meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+out_add:
+	/* Finally, we can add clusters. */
+	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+					start_blk, new_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+
+	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
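+/*
+ * A new range can be merged into the tail record when it starts
+ * exactly where that record ends.  For example, if the last record
+ * covers clusters [100, 150) (t_start 100, t_clusters 50), a range
+ * starting at cluster 150 will coalesce with it.
+ */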
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int tail_index;
+	unsigned int current_tail;
+
+	/* No records, nothing to coalesce */
+	if (!le16_to_cpu(tl->tl_used))
+		return 0;
+
+	tail_index = le16_to_cpu(tl->tl_used) - 1;
+	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+	return current_tail == new_start;
+}
+
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     u64 start_blk,
+				     unsigned int num_clusters)
+{
+	int status, index;
+	unsigned int start_cluster, tl_count;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+		   num_clusters);
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	tl_count = le16_to_cpu(tl->tl_count);
+	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+			tl_count == 0,
+			"Truncate record count on #%"MLFu64" invalid ("
+			"wanted %u, actual %u)\n", OCFS2_I(tl_inode)->ip_blkno,
+			ocfs2_truncate_recs_per_inode(osb->sb),
+			le16_to_cpu(tl->tl_count));
+
+	/* Caller should have known to flush before calling us. */
+	index = le16_to_cpu(tl->tl_used);
+	if (index >= tl_count) {
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+	     "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
+	     OCFS2_I(tl_inode)->ip_blkno, index);
+
+	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+		/*
+		 * Move index back to the record we are coalescing with.
+		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+		 */
+		index--;
+
+		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+		     index, le32_to_cpu(tl->tl_recs[index].t_start),
+		     num_clusters);
+	} else {
+		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+		tl->tl_used = cpu_to_le16(index + 1);
+	}
+	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+	status = ocfs2_journal_dirty(handle, tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+					 struct ocfs2_journal_handle *handle,
+					 struct inode *data_alloc_inode,
+					 struct buffer_head *data_alloc_bh)
+{
+	int status = 0;
+	int i;
+	unsigned int num_clusters;
+	u64 start_blk;
+	struct ocfs2_truncate_rec rec;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+	mlog_entry_void();
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	i = le16_to_cpu(tl->tl_used) - 1;
+	while (i >= 0) {
+		/* Caller has given us at least enough credits to
+		 * update the truncate log dinode */
+		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		tl->tl_used = cpu_to_le16(i);
+
+		status = ocfs2_journal_dirty(handle, tl_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* TODO: Perhaps we can calculate the bulk of the
+		 * credits up front rather than extending like
+		 * this. */
+		status = ocfs2_extend_trans(handle,
+					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		rec = tl->tl_recs[i];
+		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+						    le32_to_cpu(rec.t_start));
+		num_clusters = le32_to_cpu(rec.t_clusters);
+
+		/* if start_blk is not set, we ignore the record as
+		 * invalid. */
+		if (start_blk) {
+			mlog(0, "free record %d, start = %u, clusters = %u\n",
+			     i, le32_to_cpu(rec.t_start), num_clusters);
+
+			status = ocfs2_free_clusters(handle, data_alloc_inode,
+						     data_alloc_bh, start_blk,
+						     num_clusters);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		i--;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Expects you to already be holding tl_inode->i_sem */
+static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	unsigned int num_to_flush;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *data_alloc_inode = NULL;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct buffer_head *data_alloc_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	num_to_flush = le16_to_cpu(tl->tl_used);
+	mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
+	     num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
+	if (!num_to_flush) {
+		status = 0;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	data_alloc_inode = ocfs2_get_system_file_inode(osb,
+						       GLOBAL_BITMAP_SYSTEM_INODE,
+						       OCFS2_INVALID_SLOT);
+	if (!data_alloc_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, data_alloc_inode);
+	status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+					       data_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_alloc_inode)
+		iput(data_alloc_inode);
+
+	if (data_alloc_bh)
+		brelse(data_alloc_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	down(&tl_inode->i_sem);
+	status = __ocfs2_flush_truncate_log(osb);
+	up(&tl_inode->i_sem);
+
+	return status;
+}
+
+static void ocfs2_truncate_log_worker(void *data)
+{
+	int status;
+	struct ocfs2_super *osb = data;
+
+	mlog_entry_void();
+
+	status = ocfs2_flush_truncate_log(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel)
+{
+	if (osb->osb_tl_inode) {
+		/* We want to push off log flushes while truncates are
+		 * still running. */
+		if (cancel)
+			cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+	}
+}
+
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+				       int slot_num,
+				       struct inode **tl_inode,
+				       struct buffer_head **tl_bh)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					   TRUNCATE_LOG_SYSTEM_INODE,
+					   slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not load truncate log inode!\n");
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		iput(inode);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*tl_inode = inode;
+	*tl_bh    = bh;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* called during the 1st stage of node recovery. we stamp a clean
+ * truncate log and pass back a copy for processing later. if the
+ * truncate log does not require processing, *tl_copy is set to
+ * NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	*tl_copy = NULL;
+
+	mlog(0, "recover truncate log from slot %d\n", slot_num);
+
+	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	if (le16_to_cpu(tl->tl_used)) {
+		mlog(0, "We'll have %u logs to recover\n",
+		     le16_to_cpu(tl->tl_used));
+
+		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+		if (!(*tl_copy)) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Assuming the write-out below goes well, this copy
+		 * will be passed back to recovery for processing. */
+		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
+
+		/* All we need to do to clear the truncate log is set
+		 * tl_used. */
+		tl->tl_used = 0;
+
+		status = ocfs2_write_block(osb, tl_bh, tl_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+bail:
+	if (tl_inode)
+		iput(tl_inode);
+	if (tl_bh)
+		brelse(tl_bh);
+
+	if (status < 0 && (*tl_copy)) {
+		kfree(*tl_copy);
+		*tl_copy = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy)
+{
+	int status = 0;
+	int i;
+	unsigned int clusters, num_recs, start_cluster;
+	u64 start_blk;
+	struct ocfs2_journal_handle *handle;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
+		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
+		return -EINVAL;
+	}
+
+	tl = &tl_copy->id2.i_dealloc;
+	num_recs = le16_to_cpu(tl->tl_used);
+	mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
+	     tl_copy->i_blkno);
+
+	down(&tl_inode->i_sem);
+	for(i = 0; i < num_recs; i++) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			status = __ocfs2_flush_truncate_log(osb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail_up;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, NULL,
+					   OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_up;
+		}
+
+		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
+		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
+		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
+
+		status = ocfs2_truncate_log_append(osb, handle,
+						   start_blk, clusters);
+		ocfs2_commit_trans(handle);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_up;
+		}
+	}
+
+bail_up:
+	up(&tl_inode->i_sem);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	if (tl_inode) {
+		cancel_delayed_work(&osb->osb_truncate_log_wq);
+		flush_workqueue(ocfs2_wq);
+
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		brelse(osb->osb_tl_bh);
+		iput(osb->osb_tl_inode);
+	}
+
+	mlog_exit_void();
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_get_truncate_log_info(osb,
+					     osb->slot_num,
+					     &tl_inode,
+					     &tl_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* ocfs2_truncate_log_shutdown keys on the existence of
+	 * osb->osb_tl_inode so we don't set any of the osb variables
+	 * until we're sure all is well. */
+	INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
+	osb->osb_tl_bh    = tl_bh;
+	osb->osb_tl_inode = tl_inode;
+
+	mlog_exit(status);
+	return status;
+}
+
+/* This function will figure out whether the currently last extent
+ * block will be deleted, and if it will, what the new last extent
+ * block will be so we can update its h_next_leaf_blk field, as well
+ * as the dinode's i_last_eb_blk. */
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       u32 new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb)
+{
+	int i, status = 0;
+	u64 block = 0;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *bh = NULL;
+
+	*new_last_eb = NULL;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	/* we have no tree, so of course, no last_eb. */
+	if (!fe->id2.i_list.l_tree_depth)
+		goto bail;
+
+	/* trunc to zero special case - this makes tree_depth = 0
+	 * regardless of what it is.  */
+	if (!new_i_clusters)
+		goto bail;
+
+	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
+	el = &(eb->h_list);
+	BUG_ON(!el->l_next_free_rec);
+
+	/* Make sure that this guy will actually be empty after we
+	 * clear away the data. */
+	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
+		goto bail;
+
+	/* Ok, at this point, we know that last_eb will definitely
+	 * change, so lets traverse the tree and find the second to
+	 * last extent block. */
+	el = &(fe->id2.i_list);
+	/* go down the tree, */
+	do {
+		for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
+			if (le32_to_cpu(el->l_recs[i].e_cpos) <
+			    new_i_clusters) {
+				block = le64_to_cpu(el->l_recs[i].e_blkno);
+				break;
+			}
+		}
+		BUG_ON(i < 0);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
+					 inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+	} while (el->l_tree_depth);
+
+	*new_last_eb = bh;
+	get_bh(*new_last_eb);
+	mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
+bail:
+	if (bh)
+		brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_do_truncate(struct ocfs2_super *osb,
+			     unsigned int clusters_to_del,
+			     struct inode *inode,
+			     struct buffer_head *fe_bh,
+			     struct buffer_head *old_last_eb_bh,
+			     struct ocfs2_journal_handle *handle,
+			     struct ocfs2_truncate_context *tc)
+{
+	int status, i, depth;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *last_eb = NULL;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *eb_bh = NULL;
+	struct buffer_head *last_eb_bh = NULL;
+	u64 next_eb = 0;
+	u64 delete_blk = 0;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	status = ocfs2_find_new_last_ext_blk(osb,
+					     inode,
+					     fe,
+					     le32_to_cpu(fe->i_clusters) -
+					     		clusters_to_del,
+					     old_last_eb_bh,
+					     &last_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (last_eb_bh)
+		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	el = &(fe->id2.i_list);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
+				      clusters_to_del;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
+	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+	/* tree depth zero, we can just delete the clusters, otherwise
+	 * we need to record the offset of the next level extent block
+	 * as we may overwrite it. */
+	if (!el->l_tree_depth)
+		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+			+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+	else
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+
+	if (!el->l_recs[i].e_clusters) {
+		/* if we deleted the whole extent record, then clear
+		 * out the other fields and update the extent
+		 * list. For depth > 0 trees, we've already recorded
+		 * the extent block in 'next_eb' */
+		el->l_recs[i].e_cpos = 0;
+		el->l_recs[i].e_blkno = 0;
+		BUG_ON(!el->l_next_free_rec);
+		le16_add_cpu(&el->l_next_free_rec, -1);
+	}
+
+	depth = le16_to_cpu(el->l_tree_depth);
+	if (!fe->i_clusters) {
+		/* trunc to zero is a special case. */
+		el->l_tree_depth = 0;
+		fe->i_last_eb_blk = 0;
+	} else if (last_eb)
+		fe->i_last_eb_blk = last_eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (last_eb) {
+		/* If there will be a new last extent block, then by
+		 * definition, there cannot be any leaves to the right of
+		 * it. */
+		status = ocfs2_journal_access(handle, inode, last_eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		last_eb->h_next_leaf_blk = 0;
+		status = ocfs2_journal_dirty(handle, last_eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* if our tree depth > 0, update all the tree blocks below us. */
+	while (depth) {
+		mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
+		     depth,  next_eb);
+		status = ocfs2_read_block(osb, next_eb, &eb_bh,
+					  OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+		mlog(0, "extent block %"MLFu64", before: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+		/* bottom-most block requires us to delete data. */
+		if (!el->l_tree_depth)
+			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+				+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+		if (!el->l_recs[i].e_clusters) {
+			el->l_recs[i].e_cpos = 0;
+			el->l_recs[i].e_blkno = 0;
+			BUG_ON(!el->l_next_free_rec);
+			le16_add_cpu(&el->l_next_free_rec, -1);
+		}
+		mlog(0, "extent block %"MLFu64", after: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!el->l_next_free_rec) {
+			mlog(0, "deleting this extent block.\n");
+
+			ocfs2_remove_from_cache(inode, eb_bh);
+
+			BUG_ON(eb->h_suballoc_slot);
+			BUG_ON(el->l_recs[0].e_clusters);
+			BUG_ON(el->l_recs[0].e_cpos);
+			BUG_ON(el->l_recs[0].e_blkno);
+			status = ocfs2_free_extent_block(handle,
+							 tc->tc_ext_alloc_inode,
+							 tc->tc_ext_alloc_bh,
+							 eb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		brelse(eb_bh);
+		eb_bh = NULL;
+		depth--;
+	}
+
+	BUG_ON(!delete_blk);
+	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+					   clusters_to_del);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	if (!status)
+		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
+	else
+		ocfs2_extent_map_drop(inode, 0);
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * It is expected that, by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
+ *
+ * WARNING: This will kfree the truncate context
+ */
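+/*
+ * Truncate runs in passes: each pass trims at most one extent
+ * record's worth of clusters from the rightmost leaf, stamps the
+ * freed range into the truncate log, commits the transaction and
+ * loops back to 'start' until i_clusters reaches the target.
+ */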
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc)
+{
+	int status, i, credits, tl_sem = 0;
+	u32 clusters_to_del, target_i_clusters;
+	u64 last_eb = 0;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						     i_size_read(inode));
+
+	last_eb_bh = tc->tc_last_eb_bh;
+	tc->tc_last_eb_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (fe->id2.i_list.l_tree_depth) {
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+	last_eb = le64_to_cpu(fe->i_last_eb_blk);
+start:
+	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
+	     "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
+	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+	     le32_to_cpu(fe->i_clusters), last_eb,
+	     le64_to_cpu(fe->i_last_eb_blk),
+	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
+
+	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
+		mlog(0, "last_eb changed!\n");
+		BUG_ON(!fe->id2.i_list.l_tree_depth);
+		last_eb = le64_to_cpu(fe->i_last_eb_blk);
+		/* i_last_eb_blk may have changed, read it if
+		 * necessary. We don't have to worry about the
+		 * truncate to zero case here (where there becomes no
+		 * last_eb) because we never loop back after our work
+		 * is done. */
+		if (last_eb_bh) {
+			brelse(last_eb_bh);
+			last_eb_bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, last_eb,
+					  &last_eb_bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+	}
+
+	/* by now, el will point to the extent list on the bottom most
+	 * portion of this tree. */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
+		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+	else
+		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+				   le32_to_cpu(el->l_recs[i].e_cpos)) -
+				  target_i_clusters;
+
+	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+
+	down(&tl_inode->i_sem);
+	tl_sem = 1;
+	/* ocfs2_truncate_log_needs_flush guarantees us at least one
+	 * record is free for use. If there isn't any, we flush to get
+	 * an empty truncate log.  */
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		status = __ocfs2_flush_truncate_log(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+						fe, el);
+	handle = ocfs2_start_trans(osb, NULL, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
+				   last_eb_bh, handle, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	up(&tl_inode->i_sem);
+	tl_sem = 0;
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
+	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
+		goto start;
+bail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	if (tl_sem)
+		up(&tl_inode->i_sem);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	/* This will drop the ext_alloc cluster lock for us */
+	ocfs2_free_truncate_context(tc);
+
+	mlog_exit(status);
+	return status;
+}
+
+
+/*
+ * Expects the inode to already be locked. This will figure out which
+ * inodes need to be locked and will put them on the returned truncate
+ * context.
+ */
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc)
+{
+	int status, metadata_delete;
+	unsigned int new_i_clusters;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh = NULL;
+	struct inode *ext_alloc_inode = NULL;
+	struct buffer_head *ext_alloc_bh = NULL;
+
+	mlog_entry_void();
+
+	*tc = NULL;
+
+	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						  i_size_read(inode));
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
+	     "%"MLFu64"\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+	     le64_to_cpu(fe->i_size));
+
+	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
+		ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
+			    "%u and size %"MLFu64" whereas struct inode has "
+			    "cluster count %u and size %llu which caused an "
+			    "invalid truncate to %u clusters.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->i_clusters),
+			    le64_to_cpu(fe->i_size),
+			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
+			    new_i_clusters);
+		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
+		status = -EIO;
+		goto bail;
+	}
+
+	*tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
+	if (!(*tc)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	metadata_delete = 0;
+	if (fe->id2.i_list.l_tree_depth) {
+		/* If we have a tree, then the truncate may result in
+		 * metadata deletes. Figure this out from the
+		 * rightmost leaf block.*/
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+
+			brelse(last_eb_bh);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+			metadata_delete = 1;
+	}
+
+	(*tc)->tc_last_eb_bh = last_eb_bh;
+
+	if (metadata_delete) {
+		mlog(0, "Will have to delete metadata for this trunc. "
+		     "locking allocator.\n");
+		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
+		if (!ext_alloc_inode) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		down(&ext_alloc_inode->i_sem);
+		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+
+		status = ocfs2_meta_lock(ext_alloc_inode,
+					 NULL,
+					 &ext_alloc_bh,
+					 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
+		(*tc)->tc_ext_alloc_locked = 1;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		if (*tc)
+			ocfs2_free_truncate_context(*tc);
+		*tc = NULL;
+	}
+	mlog_exit_void();
+	return status;
+}
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+{
+	if (tc->tc_ext_alloc_inode) {
+		if (tc->tc_ext_alloc_locked)
+			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
+
+		up(&tc->tc_ext_alloc_inode->i_sem);
+		iput(tc->tc_ext_alloc_inode);
+	}
+
+	if (tc->tc_ext_alloc_bh)
+		brelse(tc->tc_ext_alloc_bh);
+
+	if (tc->tc_last_eb_bh)
+		brelse(tc->tc_last_eb_bh);
+
+	kfree(tc);
+}

+ 82 - 0
fs/ocfs2/alloc.h

@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 blkno,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac);
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * level of the tree (current l_tree_depth), one block for the
+	 * new tree_depth==0 extent_block, and one block at the new
+	 * top-of-the tree.
+	 */
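+	/* e.g. at l_tree_depth == 2 this asks for at most 2 + 2 == 4
+	 * metadata blocks */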
+	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy);
+
+struct ocfs2_truncate_context {
+	struct inode *tc_ext_alloc_inode;
+	struct buffer_head *tc_ext_alloc_bh;
+	int tc_ext_alloc_locked; /* is it cluster locked? */
+	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;
+};
+
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc);
+
+#endif /* OCFS2_ALLOC_H */
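
A minimal sketch of how the two truncate entry points above pair up, assuming the caller already holds the inode's cluster lock as the comment on ocfs2_prepare_truncate() requires; the surrounding variables (osb, inode, fe_bh) stand in for the caller's context and are not code from this patch:

	struct ocfs2_truncate_context *tc = NULL;
	int status;

	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
	if (status < 0)
		return status;

	/* loops internally until i_clusters reaches the new size;
	 * frees tc (and drops the ext_alloc cluster lock) on exit */
	return ocfs2_commit_truncate(osb, inode, fe_bh, tc);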

+ 643 - 0
fs/ocfs2/aops.c

@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	void *kaddr;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  &bh, OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		goto bail;
+	}
+
+	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+						    le32_to_cpu(fe->i_clusters))) {
+		mlog(ML_ERROR, "block offset is outside the allocated size: "
+		     "%llu\n", (unsigned long long)iblock);
+		goto bail;
+	}
+
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+			    iblock;
+		buffer_cache_bh = sb_getblk(osb->sb, blkno);
+		if (!buffer_cache_bh) {
+			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+			goto bail;
+		}
+
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh)
+		    && ocfs2_inode_is_new(inode)) {
+			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+			if (!kaddr) {
+				mlog(ML_ERROR, "couldn't kmap!\n");
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock),
+			       buffer_cache_bh->b_data,
+			       bh_result->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+
+	map_bh(bh_result, inode->i_sb,
+	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+
+	err = 0;
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh_result, int create)
+{
+	int err = 0;
+	u64 p_blkno, past_eof;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+		     inode, inode->i_ino);
+
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+		goto bail;
+	}
+
+	/* this can happen if another node truncs after our extend! */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+					       OCFS2_I(inode)->ip_clusters))
+		err = -EIO;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (err)
+		goto bail;
+
+	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  NULL);
+	if (err) {
+		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+		     "%"MLFu64", NULL)\n", err, inode,
+		     (unsigned long long)iblock, p_blkno);
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	if (bh_result->b_blocknr == 0) {
+		err = -EIO;
+		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
+		     "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
+		     p_blkno, OCFS2_I(inode)->ip_blkno);
+	}
+
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
+bail:
+	if (err < 0)
+		err = -EIO;
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
+
+	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * i_size might have just been updated as we grabbed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		char *addr = kmap(page);
+		memset(addr, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		kunmap(page);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out_alloc;
+	}
+
+	ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;
+
+	ocfs2_data_unlock(inode, 0);
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	mlog_entry("(0x%p)\n", page);
+
+	ret = block_write_full_page(page, ocfs2_get_block, wbc);
+
+	mlog_exit(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block().
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_meta_unlock(inode, 0);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0) 
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (handle)
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
+static int ocfs2_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence ensuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/* might update i_size */
+	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (extending) {
+		loff_t size = (u64) i_size_read(inode);
+		struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)di_bh->b_data;
+
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = 
+				cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = 
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	BUG_ON(extending && (i_size_read(inode) != new_i_size));
+
+out_commit:
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t status;
+	u64 p_blkno = 0;
+	int err = 0;
+	struct inode *inode = mapping->host;
+
+	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+
+	/* We don't need to lock journal system files, since they aren't
+	 * accessed concurrently from multiple nodes.
+	 */
+	if (!INODE_JOURNAL(inode)) {
+		err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+		if (err) {
+			if (err != -ENOENT)
+				mlog_errno(err);
+			goto bail;
+		}
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	}
+
+	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+					  NULL);
+
+	if (!INODE_JOURNAL(inode)) {
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		ocfs2_meta_unlock(inode, 0);
+	}
+
+	if (err) {
+		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+		     (unsigned long long)block);
+		mlog_errno(err);
+		goto bail;
+	}
+
+
+bail:
+	status = err ? 0 : p_blkno;
+
+	mlog_exit((int)status);
+
+	return status;
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     unsigned long max_blocks,
+				     struct buffer_head *bh_result, int create)
+{
+	int ret;
+	u64 vbo_max; /* file offset, max_blocks from iblock */
+	u64 p_blkno;
+	int contig_blocks;
+	unsigned char blocksize_bits;
+
+	if (!inode || !bh_result) {
+		mlog(ML_ERROR, "inode or bh_result is null\n");
+		return -EIO;
+	}
+
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+
+	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if ((iblock + max_blocks) >
+	    ocfs2_clusters_to_blocks(inode->i_sb,
+				     OCFS2_I(inode)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		ret = -EIO;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  &contig_blocks);
+	if (ret) {
+		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+		     (unsigned long long)iblock);
+		ret = -EIO;
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+	bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+	return ret;
+}
+
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Like the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+
+static ssize_t ocfs2_direct_IO(int rw,
+			       struct kiocb *iocb,
+			       const struct iovec *iov,
+			       loff_t offset,
+			       unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+
+	mlog_entry_void();
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev, iov, offset,
+					    nr_segs, 
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
+	mlog_exit(ret);
+	return ret;
+}
+
+struct address_space_operations ocfs2_aops = {
+	.readpage	= ocfs2_readpage,
+	.writepage	= ocfs2_writepage,
+	.prepare_write	= ocfs2_prepare_write,
+	.commit_write	= ocfs2_commit_write,
+	.bmap		= ocfs2_bmap,
+	.sync_page	= block_sync_page,
+	.direct_IO	= ocfs2_direct_IO
+};

+ 41 - 0
fs/ocfs2/aops.h

@@ -0,0 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+
+/* all ocfs2_dio_end_io()'s fault */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+	set_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(0, (unsigned long *)&iocb->private)
+
+#endif /* OCFS2_AOPS_H */
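
A minimal sketch of the submit side these bit helpers serve, mirroring ocfs2_dio_end_io() in aops.c above; ocfs2_rw_lock() is inferred from the ocfs2_rw_unlock() call there, and the order and level of the lock calls are assumptions for illustration:

	ocfs2_rw_lock(inode, 0);	/* the rw_lock DLM lock */
	down_read(&inode->i_alloc_sem);
	ocfs2_iocb_set_rw_locked(iocb);

	/* ... issue the direct I/O; on completion ocfs2_dio_end_io()
	 * checks the bit, clears it, and drops both locks ... */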

+ 232 - 0
fs/ocfs2/buffer_head_io.c

@@ -0,0 +1,232 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * buffer_head_io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct inode *inode)
+{
+	int ret = 0;
+
+	mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+		   (unsigned long long)bh->b_blocknr, inode);
+
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */                   
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(inode, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate. */
+		ret = -EIO;
+		brelse(bh);
+	}
+
+	up(&OCFS2_I(inode)->ip_io_sem);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+
+	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+		   block, nr, flags, inode);
+
+	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr == 0) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		status = 0;
+		goto bail;
+	}
+
+	sb = osb->sb;
+
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+
+	if (inode)
+		down(&OCFS2_I(inode)->ip_io_sem);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				if (inode)
+					up(&OCFS2_I(inode)->ip_io_sem);
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = 0;
+
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
+			     (unsigned long long)bh->b_blocknr,
+			     OCFS2_I(inode)->ip_blkno);
+			ignore_cache = 1;
+		}
+
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
+		if (buffer_jbd(bh)) {
+			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+				mlog(ML_BH_IO, "trying to sync read a jbd "
+					       "managed bh (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				mlog(ML_BH_IO, "asking me to sync read a dirty "
+					       "buffer! (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+				continue;
+			}
+
+			lock_buffer(bh);
+			if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+				mlog(ML_ERROR, "block %llu had the JBD bit set "
+					       "while I was in lock_buffer!",
+				     (unsigned long long)bh->b_blocknr);
+				BUG();
+#else
+				unlock_buffer(bh);
+				continue;
+#endif
+			}
+			clear_buffer_uptodate(bh);
+			get_bh(bh); /* for end_buffer_read_sync() */
+			bh->b_end_io = end_buffer_read_sync;
+			if (flags & OCFS2_BH_READAHEAD)
+				submit_bh(READA, bh);
+			else
+				submit_bh(READ, bh);
+			continue;
+		}
+	}
+
+	status = 0;
+
+	for (i = (nr - 1); i >= 0; i--) {
+		bh = bhs[i];
+
+		/* We know this can't have changed as we hold the
+		 * inode sem. Avoid doing any work on the bh if the
+		 * journal has it. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
+
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. Don't need to
+			 * remove the clustered uptodate information
+			 * for this bh as it's not marked locally
+			 * uptodate. */
+			status = -EIO;
+			brelse(bh);
+			bhs[i] = NULL;
+			continue;
+		}
+
+		if (inode)
+			ocfs2_set_buffer_uptodate(inode, bh);
+	}
+	if (inode)
+		up(&OCFS2_I(inode)->ip_io_sem);
+
+	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
+	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+
+bail:
+
+	mlog_exit(status);
+	return status;
+}

+ 73 - 0
fs/ocfs2/buffer_head_io.h

@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * buffer_head_io.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+
+#include <linux/buffer_head.h>
+
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+			     int uptodate);
+
+static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+				   u64                  off,
+				   struct buffer_head **bh,
+				   int                  flags,
+				   struct inode        *inode);
+
+int ocfs2_write_block(struct ocfs2_super          *osb,
+		      struct buffer_head  *bh,
+		      struct inode        *inode);
+int ocfs2_read_blocks(struct ocfs2_super          *osb,
+		      u64                  block,
+		      int                  nr,
+		      struct buffer_head  *bhs[],
+		      int                  flags,
+		      struct inode        *inode);
+
+
+#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
+
+static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
+				   struct buffer_head **bh, int flags,
+				   struct inode *inode)
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_blocks(osb, off, 1, bh,
+				   flags, inode);
+
+bail:
+	return status;
+}
+
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
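
A minimal usage sketch of the single-block wrapper defined above, following the call sites in alloc.c and aops.c; osb, blkno, and inode stand in for the caller's context:

	struct buffer_head *bh = NULL;
	int status;

	status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, inode);
	if (status < 0)
		mlog_errno(status);
	else
		brelse(bh);	/* caller owns the bh reference on success */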

+ 4 - 0
fs/ocfs2/cluster/Makefile

@@ -0,0 +1,4 @@
+obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
+
+ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
+	quorum.o tcp.o ver.o

+ 30 - 0
fs/ocfs2/cluster/endian.h

@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_CLUSTER_ENDIAN_H
+#define OCFS2_CLUSTER_ENDIAN_H
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
+
+#endif /* OCFS2_CLUSTER_ENDIAN_H */
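
For example, bumping an on-disk big-endian counter in place (the variable is illustrative):

	__be32 refcount = cpu_to_be32(41);

	be32_add_cpu(&refcount, 1);	/* now holds cpu_to_be32(42) */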

+ 1797 - 0
fs/ocfs2/cluster/heartbeat.c

@@ -0,0 +1,1797 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "quorum.h"
+
+#include "masklog.h"
+
+
+/*
+ * The first heartbeat pass had one global thread that would serialize all hb
+ * callback calls.  This global serializing sem should only be removed once
+ * we've made sure that all callees can deal with being called concurrently
+ * from multiple hb region threads.
+ */
+static DECLARE_RWSEM(o2hb_callback_sem);
+
+/*
+ * multiple hb threads are watching multiple regions.  A node is live
+ * whenever any of the threads sees activity from the node in its region.
+ */
+static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
+static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+static LIST_HEAD(o2hb_node_events);
+static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+
+static LIST_HEAD(o2hb_all_regions);
+
+static struct o2hb_callback {
+	struct list_head list;
+} o2hb_callbacks[O2HB_NUM_CB];
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
+
+#define O2HB_DEFAULT_BLOCK_BITS       9
+
+unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+
+/* Only sets a new threshold if there are no active regions. 
+ *
+ * No locking or otherwise interesting code is required for reading
+ * o2hb_dead_threshold as it can't change once regions are active and
+ * it's not interesting to anyone until then anyway. */
+static void o2hb_dead_threshold_set(unsigned int threshold)
+{
+	if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions))
+			o2hb_dead_threshold = threshold;
+		spin_unlock(&o2hb_live_lock);
+	}
+}
+
+struct o2hb_node_event {
+	struct list_head        hn_item;
+	enum o2hb_callback_type hn_event_type;
+	struct o2nm_node        *hn_node;
+	int                     hn_node_num;
+};
+
+struct o2hb_disk_slot {
+	struct o2hb_disk_heartbeat_block *ds_raw_block;
+	u8			ds_node_num;
+	u64			ds_last_time;
+	u64			ds_last_generation;
+	u16			ds_equal_samples;
+	u16			ds_changed_samples;
+	struct list_head	ds_live_item;
+};
+
+/* each thread owns a region.. when we're asked to tear down the region
+ * we ask the thread to stop, which then cleans up the region */
+struct o2hb_region {
+	struct config_item	hr_item;
+
+	struct list_head	hr_all_item;
+	unsigned		hr_unclean_stop:1;
+
+	/* protected by the hr_callback_sem */
+	struct task_struct 	*hr_task;
+
+	unsigned int		hr_blocks;
+	unsigned long long	hr_start_block;
+
+	unsigned int		hr_block_bits;
+	unsigned int		hr_block_bytes;
+
+	unsigned int		hr_slots_per_page;
+	unsigned int		hr_num_pages;
+
+	struct page             **hr_slot_data;
+	struct block_device	*hr_bdev;
+	struct o2hb_disk_slot	*hr_slots;
+
+	/* let the person setting up hb wait until it
+	 * has reached a 'steady' state.  This will be fixed when we have
+	 * a more complete api that doesn't lead to this sort of fragility. */
+	atomic_t		hr_steady_iterations;
+
+	char			hr_dev_name[BDEVNAME_SIZE];
+
+	unsigned int		hr_timeout_ms;
+
+	/* randomized as the region goes up and down so that a node
+	 * recognizes a node going up and down in one iteration */
+	u64			hr_generation;
+
+	struct work_struct	hr_write_timeout_work;
+	unsigned long		hr_last_timeout_start;
+
+	/* Used during o2hb_check_slot to hold a copy of the block
+	 * being checked because we temporarily have to zero out the
+	 * crc field. */
+	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+};
+
+struct o2hb_bio_wait_ctxt {
+	atomic_t          wc_num_reqs;
+	struct completion wc_io_complete;
+};
+
+static void o2hb_write_timeout(void *arg)
+{
+	struct o2hb_region *reg = arg;
+
+	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
+	     "milliseconds\n", reg->hr_dev_name,
+	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 
+	o2quo_disk_timeout();
+}
+
+static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+{
+	mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	reg->hr_last_timeout_start = jiffies;
+	schedule_delayed_work(&reg->hr_write_timeout_work,
+			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+}
+
+static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+{
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	flush_scheduled_work();
+}
+
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int num_ios)
+{
+	atomic_set(&wc->wc_num_reqs, num_ios);
+	init_completion(&wc->wc_io_complete);
+}
+
+/* Used in error paths too */
+static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
+				     unsigned int num)
+{
+	/* sadly atomic_sub_and_test() isn't available on all platforms.  The
+	 * good news is that the fast path only completes one at a time */
+	while(num--) {
+		if (atomic_dec_and_test(&wc->wc_num_reqs)) {
+			BUG_ON(num > 0);
+			complete(&wc->wc_io_complete);
+		}
+	}
+}
+
+static void o2hb_wait_on_io(struct o2hb_region *reg,
+			    struct o2hb_bio_wait_ctxt *wc)
+{
+	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
+
+	blk_run_address_space(mapping);
+
+	wait_for_completion(&wc->wc_io_complete);
+}
+
+static int o2hb_bio_end_io(struct bio *bio,
+			   unsigned int bytes_done,
+			   int error)
+{
+	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
+
+	if (error)
+		mlog(ML_ERROR, "IO Error %d\n", error);
+
+	if (bio->bi_size)
+		return 1;
+
+	o2hb_bio_wait_dec(wc, 1);
+	return 0;
+}
+
+/* Set up a bio to cover I/O against num_slots slots starting at
+ * start_slot. */
+static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
+				      struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int start_slot,
+				      unsigned int num_slots)
+{
+	int i, nr_vecs, len, first_page, last_page;
+	unsigned int vec_len, vec_start;
+	unsigned int bits = reg->hr_block_bits;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct bio *bio;
+	struct page *page;
+
+	nr_vecs = (num_slots + spp - 1) / spp;
+
+	/* Testing has shown this allocation to take long enough under
+	 * GFP_KERNEL that the local node can get fenced. It would be
+	 * nicer if we could pre-allocate these bios and avoid this
+	 * altogether. */
+	bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+	if (!bio) {
+		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
+		bio = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	/* Must put everything in 512 byte sectors for the bio... */
+	bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+	bio->bi_bdev = reg->hr_bdev;
+	bio->bi_private = wc;
+	bio->bi_end_io = o2hb_bio_end_io;
+
+	first_page = start_slot / spp;
+	last_page = first_page + nr_vecs;
+	vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
+	for(i = first_page; i < last_page; i++) {
+		page = reg->hr_slot_data[i];
+
+		vec_len = PAGE_CACHE_SIZE;
+		/* last page might be short */
+		if (((i + 1) * spp) > (start_slot + num_slots))
+			vec_len = ((num_slots + start_slot) % spp) << bits;
+		vec_len -=  vec_start;
+
+		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
+		     i, vec_len, vec_start);
+
+		len = bio_add_page(bio, page, vec_len, vec_start);
+		if (len != vec_len) {
+			bio_put(bio);
+			bio = ERR_PTR(-EIO);
+
+			mlog(ML_ERROR, "Error adding page to bio i = %d, "
+			     "vec_len = %u, len = %d\n, start = %u\n",
+			     i, vec_len, len, vec_start);
+			goto bail;
+		}
+
+		vec_start = 0;
+	}
+
+bail:
+	return bio;
+}
+
+/*
+ * Compute the maximum number of sectors the bdev can handle in one bio,
+ * as a power of two.
+ *
+ * Stolen from oracleasm, thanks Joel!
+ */
+static int compute_max_sectors(struct block_device *bdev)
+{
+	int max_pages, max_sectors, pow_two_sectors;
+
+	struct request_queue *q;
+
+	q = bdev_get_queue(bdev);
+	max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
+	if (max_pages > q->max_phys_segments)
+		max_pages = q->max_phys_segments;
+	if (max_pages > q->max_hw_segments)
+		max_pages = q->max_hw_segments;
+	max_pages--; /* Handle I/Os that straddle a page */
+
+	max_sectors = max_pages << (PAGE_SHIFT - 9);
+
+	/* Why is fls() 1-based???? */
+	pow_two_sectors = 1 << (fls(max_sectors) - 1);
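+	/* e.g. fls(1000) == 10, so pow_two_sectors == 1 << 9 == 512,
+	 * the largest power of two not exceeding max_sectors */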
+
+	return pow_two_sectors;
+}
+
+static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
+					       unsigned int num_slots,
+					       unsigned int *num_bios,
+					       unsigned int *slots_per_bio)
+{
+	unsigned int max_sectors, io_sectors;
+
+	max_sectors = compute_max_sectors(reg->hr_bdev);
+
+	io_sectors = num_slots << (reg->hr_block_bits - 9);
+
+	*num_bios = (io_sectors + max_sectors - 1) / max_sectors;
+	*slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
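+	/* e.g. 512-byte slots (hr_block_bits == 9) and a 128 sector
+	 * limit give, for 255 slots: io_sectors == 255, num_bios == 2,
+	 * slots_per_bio == 128 */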
+
+	mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
+	     "device can handle %u sectors of I/O\n", io_sectors, num_slots,
+	     max_sectors);
+	mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
+	     *num_bios, *slots_per_bio);
+}
+
+static int o2hb_read_slots(struct o2hb_region *reg,
+			   unsigned int max_slots)
+{
+	unsigned int num_bios, slots_per_bio, start_slot, num_slots;
+	int i, status;
+	struct o2hb_bio_wait_ctxt wc;
+	struct bio **bios;
+	struct bio *bio;
+
+	o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
+
+	bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
+	if (!bios) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		return status;
+	}
+
+	o2hb_bio_wait_init(&wc, num_bios);
+
+	num_slots = slots_per_bio;
+	for(i = 0; i < num_bios; i++) {
+		start_slot = i * slots_per_bio;
+
+		/* adjust num_slots at last bio */
+		if (max_slots < (start_slot + num_slots))
+			num_slots = max_slots - start_slot;
+
+		bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
+		if (IS_ERR(bio)) {
+			o2hb_bio_wait_dec(&wc, num_bios - i);
+
+			status = PTR_ERR(bio);
+			mlog_errno(status);
+			goto bail_and_wait;
+		}
+		bios[i] = bio;
+
+		submit_bio(READ, bio);
+	}
+
+	status = 0;
+
+bail_and_wait:
+	o2hb_wait_on_io(reg, &wc);
+
+	if (bios) {
+		for(i = 0; i < num_bios; i++)
+			if (bios[i])
+				bio_put(bios[i]);
+		kfree(bios);
+	}
+
+	return status;
+}
+
+static int o2hb_issue_node_write(struct o2hb_region *reg,
+				 struct bio **write_bio,
+				 struct o2hb_bio_wait_ctxt *write_wc)
+{
+	int status;
+	unsigned int slot;
+	struct bio *bio;
+
+	o2hb_bio_wait_init(write_wc, 1);
+
+	slot = o2nm_this_node();
+
+	bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
+	if (IS_ERR(bio)) {
+		status = PTR_ERR(bio);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	submit_bio(WRITE, bio);
+
+	*write_bio = bio;
+	status = 0;
+bail:
+	return status;
+}
+
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+				     struct o2hb_disk_heartbeat_block *hb_block)
+{
+	__le32 old_cksum;
+	u32 ret;
+
+	/* We want to compute the block crc with a 0 value in the
+	 * hb_cksum field. Save it off here and replace after the
+	 * crc. */
+	old_cksum = hb_block->hb_cksum;
+	hb_block->hb_cksum = 0;
+
+	ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+
+	hb_block->hb_cksum = old_cksum;
+
+	return ret;
+}
+
+static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
+{
+	mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
+	     "cksum = 0x%x, generation 0x%"MLFx64"\n",
+	     le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
+	     le32_to_cpu(hb_block->hb_cksum),
+	     le64_to_cpu(hb_block->hb_generation));
+}
+
+static int o2hb_verify_crc(struct o2hb_region *reg,
+			   struct o2hb_disk_heartbeat_block *hb_block)
+{
+	u32 read, computed;
+
+	read = le32_to_cpu(hb_block->hb_cksum);
+	computed = o2hb_compute_block_crc_le(reg, hb_block);
+
+	return read == computed;
+}
+
+/* We want to make sure that nobody is heartbeating on top of us --
+ * this will help detect an invalid configuration. */
+static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+{
+	int node_num, ret;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	node_num = o2nm_this_node();
+
+	ret = 1;
+	slot = &reg->hr_slots[node_num];
+	/* Don't check on our 1st timestamp */
+	if (slot->ds_last_time) {
+		hb_block = slot->ds_raw_block;
+
+		if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
+			ret = 0;
+	}
+
+	return ret;
+}
+
+static inline void o2hb_prepare_block(struct o2hb_region *reg,
+				      u64 generation)
+{
+	int node_num;
+	u64 cputime;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	node_num = o2nm_this_node();
+	slot = &reg->hr_slots[node_num];
+
+	hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
+	memset(hb_block, 0, reg->hr_block_bytes);
+	/* TODO: time stuff */
+	cputime = CURRENT_TIME.tv_sec;
+	if (!cputime)
+		cputime = 1;
+
+	hb_block->hb_seq = cpu_to_le64(cputime);
+	hb_block->hb_node = node_num;
+	hb_block->hb_generation = cpu_to_le64(generation);
+
+	/* This step must always happen last! */
+	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
+								   hb_block));
+
+	mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
+	     cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum));
+}
+
+static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
+				struct o2nm_node *node,
+				int idx)
+{
+	struct list_head *iter;
+	struct o2hb_callback_func *f;
+
+	list_for_each(iter, &hbcall->list) {
+		f = list_entry(iter, struct o2hb_callback_func, hc_item);
+		mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
+		(f->hc_func)(node, idx, f->hc_data);
+	}
+}
+
+/* Will run the list in order until we process the passed event */
+static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
+{
+	int empty;
+	struct o2hb_callback *hbcall;
+	struct o2hb_node_event *event;
+
+	spin_lock(&o2hb_live_lock);
+	empty = list_empty(&queued_event->hn_item);
+	spin_unlock(&o2hb_live_lock);
+	if (empty)
+		return;
+
+	/* Holding callback sem assures we don't alter the callback
+	 * lists when doing this, and serializes ourselves with other
+	 * processes wanting callbacks. */
+	down_write(&o2hb_callback_sem);
+
+	spin_lock(&o2hb_live_lock);
+	while (!list_empty(&o2hb_node_events)
+	       && !list_empty(&queued_event->hn_item)) {
+		event = list_entry(o2hb_node_events.next,
+				   struct o2hb_node_event,
+				   hn_item);
+		list_del_init(&event->hn_item);
+		spin_unlock(&o2hb_live_lock);
+
+		mlog(ML_HEARTBEAT, "Node %s event for %d\n",
+		     event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
+		     event->hn_node_num);
+
+		hbcall = hbcall_from_type(event->hn_event_type);
+
+		/* We should *never* have gotten on to the list with a
+		 * bad type... This isn't something that we should try
+		 * to recover from. */
+		BUG_ON(IS_ERR(hbcall));
+
+		o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
+
+		spin_lock(&o2hb_live_lock);
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	up_write(&o2hb_callback_sem);
+}
+
+static void o2hb_queue_node_event(struct o2hb_node_event *event,
+				  enum o2hb_callback_type type,
+				  struct o2nm_node *node,
+				  int node_num)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	event->hn_event_type = type;
+	event->hn_node = node;
+	event->hn_node_num = node_num;
+
+	mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
+	     type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
+
+	list_add_tail(&event->hn_item, &o2hb_node_events);
+}
+
+static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
+{
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node)
+		return;
+
+	spin_lock(&o2hb_live_lock);
+	if (!list_empty(&slot->ds_live_item)) {
+		mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
+		     slot->ds_node_num);
+
+		list_del_init(&slot->ds_live_item);
+
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+					      slot->ds_node_num);
+		}
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	o2hb_run_event_list(&event);
+
+	o2nm_node_put(node);
+}
+
+static int o2hb_check_slot(struct o2hb_region *reg,
+			   struct o2hb_disk_slot *slot)
+{
+	int changed = 0, gen_changed = 0;
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
+	u64 cputime;
+
+	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
+
+	/* Is this correct? Do we assume that the node doesn't exist
+	 * if we're not configured for him? */
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node)
+		return 0;
+
+	if (!o2hb_verify_crc(reg, hb_block)) {
+		/* all paths from here will drop o2hb_live_lock for
+		 * us. */
+		spin_lock(&o2hb_live_lock);
+
+		/* Don't print an error on the console in this case -
+		 * a freshly formatted heartbeat area will not have a
+		 * crc set on it. */
+		if (list_empty(&slot->ds_live_item))
+			goto out;
+
+		/* The node is live but pushed out a bad crc. We
+		 * consider it a transient miss but don't populate any
+		 * other values as they may be junk. */
+		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+		     slot->ds_node_num, reg->hr_dev_name);
+		o2hb_dump_slot(hb_block);
+
+		slot->ds_equal_samples++;
+		goto fire_callbacks;
+	}
+
+	/* we don't care if these wrap.. the state transitions below
+	 * clear at the right places */
+	cputime = le64_to_cpu(hb_block->hb_seq);
+	if (slot->ds_last_time != cputime)
+		slot->ds_changed_samples++;
+	else
+		slot->ds_equal_samples++;
+	slot->ds_last_time = cputime;
+
+	/* The node changed heartbeat generations. We assume this to
+	 * mean it dropped off but came back before we timed out. We
+	 * want to consider it down for the time being but don't want
+	 * to lose any changed_samples state we might build up to
+	 * considering it live again. */
+	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+		gen_changed = 1;
+		slot->ds_equal_samples = 0;
+		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
+		     "to 0x%"MLFx64")\n", slot->ds_node_num,
+		     slot->ds_last_generation,
+		     le64_to_cpu(hb_block->hb_generation));
+	}
+
+	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+	mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
+	     "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
+	     slot->ds_node_num, slot->ds_last_generation,
+	     le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), 
+	     slot->ds_last_time, slot->ds_changed_samples,
+	     slot->ds_equal_samples);
+
+	spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
+	/* dead nodes only come to life after some number of
+	 * changes at any time during their dead time */
+	if (list_empty(&slot->ds_live_item) &&
+	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
+		mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
+		     "region\n", slot->ds_node_num, slot->ds_last_generation);
+
+		/* first on the list generates a callback */
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+		}
+
+		list_add_tail(&slot->ds_live_item,
+			      &o2hb_live_slots[slot->ds_node_num]);
+
+		slot->ds_equal_samples = 0;
+		goto out;
+	}
+
+	/* if the list is dead, we're done.. */
+	if (list_empty(&slot->ds_live_item))
+		goto out;
+
+	/* live nodes only go dead after enough consecutive missed
+	 * samples..  reset the missed counter whenever we see
+	 * activity */
+	if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
+		mlog(ML_HEARTBEAT, "Node %d left my region\n",
+		     slot->ds_node_num);
+
+		/* last off the live_slot generates a callback */
+		list_del_init(&slot->ds_live_item);
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+		}
+
+		/* We don't clear this because the node is still
+		 * actually writing new blocks. */
+		if (!gen_changed)
+			slot->ds_changed_samples = 0;
+		goto out;
+	}
+	if (slot->ds_changed_samples) {
+		slot->ds_changed_samples = 0;
+		slot->ds_equal_samples = 0;
+	}
+out:
+	spin_unlock(&o2hb_live_lock);
+
+	o2hb_run_event_list(&event);
+
+	o2nm_node_put(node);
+	return changed;
+}
+
+/* This could be faster if we just implemented a find_last_bit, but I
+ * don't think the circumstances warrant it. */
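+/* Returns the highest set bit, or numbits when no bits are set --
+ * callers treat a return >= numbits as "no configured nodes" (see
+ * o2hb_do_disk_heartbeat() below) */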
+static int o2hb_highest_node(unsigned long *nodes,
+			     int numbits)
+{
+	int highest, node;
+
+	highest = numbits;
+	node = -1;
+	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
+		if (node >= numbits)
+			break;
+
+		highest = node;
+	}
+
+	return highest;
+}
+
+static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+{
+	int i, ret, highest_node, change = 0;
+	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct bio *write_bio;
+	struct o2hb_bio_wait_ctxt write_wc;
+
+	if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
+		return;
+
+	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
+	if (highest_node >= O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
+		return;
+	}
+
+	/* No sense in reading the slots of nodes that don't exist
+	 * yet. Of course, if the node definitions have holes in them
+	 * then we're reading an empty slot anyway... Consider this
+	 * best-effort. */
+	ret = o2hb_read_slots(reg, highest_node + 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return;
+	}
+
+	/* With an up to date view of the slots, we can check that no
+	 * other node has been improperly configured to heartbeat in
+	 * our slot. */
+	if (!o2hb_check_last_timestamp(reg))
+		mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
+		     "in our slot!\n", reg->hr_dev_name);
+
+	/* fill in the proper info for our next heartbeat */
+	o2hb_prepare_block(reg, reg->hr_generation);
+
+	/* And fire off the write. Note that we don't wait on this I/O
+	 * until later. */
+	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return;
+	}
+
+	i = -1;
+	while ((i = find_next_bit(configured_nodes, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES)
+		change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+
+	/*
+	 * We have to be sure we've advertised ourselves on disk
+	 * before we can go to steady state.  This ensures that
+	 * people we find in our steady state have seen us.
+	 */
+	o2hb_wait_on_io(reg, &write_wc);
+	bio_put(write_bio);
+	o2hb_arm_write_timeout(reg);
+
+	/* let the person who launched us know when things are steady */
+	if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
+		if (atomic_dec_and_test(&reg->hr_steady_iterations))
+			wake_up(&o2hb_steady_queue);
+	}
+}
+
+/* Subtract b from a, storing the result in a. If a is earlier than b,
+ * the result is clamped to zero. */
+static void o2hb_tv_subtract(struct timeval *a,
+			     struct timeval *b)
+{
+	/* just return 0 when b is after a */
+	if (a->tv_sec < b->tv_sec ||
+	    (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
+		a->tv_sec = 0;
+		a->tv_usec = 0;
+		return;
+	}
+
+	a->tv_sec -= b->tv_sec;
+	a->tv_usec -= b->tv_usec;
+	while ( a->tv_usec < 0 ) {
+		a->tv_sec--;
+		a->tv_usec += 1000000;
+	}
+}
+
+static unsigned int o2hb_elapsed_msecs(struct timeval *start,
+				       struct timeval *end)
+{
+	struct timeval res = *end;
+
+	o2hb_tv_subtract(&res, start);
+
+	return res.tv_sec * 1000 + res.tv_usec / 1000;
+}
+
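+/* Worked example (hypothetical values): start = {5, 900000} and
+ * end = {7, 100000} leave {1, 200000} after the subtract, so this
+ * returns 1 * 1000 + 200000 / 1000 = 1200 msec. */
+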
+/*
+ * we ride the region ref that the region dir holds.  before the region
+ * dir is removed and drops its ref it will wait to tear down this
+ * thread.
+ */
+static int o2hb_thread(void *data)
+{
+	int i, ret;
+	struct o2hb_region *reg = data;
+	struct bio *write_bio;
+	struct o2hb_bio_wait_ctxt write_wc;
+	struct timeval before_hb, after_hb;
+	unsigned int elapsed_msec;
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
+
+	set_user_nice(current, -20);
+
+	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+		/* We track the time spent inside
+	 * o2hb_do_disk_heartbeat so that we avoid more than
+		 * hr_timeout_ms between disk writes. On busy systems
+		 * this should result in a heartbeat which is less
+		 * likely to time itself out. */
+		do_gettimeofday(&before_hb);
+
+		o2hb_do_disk_heartbeat(reg);
+
+		do_gettimeofday(&after_hb);
+		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+
+		mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+		     before_hb.tv_sec, before_hb.tv_usec,
+		     after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
+
+		if (elapsed_msec < reg->hr_timeout_ms) {
+			/* the kthread api has blocked signals for us so no
+			 * need to record the return value. */
+			msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
+		}
+	}
+
+	o2hb_disarm_write_timeout(reg);
+
+	/* unclean stop is only used in very bad situations */
+	for (i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
+		o2hb_shutdown_slot(&reg->hr_slots[i]);
+
+	/* Explicit down notification - avoid forcing the other nodes
+	 * to timeout on this region when we could just as easily
+	 * write a clear generation - thus indicating to them that
+	 * this node has left this region.
+	 *
+	 * XXX: Should we skip this on unclean_stop? */
+	o2hb_prepare_block(reg, 0);
+	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	if (ret == 0) {
+		o2hb_wait_on_io(reg, &write_wc);
+		bio_put(write_bio);
+	} else {
+		mlog_errno(ret);
+	}
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+
+	return 0;
+}
+
+void o2hb_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
+		INIT_LIST_HEAD(&o2hb_callbacks[i].list);
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
+		INIT_LIST_HEAD(&o2hb_live_slots[i]);
+
+	INIT_LIST_HEAD(&o2hb_node_events);
+
+	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+}
+
+/* if we're already in a callback then we're already serialized by the sem */
+static void o2hb_fill_node_map_from_callback(unsigned long *map,
+					     unsigned bytes)
+{
+	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+	memcpy(map, &o2hb_live_node_bitmap, bytes);
+}
+
+/*
+ * get a map of all nodes that are heartbeating in any regions
+ */
+void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+{
+	/* callers want to serialize this map and callbacks so that they
+	 * can trust that they don't miss nodes coming to the party */
+	down_read(&o2hb_callback_sem);
+	spin_lock(&o2hb_live_lock);
+	o2hb_fill_node_map_from_callback(map, bytes);
+	spin_unlock(&o2hb_live_lock);
+	up_read(&o2hb_callback_sem);
+}
+EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
+
+/*
+ * heartbeat configfs bits.  The heartbeat set is a default set under
+ * the cluster set in nodemanager.c.
+ */
+
+static struct o2hb_region *to_o2hb_region(struct config_item *item)
+{
+	return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
+}
+
+/* drop_item only drops its ref after killing the thread, so nothing
+ * should be using the region anymore.  this has to clean up any state
+ * attributes might have built up. */
+static void o2hb_region_release(struct config_item *item)
+{
+	int i;
+	struct page *page;
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	kfree(reg->hr_tmp_block);
+
+	if (reg->hr_slot_data) {
+		for (i = 0; i < reg->hr_num_pages; i++) {
+			page = reg->hr_slot_data[i];
+			if (page)
+				__free_page(page);
+		}
+		kfree(reg->hr_slot_data);
+	}
+
+	if (reg->hr_bdev)
+		blkdev_put(reg->hr_bdev);
+
+	kfree(reg->hr_slots);
+
+	spin_lock(&o2hb_live_lock);
+	list_del(&reg->hr_all_item);
+	spin_unlock(&o2hb_live_lock);
+
+	kfree(reg);
+}
+
+static int o2hb_read_block_input(struct o2hb_region *reg,
+				 const char *page,
+				 size_t count,
+				 unsigned long *ret_bytes,
+				 unsigned int *ret_bits)
+{
+	unsigned long bytes;
+	char *p = (char *)page;
+
+	bytes = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	/* Heartbeat and fs min / max block sizes are the same. */
+	if (bytes > 4096 || bytes < 512)
+		return -ERANGE;
+	if (hweight16(bytes) != 1)
+		return -EINVAL;
+
+	if (ret_bytes)
+		*ret_bytes = bytes;
+	if (ret_bits)
+		*ret_bits = ffs(bytes) - 1;
+
+	return 0;
+}
+
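+/* Example inputs (hypothetical): a write of "1024" yields *ret_bytes =
+ * 1024 and *ret_bits = ffs(1024) - 1 = 10; "1000" passes the range
+ * check but fails the hweight16() test since it isn't a power of two. */
+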
+static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+					    char *page)
+{
+	return sprintf(page, "%u\n", reg->hr_block_bytes);
+}
+
+static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	int status;
+	unsigned long block_bytes;
+	unsigned int block_bits;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	status = o2hb_read_block_input(reg, page, count,
+				       &block_bytes, &block_bits);
+	if (status)
+		return status;
+
+	reg->hr_block_bytes = (unsigned int)block_bytes;
+	reg->hr_block_bits = block_bits;
+
+	return count;
+}
+
+static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+					    char *page)
+{
+	return sprintf(page, "%llu\n", reg->hr_start_block);
+}
+
+static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	unsigned long long tmp;
+	char *p = (char *)page;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	tmp = simple_strtoull(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	reg->hr_start_block = tmp;
+
+	return count;
+}
+
+static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
+				       char *page)
+{
+	return sprintf(page, "%d\n", reg->hr_blocks);
+}
+
+static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+					const char *page,
+					size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > O2NM_MAX_NODES || tmp == 0)
+		return -ERANGE;
+
+	reg->hr_blocks = (unsigned int)tmp;
+
+	return count;
+}
+
+static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
+				    char *page)
+{
+	unsigned int ret = 0;
+
+	if (reg->hr_bdev)
+		ret = sprintf(page, "%s\n", reg->hr_dev_name);
+
+	return ret;
+}
+
+static void o2hb_init_region_params(struct o2hb_region *reg)
+{
+	reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+	reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
+
+	mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
+	     reg->hr_start_block, reg->hr_blocks);
+	mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
+	     reg->hr_block_bytes, reg->hr_block_bits);
+	mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
+	mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
+}
+
+static int o2hb_map_slot_data(struct o2hb_region *reg)
+{
+	int i, j;
+	unsigned int last_slot;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct page *page;
+	char *raw;
+	struct o2hb_disk_slot *slot;
+
+	reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
+	if (reg->hr_tmp_block == NULL) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	reg->hr_slots = kcalloc(reg->hr_blocks,
+				sizeof(struct o2hb_disk_slot), GFP_KERNEL);
+	if (reg->hr_slots == NULL) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < reg->hr_blocks; i++) {
+		slot = &reg->hr_slots[i];
+		slot->ds_node_num = i;
+		INIT_LIST_HEAD(&slot->ds_live_item);
+		slot->ds_raw_block = NULL;
+	}
+
+	reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
+	mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
+			   "at %u blocks per page\n",
+	     reg->hr_num_pages, reg->hr_blocks, spp);
+
+	reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
+				    GFP_KERNEL);
+	if (!reg->hr_slot_data) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < reg->hr_num_pages; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+
+		reg->hr_slot_data[i] = page;
+
+		last_slot = i * spp;
+		raw = page_address(page);
+		for (j = 0;
+		     (j < spp) && ((j + last_slot) < reg->hr_blocks);
+		     j++) {
+			BUG_ON((j + last_slot) >= reg->hr_blocks);
+
+			slot = &reg->hr_slots[j + last_slot];
+			slot->ds_raw_block =
+				(struct o2hb_disk_heartbeat_block *) raw;
+
+			raw += reg->hr_block_bytes;
+		}
+	}
+
+	return 0;
+}
+
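+/* Sizing example (hypothetical defaults): 512-byte heartbeat blocks on
+ * 4096-byte pages give spp = 8, so a 255-slot region maps to
+ * (255 + 8 - 1) / 8 = 32 pages, with slot i living at page i / 8,
+ * offset (i % 8) * 512 bytes. */
+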
+/* Read in all the slots available and populate the tracking
+ * structures so that we can start with a baseline idea of what's
+ * there. */
+static int o2hb_populate_slot_data(struct o2hb_region *reg)
+{
+	int ret, i;
+	struct o2hb_disk_slot *slot;
+	struct o2hb_disk_heartbeat_block *hb_block;
+
+	mlog_entry_void();
+
+	ret = o2hb_read_slots(reg, reg->hr_blocks);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* We only want to get an idea of the values initially in each
+	 * slot, so we do no verification - o2hb_check_slot will
+	 * actually determine if each configured slot is valid and
+	 * whether any values have changed. */
+	for (i = 0; i < reg->hr_blocks; i++) {
+		slot = &reg->hr_slots[i];
+		hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
+
+		/* Only fill the values that o2hb_check_slot uses to
+		 * determine changing slots */
+		slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
+		slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+	}
+
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+				     const char *page,
+				     size_t count)
+{
+	long fd;
+	int sectsize;
+	char *p = (char *)page;
+	struct file *filp = NULL;
+	struct inode *inode = NULL;
+	ssize_t ret = -EINVAL;
+
+	if (reg->hr_bdev)
+		goto out;
+
+	/* We can't heartbeat without having had our node number
+	 * configured yet. */
+	if (o2nm_this_node() == O2NM_MAX_NODES)
+		goto out;
+
+	fd = simple_strtol(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		goto out;
+
+	if (fd < 0 || fd >= INT_MAX)
+		goto out;
+
+	filp = fget(fd);
+	if (filp == NULL)
+		goto out;
+
+	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
+	    reg->hr_block_bytes == 0)
+		goto out;
+
+	inode = igrab(filp->f_mapping->host);
+	if (inode == NULL)
+		goto out;
+
+	if (!S_ISBLK(inode->i_mode))
+		goto out;
+
+	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+	if (ret) {
+		reg->hr_bdev = NULL;
+		goto out;
+	}
+	inode = NULL;
+
+	bdevname(reg->hr_bdev, reg->hr_dev_name);
+
+	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	if (sectsize != reg->hr_block_bytes) {
+		mlog(ML_ERROR,
+		     "blocksize %u incorrect for device, expected %d",
+		     reg->hr_block_bytes, sectsize);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	o2hb_init_region_params(reg);
+
+	/* Generation of zero is invalid */
+	do {
+		get_random_bytes(&reg->hr_generation,
+				 sizeof(reg->hr_generation));
+	} while (reg->hr_generation == 0);
+
+	ret = o2hb_map_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = o2hb_populate_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
+
+	/*
+	 * A node is considered live after it has beat LIVE_THRESHOLD
+	 * times.  We're not steady until we've given them a chance
+	 * _after_ our first read.
+	 */
+	atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
+
+	reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
+				   reg->hr_item.ci_name);
+	if (IS_ERR(reg->hr_task)) {
+		ret = PTR_ERR(reg->hr_task);
+		mlog_errno(ret);
+		reg->hr_task = NULL;
+		goto out;
+	}
+
+	ret = wait_event_interruptible(o2hb_steady_queue,
+				atomic_read(&reg->hr_steady_iterations) == 0);
+	if (ret) {
+		kthread_stop(reg->hr_task);
+		reg->hr_task = NULL;
+		goto out;
+	}
+
+	ret = count;
+out:
+	if (filp)
+		fput(filp);
+	if (inode)
+		iput(inode);
+	if (ret < 0) {
+		if (reg->hr_bdev) {
+			blkdev_put(reg->hr_bdev);
+			reg->hr_bdev = NULL;
+		}
+	}
+	return ret;
+}
+
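+/* Hypothetical userspace sketch of the commit protocol above: the
+ * writer passes the number of a file descriptor *it* holds open on the
+ * block device, and the attribute write blocks until the region goes
+ * steady (hr_steady_iterations drains to zero) or fails:
+ *
+ *	int fd = open("/dev/sdb1", O_RDWR);
+ *	char buf[16];
+ *	int len = snprintf(buf, sizeof(buf), "%d", fd);
+ *	write(dev_attr_fd, buf, len);	// fd open on the region's "dev" file
+ */
+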
+struct o2hb_region_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_region *, char *);
+	ssize_t (*store)(struct o2hb_region *, const char *, size_t);
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "block_bytes",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_block_bytes_read,
+	.store	= o2hb_region_block_bytes_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_start_block = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "start_block",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_start_block_read,
+	.store	= o2hb_region_start_block_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_blocks = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "blocks",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_blocks_read,
+	.store	= o2hb_region_blocks_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_dev = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dev",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_region_dev_read,
+	.store	= o2hb_region_dev_write,
+};
+
+static struct configfs_attribute *o2hb_region_attrs[] = {
+	&o2hb_region_attr_block_bytes.attr,
+	&o2hb_region_attr_start_block.attr,
+	&o2hb_region_attr_blocks.attr,
+	&o2hb_region_attr_dev.attr,
+	NULL,
+};
+
+static ssize_t o2hb_region_show(struct config_item *item,
+				struct configfs_attribute *attr,
+				char *page)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+	struct o2hb_region_attribute *o2hb_region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2hb_region_attr->show)
+		ret = o2hb_region_attr->show(reg, page);
+	return ret;
+}
+
+static ssize_t o2hb_region_store(struct config_item *item,
+				 struct configfs_attribute *attr,
+				 const char *page, size_t count)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+	struct o2hb_region_attribute *o2hb_region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (o2hb_region_attr->store)
+		ret = o2hb_region_attr->store(reg, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations o2hb_region_item_ops = {
+	.release		= o2hb_region_release,
+	.show_attribute		= o2hb_region_show,
+	.store_attribute	= o2hb_region_store,
+};
+
+static struct config_item_type o2hb_region_type = {
+	.ct_item_ops	= &o2hb_region_item_ops,
+	.ct_attrs	= o2hb_region_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* heartbeat set */
+
+struct o2hb_heartbeat_group {
+	struct config_group hs_group;
+	/* some stuff? */
+};
+
+static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2hb_heartbeat_group, hs_group)
+		: NULL;
+}
+
+static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
+							  const char *name)
+{
+	struct o2hb_region *reg = NULL;
+	struct config_item *ret = NULL;
+
+	reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
+	if (reg == NULL)
+		goto out; /* ENOMEM */
+
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+	ret = &reg->hr_item;
+
+	spin_lock(&o2hb_live_lock);
+	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
+	spin_unlock(&o2hb_live_lock);
+out:
+	if (ret == NULL)
+		kfree(reg);
+
+	return ret;
+}
+
+static void o2hb_heartbeat_group_drop_item(struct config_group *group,
+					   struct config_item *item)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	/* stop the thread when the user removes the region dir */
+	if (reg->hr_task) {
+		kthread_stop(reg->hr_task);
+		reg->hr_task = NULL;
+	}
+
+	config_item_put(item);
+}
+
+struct o2hb_heartbeat_group_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
+	ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
+};
+
+static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
+	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2hb_heartbeat_group_attr->show)
+		ret = o2hb_heartbeat_group_attr->show(reg, page);
+	return ret;
+}
+
+static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
+					  struct configfs_attribute *attr,
+					  const char *page, size_t count)
+{
+	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
+	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (o2hb_heartbeat_group_attr->store)
+		ret = o2hb_heartbeat_group_attr->store(reg, page, count);
+	return ret;
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
+						     char *page)
+{
+	return sprintf(page, "%u\n", o2hb_dead_threshold);
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
+						    const char *page,
+						    size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	/* this will validate ranges for us. */
+	o2hb_dead_threshold_set((unsigned int) tmp);
+
+	return count;
+}
+
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dead_threshold",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2hb_heartbeat_group_threshold_show,
+	.store	= o2hb_heartbeat_group_threshold_store,
+};
+
+static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
+	&o2hb_heartbeat_group_attr_threshold.attr,
+	NULL,
+};
+
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
+	.show_attribute		= o2hb_heartbeat_group_show,
+	.store_attribute	= o2hb_heartbeat_group_store,
+};
+
+static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
+	.make_item	= o2hb_heartbeat_group_make_item,
+	.drop_item	= o2hb_heartbeat_group_drop_item,
+};
+
+static struct config_item_type o2hb_heartbeat_group_type = {
+	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
+	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
+	.ct_attrs	= o2hb_heartbeat_group_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* this is just here to avoid touching group in heartbeat.h which the
+ * entire damn world #includes */
+struct config_group *o2hb_alloc_hb_set(void)
+{
+	struct o2hb_heartbeat_group *hs = NULL;
+	struct config_group *ret = NULL;
+
+	hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
+	if (hs == NULL)
+		goto out;
+
+	config_group_init_type_name(&hs->hs_group, "heartbeat",
+				    &o2hb_heartbeat_group_type);
+
+	ret = &hs->hs_group;
+out:
+	if (ret == NULL)
+		kfree(hs);
+	return ret;
+}
+
+void o2hb_free_hb_set(struct config_group *group)
+{
+	struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
+	kfree(hs);
+}
+
+/* hb callback registration and issuing */
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
+{
+	if (type == O2HB_NUM_CB)
+		return ERR_PTR(-EINVAL);
+
+	return &o2hb_callbacks[type];
+}
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority)
+{
+	INIT_LIST_HEAD(&hc->hc_item);
+	hc->hc_func = func;
+	hc->hc_data = data;
+	hc->hc_priority = priority;
+	hc->hc_type = type;
+	hc->hc_magic = O2HB_CB_MAGIC;
+}
+EXPORT_SYMBOL_GPL(o2hb_setup_callback);
+
+int o2hb_register_callback(struct o2hb_callback_func *hc)
+{
+	struct o2hb_callback_func *tmp;
+	struct list_head *iter;
+	struct o2hb_callback *hbcall;
+	int ret;
+
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+	BUG_ON(!list_empty(&hc->hc_item));
+
+	hbcall = hbcall_from_type(hc->hc_type);
+	if (IS_ERR(hbcall)) {
+		ret = PTR_ERR(hbcall);
+		goto out;
+	}
+
+	down_write(&o2hb_callback_sem);
+
+	list_for_each(iter, &hbcall->list) {
+		tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
+		if (hc->hc_priority < tmp->hc_priority) {
+			list_add_tail(&hc->hc_item, iter);
+			break;
+		}
+	}
+	if (list_empty(&hc->hc_item))
+		list_add_tail(&hc->hc_item, &hbcall->list);
+
+	up_write(&o2hb_callback_sem);
+	ret = 0;
+out:
+	mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+	     ret, __builtin_return_address(0), hc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2hb_register_callback);
+
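+/* Minimal caller sketch (hypothetical names, not part of this commit):
+ * set a callback up once, then register it to hear about node
+ * transitions:
+ *
+ *	static void my_node_down(struct o2nm_node *node, int node_num,
+ *				 void *data)
+ *	{
+ *		printk("node %d went down\n", node_num);
+ *	}
+ *
+ *	static struct o2hb_callback_func my_hc;
+ *	o2hb_setup_callback(&my_hc, O2HB_NODE_DOWN_CB, my_node_down,
+ *			    NULL, 0);
+ *	if (o2hb_register_callback(&my_hc))
+ *		...
+ */
+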
+int o2hb_unregister_callback(struct o2hb_callback_func *hc)
+{
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+
+	mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+	     __builtin_return_address(0), hc);
+
+	if (list_empty(&hc->hc_item))
+		return 0;
+
+	down_write(&o2hb_callback_sem);
+
+	list_del_init(&hc->hc_item);
+
+	up_write(&o2hb_callback_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
+
+int o2hb_check_node_heartbeating(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+
+int o2hb_check_node_heartbeating_from_callback(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
+
+/* Makes sure our local node is configured with a node number, and is
+ * heartbeating. */
+int o2hb_check_local_node_heartbeating(void)
+{
+	u8 node_num;
+
+	/* if this node was set then we have networking */
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_MAX_NODES) {
+		mlog(ML_HEARTBEAT, "this node has not been configured.\n");
+		return 0;
+	}
+
+	return o2hb_check_node_heartbeating(node_num);
+}
+EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
+
+/*
+ * this is just a hack until we get the plumbing which flips file systems
+ * read only and drops the hb ref instead of killing the node dead.
+ */
+void o2hb_stop_all_regions(void)
+{
+	struct o2hb_region *reg;
+
+	mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
+
+	spin_lock(&o2hb_live_lock);
+
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
+		reg->hr_unclean_stop = 1;
+
+	spin_unlock(&o2hb_live_lock);
+}
+EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);

+ 82 - 0
fs/ocfs2/cluster/heartbeat.h

@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_HEARTBEAT_H
+#define O2CLUSTER_HEARTBEAT_H
+
+#include "ocfs2_heartbeat.h"
+
+#define O2HB_REGION_TIMEOUT_MS		2000
+
+/* number of changes to be seen as live */
+#define O2HB_LIVE_THRESHOLD	   2
+/* number of equal samples to be seen as dead */
+extern unsigned int o2hb_dead_threshold;
+#define O2HB_DEFAULT_DEAD_THRESHOLD	   7
+/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
+#define O2HB_MIN_DEAD_THRESHOLD	  2
+#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
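+
+/* Worked numbers with the defaults above: heartbeats are written every
+ * O2HB_REGION_TIMEOUT_MS = 2000 msec, a peer is declared dead after 7
+ * unchanged samples (roughly 14 seconds), and O2HB_MAX_WRITE_TIMEOUT_MS
+ * comes out to 2000 * (7 - 1) = 12000 msec. */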
+
+#define O2HB_CB_MAGIC		0x51d1e4ec
+
+/* callback stuff */
+enum o2hb_callback_type {
+	O2HB_NODE_DOWN_CB = 0,
+	O2HB_NODE_UP_CB,
+	O2HB_NUM_CB
+};
+
+struct o2nm_node;
+typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
+
+struct o2hb_callback_func {
+	u32			hc_magic;
+	struct list_head	hc_item;
+	o2hb_cb_func		*hc_func;
+	void			*hc_data;
+	int			hc_priority;
+	enum o2hb_callback_type hc_type;
+};
+
+struct config_group *o2hb_alloc_hb_set(void);
+void o2hb_free_hb_set(struct config_group *group);
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority);
+int o2hb_register_callback(struct o2hb_callback_func *hc);
+int o2hb_unregister_callback(struct o2hb_callback_func *hc);
+void o2hb_fill_node_map(unsigned long *map,
+			unsigned bytes);
+void o2hb_init(void);
+int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_from_callback(u8 node_num);
+int o2hb_check_local_node_heartbeating(void);
+void o2hb_stop_all_regions(void);
+
+#endif /* O2CLUSTER_HEARTBEAT_H */

+ 166 - 0
fs/ocfs2/cluster/masklog.c

@@ -0,0 +1,166 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+
+#include "masklog.h"
+
+struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
+EXPORT_SYMBOL_GPL(mlog_and_bits);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+EXPORT_SYMBOL_GPL(mlog_not_bits);
+
+static ssize_t mlog_mask_show(u64 mask, char *buf)
+{
+	char *state;
+
+	if (__mlog_test_u64(mask, mlog_and_bits))
+		state = "allow";
+	else if (__mlog_test_u64(mask, mlog_not_bits))
+		state = "deny";
+	else
+		state = "off";
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", state);
+}
+
+static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
+{
+	if (!strnicmp(buf, "allow", 5)) {
+		__mlog_set_u64(mask, mlog_and_bits);
+		__mlog_clear_u64(mask, mlog_not_bits);
+	} else if (!strnicmp(buf, "deny", 4)) {
+		__mlog_set_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else if (!strnicmp(buf, "off", 3)) {
+		__mlog_clear_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+struct mlog_attribute {
+	struct attribute attr;
+	u64 mask;
+};
+
+#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
+
+#define define_mask(_name) {			\
+	.attr = {				\
+		.name = #_name,			\
+		.mode = S_IRUGO | S_IWUSR,	\
+	},					\
+	.mask = ML_##_name,			\
+}
+
+static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
+	define_mask(ENTRY),
+	define_mask(EXIT),
+	define_mask(TCP),
+	define_mask(MSG),
+	define_mask(SOCKET),
+	define_mask(HEARTBEAT),
+	define_mask(HB_BIO),
+	define_mask(DLMFS),
+	define_mask(DLM),
+	define_mask(DLM_DOMAIN),
+	define_mask(DLM_THREAD),
+	define_mask(DLM_MASTER),
+	define_mask(DLM_RECOVERY),
+	define_mask(AIO),
+	define_mask(JOURNAL),
+	define_mask(DISK_ALLOC),
+	define_mask(SUPER),
+	define_mask(FILE_IO),
+	define_mask(EXTENT_MAP),
+	define_mask(DLM_GLUE),
+	define_mask(BH_IO),
+	define_mask(UPTODATE),
+	define_mask(NAMEI),
+	define_mask(INODE),
+	define_mask(VOTE),
+	define_mask(DCACHE),
+	define_mask(CONN),
+	define_mask(QUORUM),
+	define_mask(EXPORT),
+	define_mask(ERROR),
+	define_mask(NOTICE),
+	define_mask(KTHREAD),
+};
+
+static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
+
+static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
+			 char *buf)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_show(mlog_attr->mask, buf);
+}
+
+static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_store(mlog_attr->mask, buf, count);
+}
+
+static struct sysfs_ops mlog_attr_ops = {
+	.show  = mlog_show,
+	.store = mlog_store,
+};
+
+static struct kobj_type mlog_ktype = {
+	.default_attrs = mlog_attr_ptrs,
+	.sysfs_ops     = &mlog_attr_ops,
+};
+
+static struct kset mlog_kset = {
+	.kobj   = {.name = "logmask", .ktype = &mlog_ktype},
+};
+
+int mlog_sys_init(struct subsystem *o2cb_subsys)
+{
+	int i = 0;
+
+	while (mlog_attrs[i].attr.mode) {
+		mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+		i++;
+	}
+	mlog_attr_ptrs[i] = NULL;
+
+	mlog_kset.subsys = o2cb_subsys;
+	return kset_register(&mlog_kset);
+}
+
+void mlog_sys_shutdown(void)
+{
+	kset_unregister(&mlog_kset);
+}

+ 275 - 0
fs/ocfs2/cluster/masklog.h

@@ -0,0 +1,275 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_MASKLOG_H
+#define O2CLUSTER_MASKLOG_H
+
+/*
+ * For now this is a trivial wrapper around printk() that gives the critical
+ * ability to enable sets of debugging output at run-time.  In the future this
+ * will almost certainly be redirected to relayfs so that it can pay a
+ * substantially lower heisenberg tax.
+ *
+ * Callers associate the message with a bitmask and a global bitmask is
+ * maintained with help from /proc.  If any of the bits match the message is
+ * output.
+ *
+ * We must have efficient bit tests on i386 and it seems gcc still emits crazy
+ * code for the 64bit compare.  It emits very good code for the dual unsigned
+ * long tests, though, completely avoiding tests that can never pass if the
+ * caller gives a constant bitmask that fills one of the longs with all 0s.  So
+ * the desire is to have almost all of the calls decided on by comparing just
+ * one of the longs.  This leads to having infrequently given bits that are
+ * frequently matched in the high bits.
+ *
+ * _ERROR and _NOTICE are used for messages that always go to the console and
+ * have appropriate KERN_ prefixes.  We wrap these in our function instead of
+ * just calling printk() so that this can eventually make its way through
+ * relayfs along with the debugging messages.  Everything else gets KERN_INFO.
+ * The inline tests and macro dance give GCC the opportunity to quite cleverly
+ * only emit the appropriate printk() when the caller passes in a constant
+ * mask, as is almost always the case.
+ *
+ * All this bitmask nonsense is hidden from the /proc interface so that Joel
+ * doesn't have an aneurysm.  Reading the file gives a straightforward
+ * indication of which bits are on or off:
+ * 	ENTRY off
+ * 	EXIT off
+ * 	TCP off
+ * 	MSG off
+ * 	SOCKET off
+ * 	ERROR off
+ * 	NOTICE on
+ *
+ * Writing changes the state of a given bit and requires a strictly formatted
+ * single write() call:
+ *
+ * 	write(fd, "ENTRY on", 8);
+ *
+ * would turn the entry bit on.  "1" is also accepted in place of "on", and
+ * "off" and "0" behave as expected.
+ *
+ * Some trivial shell can flip all the bits on or off:
+ *
+ * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
+ * cat $log_mask | (
+ * 	while read bit status; do
+ * 		# $1 is "on" or "off", say
+ * 		echo "$bit $1" > $log_mask
+ * 	done
+ * )
+ */
+
+/* for task_struct */
+#include <linux/sched.h>
+
+/* bits that are frequently given and infrequently matched in the low word */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
+#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
+#define ML_EXIT		0x0000000000000002ULL /* func call exit */
+#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000008ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm recovery functions */
+#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
+#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
+#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
+#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
+#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
+#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
+#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
+#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
+#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
+#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
+#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
+#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
+#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
+#define ML_CONN		0x0000000004000000ULL /* net connection management */
+#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
+#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+/* bits that are infrequently given and frequently matched in the high word */
+#define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
+#define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
+
+#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
+#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
+#ifndef MLOG_MASK_PREFIX
+#define MLOG_MASK_PREFIX 0
+#endif
+
+#define MLOG_MAX_BITS 64
+
+struct mlog_bits {
+	unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
+};
+
+extern struct mlog_bits mlog_and_bits, mlog_not_bits;
+
+#if BITS_PER_LONG == 32
+
+#define __mlog_test_u64(mask, bits)			\
+	( (u32)(mask & 0xffffffff) & bits.words[0] || 	\
+	  ((u64)(mask) >> 32) & bits.words[1] )
+#define __mlog_set_u64(mask, bits) do {			\
+	bits.words[0] |= (u32)(mask & 0xffffffff);	\
+       	bits.words[1] |= (u64)(mask) >> 32;		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {		\
+	bits.words[0] &= ~((u32)(mask & 0xffffffff));	\
+       	bits.words[1] &= ~((u64)(mask) >> 32);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) {				\
+	{						\
+		[0] = (u32)(mask & 0xffffffff),		\
+		[1] = (u64)(mask) >> 32,		\
+	}						\
+}
+
+#else /* 32bit long above, 64bit long below */
+
+#define __mlog_test_u64(mask, bits)	((mask) & bits.words[0])
+#define __mlog_set_u64(mask, bits) do {		\
+	bits.words[0] |= (mask);		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {	\
+	bits.words[0] &= ~(mask);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) { { (mask) } }
+
+#endif
+
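+/* Illustration: on a 32-bit build a constant mask like ML_ERROR (bit 32)
+ * has an all-zero low word, so the test above constant-folds down to a
+ * single check of bits.words[1] -- the "compare just one of the longs"
+ * property described at the top of this file. */
+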
+/*
+ * smp_processor_id() "helpfully" screams when called from preemptible
+ * regions in current kernels.  sles doesn't have the variants that don't
+ * scream.  just do this instead of trying to guess which we're building
+ * against.. *sigh*.
+ */
+#define __mlog_cpu_guess ({		\
+	unsigned long _cpu = get_cpu();	\
+	put_cpu();			\
+	_cpu;				\
+})
+
+/* In the following two macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define __mlog_printk(level, fmt, args...)				\
+	printk(level "(%u,%lu):%s:%d " fmt, current->pid,		\
+	       __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ ,	\
+	       ##args)
+
+#define mlog(mask, fmt, args...) do {					\
+	u64 __m = MLOG_MASK_PREFIX | (mask);				\
+	if (__mlog_test_u64(__m, mlog_and_bits) &&			\
+	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
+		if (__m & ML_ERROR)					\
+			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
+		else if (__m & ML_NOTICE)				\
+			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
+		else __mlog_printk(KERN_INFO, fmt , ##args);		\
+	}								\
+} while (0)
+
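+/* Usage sketch (hypothetical caller): a .c file picks its mask before
+ * including this header, then logs against it:
+ *
+ *	#define MLOG_MASK_PREFIX ML_HEARTBEAT
+ *	#include "masklog.h"
+ *
+ *	mlog(0, "sampled slot %d\n", i);	(gated by ML_HEARTBEAT)
+ *	mlog(ML_ERROR, "bad checksum\n");	(KERN_ERR, on by default)
+ */
+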
+#define mlog_errno(st) do {						\
+	int _st = (st);							\
+	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
+	    _st != AOP_TRUNCATED_PAGE)					\
+		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
+} while (0)
+
+#define mlog_entry(fmt, args...) do {					\
+	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
+} while (0)
+
+#define mlog_entry_void() do {						\
+	mlog(ML_ENTRY, "ENTRY:\n");					\
+} while (0)
+
+/* We disable this for old compilers since they don't have support for
+ * __builtin_types_compatible_p.
+ */
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
+    !defined(__CHECKER__)
+#define mlog_exit(st) do {						     \
+	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
+		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), signed long))      \
+		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), unsigned int)	     \
+		 || __builtin_types_compatible_p(typeof(st), unsigned short) \
+		 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
+		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), signed int)	     \
+		 || __builtin_types_compatible_p(typeof(st), signed short)   \
+		 || __builtin_types_compatible_p(typeof(st), signed char))   \
+		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		     \
+	else if (__builtin_types_compatible_p(typeof(st), long long))	     \
+		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	     \
+	else								     \
+		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st));    \
+} while (0)
+#else
+#define mlog_exit(st) do {						     \
+	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		     \
+} while (0)
+#endif
+
+#define mlog_exit_ptr(ptr) do {						\
+	mlog(ML_EXIT, "EXIT: %p\n", ptr);				\
+} while (0)
+
+#define mlog_exit_void() do {						\
+	mlog(ML_EXIT, "EXIT\n");					\
+} while (0)
+
+#define mlog_bug_on_msg(cond, fmt, args...) do {			\
+	if (cond) {							\
+		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
+		mlog(ML_ERROR, fmt, ##args);				\
+		BUG();							\
+	}								\
+} while (0)
+
+#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
+#define MLFi64 "lld"
+#define MLFu64 "llu"
+#define MLFx64 "llx"
+#else
+#define MLFi64 "ld"
+#define MLFu64 "lu"
+#define MLFx64 "lx"
+#endif
+
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+int mlog_sys_init(struct subsystem *o2cb_subsys);
+void mlog_sys_shutdown(void);
+
+#endif /* O2CLUSTER_MASKLOG_H */

+ 791 - 0
fs/ocfs2/cluster/nodemanager.c

@@ -0,0 +1,791 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/configfs.h>
+
+#include "endian.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+#include "masklog.h"
+#include "sys.h"
+#include "ver.h"
+
+/* for now we operate under the assumption that there can be only one
+ * cluster active at a time.  Changing this will require trickling
+ * cluster references throughout where nodes are looked up */
+static struct o2nm_cluster *o2nm_single_cluster = NULL;
+
+#define OCFS2_MAX_HB_CTL_PATH 256
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
+
+static ctl_table ocfs2_nm_table[] = {
+	{
+		.ctl_name	= 1,
+		.procname	= "hb_ctl_path",
+		.data		= ocfs2_hb_ctl_path,
+		.maxlen		= OCFS2_MAX_HB_CTL_PATH,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ocfs2_mod_table[] = {
+	{
+		.ctl_name	= KERN_OCFS2_NM,
+		.procname	= "nm",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_nm_table
+	},
+	{ .ctl_name = 0}
+};
+
+static ctl_table ocfs2_kern_table[] = {
+	{
+		.ctl_name	= KERN_OCFS2,
+		.procname	= "ocfs2",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_mod_table
+	},
+	{ .ctl_name = 0}
+};
+
+static ctl_table ocfs2_root_table[] = {
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_kern_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ocfs2_table_header = NULL;
+
+const char *o2nm_get_hb_ctl_path(void)
+{
+	return ocfs2_hb_ctl_path;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
+
+struct o2nm_cluster {
+	struct config_group	cl_group;
+	unsigned		cl_has_local:1;
+	u8			cl_local_node;
+	rwlock_t		cl_nodes_lock;
+	struct o2nm_node  	*cl_nodes[O2NM_MAX_NODES];
+	struct rb_root		cl_node_ip_tree;
+	/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
+	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
+{
+	struct o2nm_node *node = NULL;
+
+	if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
+		goto out;
+
+	read_lock(&o2nm_single_cluster->cl_nodes_lock);
+	node = o2nm_single_cluster->cl_nodes[node_num];
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&o2nm_single_cluster->cl_nodes_lock);
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
+{
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
+
+	if (cluster == NULL)
+		return -EINVAL;
+
+	read_lock(&cluster->cl_nodes_lock);
+	memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+	read_unlock(&cluster->cl_nodes_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
+
+static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
+						  __be32 ip_needle,
+						  struct rb_node ***ret_p,
+						  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2nm_node *node, *ret = NULL;
+
+	while (*p) {
+		parent = *p;
+		node = rb_entry(parent, struct o2nm_node, nd_ip_node);
+
+		if (memcmp(&ip_needle, &node->nd_ipv4_address,
+		           sizeof(ip_needle)) < 0)
+			p = &(*p)->rb_left;
+		else if (memcmp(&ip_needle, &node->nd_ipv4_address,
+			        sizeof(ip_needle)) > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = node;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
+{
+	struct o2nm_node *node = NULL;
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	if (cluster == NULL)
+		goto out;
+
+	read_lock(&cluster->cl_nodes_lock);
+	node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&cluster->cl_nodes_lock);
+
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
+
+void o2nm_node_put(struct o2nm_node *node)
+{
+	config_item_put(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_put);
+
+void o2nm_node_get(struct o2nm_node *node)
+{
+	config_item_get(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_get);
+
+u8 o2nm_this_node(void)
+{
+	u8 node_num = O2NM_MAX_NODES;
+
+	if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
+		node_num = o2nm_single_cluster->cl_local_node;
+
+	return node_num;
+}
+EXPORT_SYMBOL_GPL(o2nm_this_node);
+
+/* node configfs bits */
+
+static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct o2nm_cluster,
+			     cl_group)
+		: NULL;
+}
+
+static struct o2nm_node *to_o2nm_node(struct config_item *item)
+{
+	return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
+}
+
+static void o2nm_node_release(struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	kfree(node);
+}
+
+static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_num);
+}
+
+static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
+{
+	/* through the first node_set .parent
+	 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
+	return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+}
+
+enum {
+	O2NM_NODE_ATTR_NUM = 0,
+	O2NM_NODE_ATTR_PORT,
+	O2NM_NODE_ATTR_ADDRESS,
+	O2NM_NODE_ATTR_LOCAL,
+};
+
+static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+				   size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp >= O2NM_MAX_NODES)
+		return -ERANGE;
+
+	/* once we're in the cl_nodes tree networking can look us up by
+	 * node number and try to use our address and port attributes
+	 * to connect to this node.. make sure that they've been set
+	 * before writing the node attribute? */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	write_lock(&cluster->cl_nodes_lock);
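+	/* reuse 'p' as an in-use flag so -EEXIST can be reported after
+	 * the lock is dropped */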
+	if (cluster->cl_nodes[tmp])
+		p = NULL;
+	else  {
+		cluster->cl_nodes[tmp] = node;
+		node->nd_num = tmp;
+		set_bit(tmp, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (p == NULL)
+		return -EEXIST;
+
+	return count;
+}
+static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+}
+
+static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+					 const char *page, size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp == 0)
+		return -EINVAL;
+	if (tmp >= (u16)-1)
+		return -ERANGE;
+
+	node->nd_ipv4_port = htons(tmp);
+
+	return count;
+}
+
+static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
+}
+
+static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+					    const char *page,
+					    size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	int ret, i;
+	struct rb_node **p, *parent;
+	unsigned int octets[4];
+	__be32 ipv4_addr = 0;
+
+	ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
+		     &octets[1], &octets[0]);
+	if (ret != 4)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(octets); i++) {
+		if (octets[i] > 255)
+			return -ERANGE;
+		be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
+	}
+
+	ret = 0;
+	write_lock(&cluster->cl_nodes_lock);
+	if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
+		ret = -EEXIST;
+	else {
+		rb_link_node(&node->nd_ip_node, parent, p);
+		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (ret)
+		return ret;
+
+	memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
+
+	return count;
+}
+
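+/* Worked example (hypothetical input): writing "192.168.0.1" scans the
+ * octets in reverse (octets[3] = 192 ... octets[0] = 1), so the summed
+ * value is 0xc0a80001 and be32_add_cpu() leaves the address in network
+ * byte order -- the same layout o2nm_node_ip_tree_lookup() memcmp()s
+ * against. */
+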
+static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_local);
+}
+
+static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+				     size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+	ssize_t ret;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	tmp = !!tmp; /* boolean of whether this node wants to be local */
+
+	/* setting local turns on networking rx for now so we require having
+	 * set everything else first */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	/* the only failure case is trying to set a new local node
+	 * when a different one is already set */
+	if (tmp && tmp == cluster->cl_has_local &&
+	    cluster->cl_local_node != node->nd_num)
+		return -EBUSY;
+
+	/* bring up the rx thread if we're setting the new local node. */
+	if (tmp && !cluster->cl_has_local) {
+		ret = o2net_start_listening(node);
+		if (ret)
+			return ret;
+	}
+
+	if (!tmp && cluster->cl_has_local &&
+	    cluster->cl_local_node == node->nd_num) {
+		o2net_stop_listening(node);
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	node->nd_local = tmp;
+	if (node->nd_local) {
+		cluster->cl_has_local = tmp;
+		cluster->cl_local_node = node->nd_num;
+	}
+
+	return count;
+}
+
+struct o2nm_node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2nm_node *, char *);
+	ssize_t (*store)(struct o2nm_node *, const char *, size_t);
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_num = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "num",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_num_read,
+	.store	= o2nm_node_num_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_port",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_port_read,
+	.store	= o2nm_node_ipv4_port_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_address",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_address_read,
+	.store	= o2nm_node_ipv4_address_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_local = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "local",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_local_read,
+	.store	= o2nm_node_local_write,
+};
+
+static struct configfs_attribute *o2nm_node_attrs[] = {
+	[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
+	[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
+	[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
+	[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+	NULL,
+};
+
+static int o2nm_attr_index(struct configfs_attribute *attr)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
+		if (attr == o2nm_node_attrs[i])
+			return i;
+	}
+	BUG();
+	return 0;
+}
+
+static ssize_t o2nm_node_show(struct config_item *item,
+			      struct configfs_attribute *attr,
+			      char *page)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	ssize_t ret = 0;
+
+	if (o2nm_node_attr->show)
+		ret = o2nm_node_attr->show(node, page);
+	return ret;
+}
+
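+/* node attributes are effectively write-once: a successful store sets
+ * the attribute's bit in nd_set_attributes and any later write returns
+ * -EBUSY, so reconfiguring a node means dropping and re-creating it */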
+static ssize_t o2nm_node_store(struct config_item *item,
+			       struct configfs_attribute *attr,
+			       const char *page, size_t count)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	ssize_t ret;
+	int attr_index = o2nm_attr_index(attr);
+
+	if (o2nm_node_attr->store == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (test_bit(attr_index, &node->nd_set_attributes))
+		return -EBUSY;
+
+	ret = o2nm_node_attr->store(node, page, count);
+	if (ret < count)
+		goto out;
+
+	set_bit(attr_index, &node->nd_set_attributes);
+out:
+	return ret;
+}
+
+static struct configfs_item_operations o2nm_node_item_ops = {
+	.release		= o2nm_node_release,
+	.show_attribute		= o2nm_node_show,
+	.store_attribute	= o2nm_node_store,
+};
+
+static struct config_item_type o2nm_node_type = {
+	.ct_item_ops	= &o2nm_node_item_ops,
+	.ct_attrs	= o2nm_node_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
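+/*
+ * A sketch of how userspace drives this interface (the cluster and node
+ * names are examples; configfs is assumed mounted at /config as in
+ * Documentation/filesystems/configfs):
+ *
+ *	mkdir /config/cluster/mycluster
+ *	mkdir /config/cluster/mycluster/node/node7
+ *	echo 7           > /config/cluster/mycluster/node/node7/num
+ *	echo 7777        > /config/cluster/mycluster/node/node7/ipv4_port
+ *	echo 192.168.0.7 > /config/cluster/mycluster/node/node7/ipv4_address
+ *	echo 1           > /config/cluster/mycluster/node/node7/local
+ *
+ * "local" is written last, and only on the node itself: it brings up
+ * the network rx side, so num, ipv4_port and ipv4_address must already
+ * be set (see o2nm_node_local_write()).
+ */
+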
+/* node set */
+
+struct o2nm_node_group {
+	struct config_group ns_group;
+	/* some stuff? */
+};
+
+#if 0
+static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2nm_node_group, ns_group)
+		: NULL;
+}
+#endif
+
+static struct config_item *o2nm_node_group_make_item(struct config_group *group,
+						     const char *name)
+{
+	struct o2nm_node *node = NULL;
+	struct config_item *ret = NULL;
+
+	if (strlen(name) > O2NM_MAX_NAME_LEN)
+		goto out; /* ENAMETOOLONG */
+
+	node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
+	if (node == NULL)
+		goto out; /* ENOMEM */
+
+	strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
+	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
+	spin_lock_init(&node->nd_lock);
+
+	ret = &node->nd_item;
+
+out:
+	if (ret == NULL)
+		kfree(node);
+
+	return ret;
+}
+
+static void o2nm_node_group_drop_item(struct config_group *group,
+				      struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
+
+	o2net_disconnect_node(node);
+
+	if (cluster->cl_has_local &&
+	    (cluster->cl_local_node == node->nd_num)) {
+		cluster->cl_has_local = 0;
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+		o2net_stop_listening(node);
+	}
+
+	/* XXX call into net to stop this node from trading messages */
+
+	write_lock(&cluster->cl_nodes_lock);
+
+	/* XXX sloppy */
+	if (node->nd_ipv4_address)
+		rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+
+	/* nd_num might be 0 if the node number hasn't been set.. */
+	if (cluster->cl_nodes[node->nd_num] == node) {
+		cluster->cl_nodes[node->nd_num] = NULL;
+		clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_node_group_group_ops = {
+	.make_item	= o2nm_node_group_make_item,
+	.drop_item	= o2nm_node_group_drop_item,
+};
+
+static struct config_item_type o2nm_node_group_type = {
+	.ct_group_ops	= &o2nm_node_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster */
+
+static void o2nm_cluster_release(struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	kfree(cluster->cl_group.default_groups);
+	kfree(cluster);
+}
+
+static struct configfs_item_operations o2nm_cluster_item_ops = {
+	.release	= o2nm_cluster_release,
+};
+
+static struct config_item_type o2nm_cluster_type = {
+	.ct_item_ops	= &o2nm_cluster_item_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster set */
+
+struct o2nm_cluster_group {
+	struct configfs_subsystem cs_subsys;
+	/* some stuff? */
+};
+
+#if 0
+static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
+{
+	return group ?
+		container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
+	       : NULL;
+}
+#endif
+
+static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
+							  const char *name)
+{
+	struct o2nm_cluster *cluster = NULL;
+	struct o2nm_node_group *ns = NULL;
+	struct config_group *o2hb_group = NULL, *ret = NULL;
+	void *defs = NULL;
+
+	/* this runs under the parent dir's i_sem; there can be only
+	 * one caller in here at a time */
+	if (o2nm_single_cluster)
+		goto out; /* ENOSPC */
+
+	cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
+	ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
+	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	o2hb_group = o2hb_alloc_hb_set();
+	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+		goto out;
+
+	config_group_init_type_name(&cluster->cl_group, name,
+				    &o2nm_cluster_type);
+	config_group_init_type_name(&ns->ns_group, "node",
+				    &o2nm_node_group_type);
+
+	cluster->cl_group.default_groups = defs;
+	cluster->cl_group.default_groups[0] = &ns->ns_group;
+	cluster->cl_group.default_groups[1] = o2hb_group;
+	cluster->cl_group.default_groups[2] = NULL;
+	rwlock_init(&cluster->cl_nodes_lock);
+	cluster->cl_node_ip_tree = RB_ROOT;
+
+	ret = &cluster->cl_group;
+	o2nm_single_cluster = cluster;
+
+out:
+	if (ret == NULL) {
+		kfree(cluster);
+		kfree(ns);
+		o2hb_free_hb_set(o2hb_group);
+		kfree(defs);
+	}
+
+	return ret;
+}
+
+static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+	int i;
+	struct config_item *killme;
+
+	BUG_ON(o2nm_single_cluster != cluster);
+	o2nm_single_cluster = NULL;
+
+	for (i = 0; cluster->cl_group.default_groups[i]; i++) {
+		killme = &cluster->cl_group.default_groups[i]->cg_item;
+		cluster->cl_group.default_groups[i] = NULL;
+		config_item_put(killme);
+	}
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_cluster_group_group_ops = {
+	.make_group	= o2nm_cluster_group_make_group,
+	.drop_item	= o2nm_cluster_group_drop_item,
+};
+
+static struct config_item_type o2nm_cluster_group_type = {
+	.ct_group_ops	= &o2nm_cluster_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct o2nm_cluster_group o2nm_cluster_group = {
+	.cs_subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "cluster",
+				.ci_type = &o2nm_cluster_group_type,
+			},
+		},
+	},
+};
+
+static void __exit exit_o2nm(void)
+{
+	if (ocfs2_table_header)
+		unregister_sysctl_table(ocfs2_table_header);
+
+	/* XXX sync with hb callbacks and shut down hb? */
+	o2net_unregister_hb_callbacks();
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+	o2cb_sys_shutdown();
+
+	o2net_exit();
+}
+
+static int __init init_o2nm(void)
+{
+	int ret = -1;
+
+	cluster_print_version();
+
+	o2hb_init();
+	o2net_init();
+
+	ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
+	if (!ocfs2_table_header) {
+		printk(KERN_ERR "nodemanager: unable to register sysctl\n");
+		ret = -ENOMEM; /* or something. */
+		goto out;
+	}
+
+	ret = o2net_register_hb_callbacks();
+	if (ret)
+		goto out_sysctl;
+
+	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
+	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
+	if (ret) {
+		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
+		goto out_callbacks;
+	}
+
+	ret = o2cb_sys_init();
+	if (!ret)
+		goto out;
+
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+out_callbacks:
+	o2net_unregister_hb_callbacks();
+out_sysctl:
+	unregister_sysctl_table(ocfs2_table_header);
+out:
+	return ret;
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_o2nm)
+module_exit(exit_o2nm)

+ 64 - 0
fs/ocfs2/cluster/nodemanager.h

@@ -0,0 +1,64 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_NODEMANAGER_H
+#define O2CLUSTER_NODEMANAGER_H
+
+#include "ocfs2_nodemanager.h"
+
+/* This totally doesn't belong here. */
+#include <linux/configfs.h>
+#include <linux/rbtree.h>
+
+#define KERN_OCFS2		988
+#define KERN_OCFS2_NM		1
+
+const char *o2nm_get_hb_ctl_path(void);
+
+struct o2nm_node {
+	spinlock_t		nd_lock;
+	struct config_item	nd_item;
+	char			nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
+	__u8			nd_num;
+	/* only one address per node, as attributes, for now. */
+	__be32			nd_ipv4_address;
+	__be16			nd_ipv4_port;
+	struct rb_node		nd_ip_node;
+	/* there can be only one local node for now */
+	int			nd_local;
+
+	unsigned long		nd_set_attributes;
+};
+
+u8 o2nm_this_node(void);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
+void o2nm_node_get(struct o2nm_node *node);
+void o2nm_node_put(struct o2nm_node *node);
+
+#endif /* O2CLUSTER_NODEMANAGER_H */

+ 37 - 0
fs/ocfs2/cluster/ocfs2_heartbeat.h

@@ -0,0 +1,37 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_heartbeat.h
+ *
+ * On-disk structures for ocfs2_heartbeat
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _OCFS2_HEARTBEAT_H
+#define _OCFS2_HEARTBEAT_H
+
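+/* Each heartbeating node periodically writes a block like this into its
+ * own slot of a heartbeat region.  Roughly: hb_seq changes with every
+ * write, hb_node is the writer's node number, hb_cksum covers the
+ * block, and hb_generation identifies this incarnation of the node so
+ * a peer that died and came back can be told apart. */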
+struct o2hb_disk_heartbeat_block {
+	__le64 hb_seq;
+	__u8  hb_node;
+	__u8  hb_pad1[3];
+	__le32 hb_cksum;
+	__le64 hb_generation;
+};
+
+#endif /* _OCFS2_HEARTBEAT_H */

+ 39 - 0
fs/ocfs2/cluster/ocfs2_nodemanager.h

@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_nodemanager.h
+ *
+ * Header describing the interface between userspace and the kernel
+ * for the ocfs2_nodemanager module.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef _OCFS2_NODEMANAGER_H
+#define _OCFS2_NODEMANAGER_H
+
+#define O2NM_API_VERSION	5
+
+#define O2NM_MAX_NODES		255
+#define O2NM_INVALID_NODE_NUM	255
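+/* valid node numbers are 0 through O2NM_MAX_NODES - 1; 255 is reserved
+ * as the "no such node" marker */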
+
+/* host name, group name, cluster name all 64 bytes */
+#define O2NM_MAX_NAME_LEN        64    /* __NEW_UTS_LEN */
+
+#endif /* _OCFS2_NODEMANAGER_H */

+ 315 - 0
fs/ocfs2/cluster/quorum.c

@@ -0,0 +1,315 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* This quorum hack is only here until we transition to some more rational
+ * approach that is driven from userspace.  Honest.  No foolin'.
+ *
+ * Imagine two nodes lose network connectivity to each other but they're still
+ * up and operating in every other way.  Presumably a network timeout indicates
+ * that a node is broken and should be recovered.  They can't both recover each
+ * other and both carry on without serialising their access to the file system.
+ * They need to decide who is authoritative.  Now extend that problem to
+ * arbitrary groups of nodes losing connectivity between each other.
+ *
+ * So we declare that a node which has given up on connecting to a majority
+ * of nodes who are still heartbeating will fence itself.
+ *
+ * There are huge opportunities for races here.  After we give up on a node's
+ * connection we need to wait long enough to give heartbeat an opportunity
+ * to declare the node as truly dead.  We also need to be careful with the
+ * race between when we see a node start heartbeating and when we connect
+ * to it.
+ *
+ * So nodes that are in this transition put a hold on the quorum decision
+ * with a counter.  As they fall out of this transition they drop the count
+ * and if they're the last, they fire off the decision.
+ */
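+
+/*
+ * To make the arithmetic in o2quo_make_decision() below concrete: with
+ * five nodes heartbeating, quorum is (5 + 1) / 2 = 3 and a node fences
+ * itself once qs_connected drops below 3.  With four nodes, quorum is
+ * 4 / 2 = 2; a node fences below 2, and a node holding exactly 2 (a
+ * possible half-and-half split) also fences unless it can reach the
+ * lowest numbered heartbeating node, so at most one half survives.
+ */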
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_QUORUM
+#include "masklog.h"
+#include "quorum.h"
+
+static struct o2quo_state {
+	spinlock_t		qs_lock;
+	struct work_struct	qs_work;
+	int			qs_pending;
+	int			qs_heartbeating;
+	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_connected;
+	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_holds;
+	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+} o2quo_state;
+
+/* this is horribly heavy-handed.  It should instead flip the file
+ * system RO and call some userspace script. */
+static void o2quo_fence_self(void)
+{
+	/* panic spins with interrupts enabled.  with preempt
+	 * threads can still schedule, etc, etc */
+	o2hb_stop_all_regions();
+	panic("ocfs2 is very sorry to be fencing this system by panicking\n");
+}
+
+/* Indicate that a timeout occurred on a heartbeat region write. The
+ * other nodes in the cluster may consider us dead at that time so we
+ * want to "fence" ourselves so that we don't scribble on the disk
+ * after they think they've recovered us. This can't solve all
+ * problems related to writeout after recovery but this hack can at
+ * least close some of those gaps. When we have real fencing, this can
+ * go away as our node would be fenced externally before other nodes
+ * begin recovery. */
+void o2quo_disk_timeout(void)
+{
+	o2quo_fence_self();
+}
+
+static void o2quo_make_decision(void *arg)
+{
+	int quorum;
+	int lowest_hb, lowest_reachable = 0, fence = 0;
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
+	if (lowest_hb != O2NM_MAX_NODES)
+		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
+
+	mlog(0, "heartbeating: %d, connected: %d, "
+	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
+	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
+
+	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
+	    qs->qs_heartbeating == 1)
+		goto out;
+
+	if (qs->qs_heartbeating & 1) {
+		/* the odd numbered cluster case is straightforward --
+		 * if we can't talk to the majority we're hosed */
+		quorum = (qs->qs_heartbeating + 1)/2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+	} else {
+		/* the even numbered cluster adds the possibility of each half
+		 * of the cluster being able to talk amongst themselves.. in
+		 * that case we're hosed if we can't talk to the group that has
+		 * the lowest numbered node */
+		quorum = qs->qs_heartbeating / 2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+		else if ((qs->qs_connected == quorum) &&
+			 !lowest_reachable) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "connected to a half-quorum of %u out of %u "
+			     "nodes which doesn't include the lowest active "
+			     "node %u\n", quorum, qs->qs_heartbeating,
+			     lowest_hb);
+			fence = 1;
+		}
+	}
+
+out:
+	spin_unlock(&qs->qs_lock);
+	if (fence)
+		o2quo_fence_self();
+}
+
+static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
+		qs->qs_holds++;
+		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
+			        "node %u\n", node);
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
+	}
+}
+
+static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
+		if (--qs->qs_holds == 0) {
+			if (qs->qs_pending) {
+				qs->qs_pending = 0;
+				schedule_work(&qs->qs_work);
+			}
+		}
+		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
+				node, qs->qs_holds);
+	}
+}
+
+/* as a node comes up we delay the quorum decision until we know the fate of
+ * the connection.  the hold will be dropped in conn_up or hb_down.  it might be
+ * perpetuated by conn_err until hb_down.  if we already have a conn, we might
+ * be dropping a hold that conn_up got. */
+void o2quo_hb_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating++;
+	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	set_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	if (!test_bit(node, qs->qs_conn_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* hb going down releases any holds we might have had due to this node from
+ * conn_up, conn_err, or hb_up */
+void o2quo_hb_down(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating--;
+	mlog_bug_on_msg(qs->qs_heartbeating < 0,
+			"node %u, %d heartbeating\n",
+			node, qs->qs_heartbeating);
+	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	clear_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* this tells us that we've decided that the node is still heartbeating
+ * even though we've lost its conn.  it must only be called after conn_err
+ * and indicates that we must now make a quorum decision in the future,
+ * though we might be doing so after waiting for holds to drain.  Here
+ * we'll be dropping the hold from conn_err. */
+void o2quo_hb_still_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	mlog(0, "node %u\n", node);
+
+	qs->qs_pending = 1;
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* This is analogous to hb_up.  as a node's connection comes up we delay the
+ * quorum decision until we see it heartbeating.  the hold will be dropped in
+ * hb_up or hb_down.  it might be perpetuated by conn_err until hb_down.  if
+ * it's already heartbeating we might be dropping a hold that conn_up got.
+ */
+void o2quo_conn_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_connected++;
+	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
+	set_bit(node, qs->qs_conn_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (!test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* we've decided that we won't ever be connecting to the node again.  if it's
+ * still heartbeating we grab a hold that will delay decisions until either the
+ * node stops heartbeating from hb_down or the caller decides that the node is
+ * still up and calls still_up */
+void o2quo_conn_err(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	if (test_bit(node, qs->qs_conn_bm)) {
+		qs->qs_connected--;
+		mlog_bug_on_msg(qs->qs_connected < 0,
+				"node %u, connected %d\n",
+				node, qs->qs_connected);
+
+		clear_bit(node, qs->qs_conn_bm);
+	}
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+void o2quo_init(void)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock_init(&qs->qs_lock);
+	INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+}
+
+void o2quo_exit(void)
+{
+	flush_scheduled_work();
+}

+ 36 - 0
fs/ocfs2/cluster/quorum.h

@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_QUORUM_H
+#define O2CLUSTER_QUORUM_H
+
+void o2quo_init(void);
+void o2quo_exit(void);
+
+void o2quo_hb_up(u8 node);
+void o2quo_hb_down(u8 node);
+void o2quo_hb_still_up(u8 node);
+void o2quo_conn_up(u8 node);
+void o2quo_conn_err(u8 node);
+void o2quo_disk_timeout(void);
+
+#endif /* O2CLUSTER_QUORUM_H */

+ 124 - 0
fs/ocfs2/cluster/sys.c

@@ -0,0 +1,124 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.c
+ *
+ * OCFS2 cluster sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "ocfs2_nodemanager.h"
+#include "masklog.h"
+#include "sys.h"
+
+struct o2cb_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(char *buf);
+	ssize_t (*store)(const char *buf, size_t count);
+};
+
+#define O2CB_ATTR(_name, _mode, _show, _store)	\
+struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
+#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
+
+static ssize_t o2cb_interface_revision_show(char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
+}
+
+static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
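+
+/* Once the subsystem below is registered this appears as
+ * /sys/o2cb/interface_revision; userspace can sanity-check it against
+ * the O2NM_API_VERSION it was built for, e.g. (illustrative):
+ *
+ *	$ cat /sys/o2cb/interface_revision
+ *	5
+ */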
+
+static struct attribute *o2cb_attrs[] = {
+	&o2cb_attr_interface_revision.attr,
+	NULL,
+};
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+	   const char * buffer, size_t count);
+static struct sysfs_ops o2cb_sysfs_ops = {
+	.show	= o2cb_show,
+	.store	= o2cb_store,
+};
+
+static struct kobj_type o2cb_subsys_type = {
+	.default_attrs	= o2cb_attrs,
+	.sysfs_ops	= &o2cb_sysfs_ops,
+};
+
+/* gives us o2cb_subsys */
+static decl_subsys(o2cb, NULL, NULL);
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
+{
+	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+	struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+	BUG_ON(sbs != &o2cb_subsys);
+
+	if (o2cb_attr->show)
+		return o2cb_attr->show(buffer);
+	return -EIO;
+}
+
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+	     const char * buffer, size_t count)
+{
+	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+	struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+	BUG_ON(sbs != &o2cb_subsys);
+
+	if (o2cb_attr->store)
+		return o2cb_attr->store(buffer, count);
+	return -EIO;
+}
+
+void o2cb_sys_shutdown(void)
+{
+	mlog_sys_shutdown();
+	subsystem_unregister(&o2cb_subsys);
+}
+
+int o2cb_sys_init(void)
+{
+	int ret;
+
+	o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
+	ret = subsystem_register(&o2cb_subsys);
+	if (ret)
+		return ret;
+
+	ret = mlog_sys_init(&o2cb_subsys);
+	if (ret)
+		subsystem_unregister(&o2cb_subsys);
+	return ret;
+}

+ 33 - 0
fs/ocfs2/cluster/sys.h

@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.h
+ *
+ * Function prototypes for o2cb sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_SYS_H
+#define O2CLUSTER_SYS_H
+
+void o2cb_sys_shutdown(void);
+int o2cb_sys_init(void);
+
+#endif /* O2CLUSTER_SYS_H */

+ 1829 - 0
fs/ocfs2/cluster/tcp.c

@@ -0,0 +1,1829 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * ----
+ *
+ * Callers for this were originally written against a very simple synchronous
+ * API.  This implementation reflects those simple callers.  Some day I'm sure
+ * we'll need to move to a more robust posting/callback mechanism.
+ *
+ * Transmit calls pass in kernel virtual addresses and block copying this into
+ * the socket's tx buffers via a usual blocking sendmsg.  They'll block waiting
+ * for a failed socket to timeout.  TX callers can also pass in a pointer to an
+ * 'int' which gets filled with an errno off the wire in response to the
+ * message they send.
+ *
+ * Handlers for unsolicited messages are registered.  Each socket has a page
+ * that incoming data is copied into.  First the header, then the data.
+ * Handlers are called from only one thread with a reference to this per-socket
+ * page.  This page is destroyed after the handler call, so it can't be
+ * referenced beyond the call.  Handlers may block but are discouraged from
+ * doing so.
+ *
+ * Any framing errors (bad magic, large payload lengths) close a connection.
+ *
+ * Our sock_container holds the state we associate with a socket.  Its current
+ * framing state is held there as well as the refcounting we do around when it
+ * is safe to tear down the socket.  The socket is only finally torn down from
+ * the container when the container loses all of its references -- so as long
+ * as you hold a ref on the container you can trust that the socket is valid
+ * for use with kernel socket APIs.
+ *
+ * Connections are initiated between a pair of nodes when the node with the
+ * higher node number gets a heartbeat callback which indicates that the lower
+ * numbered node has started heartbeating.  The lower numbered node is passive
+ * and only accepts the connection if the higher numbered node is heartbeating.
+ */
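+
+/*
+ * A minimal sketch of a caller of this API (the msg type, key, handler
+ * and buffer names are invented for illustration; the entry points are
+ * the ones implemented below):
+ *
+ *	static int my_handler(struct o2net_msg *msg, u32 len, void *data)
+ *	{
+ *		return 0;	(handler status handed back to the sender)
+ *	}
+ *
+ *	static LIST_HEAD(my_unreg_list);
+ *	int status;
+ *
+ *	o2net_register_handler(MY_MSG_TYPE, MY_KEY, MY_MAX_LEN,
+ *			       my_handler, NULL, &my_unreg_list);
+ *	o2net_send_message(MY_MSG_TYPE, MY_KEY, buf, buflen,
+ *			   target_node, &status);
+ *	o2net_unregister_handler_list(&my_unreg_list);
+ */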
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <net/tcp.h>
+
+#include <asm/uaccess.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+#include "quorum.h"
+
+#include "tcp_internal.h"
+
+/* 
+ * The linux network stack isn't sparse endian clean.. It has macros like
+ * ntohs() which perform the endian checks and structs like sockaddr_in
+ * which aren't annotated.  So __force is found here to get the build
+ * clean.  When they emerge from the dark ages and annotate the code
+ * we can remove these.
+ */
+
+#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
+			  NIPQUAD(sc->sc_node->nd_ipv4_address),	\
+			  ntohs(sc->sc_node->nd_ipv4_port)
+
+/*
+ * In the following two log macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define msglog(hdr, fmt, args...) do {					\
+	typeof(hdr) __hdr = (hdr);					\
+	mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d "	\
+	     "key %08x num %u] " fmt,					\
+	     be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), 	\
+	     be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status),	\
+	     be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key),	\
+	     be32_to_cpu(__hdr->msg_num) ,  ##args);			\
+} while (0)
+
+#define sclog(sc, fmt, args...) do {					\
+	typeof(sc) __sc = (sc);						\
+	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
+	     "pg_off %zu] " fmt, __sc,					\
+	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
+	    ##args);							\
+} while (0)
+
+static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static struct rb_root o2net_handler_tree = RB_ROOT;
+
+static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
+
+/* XXX someday we'll need better accounting */
+static struct socket *o2net_listen_sock = NULL;
+
+/*
+ * listen work is only queued by the listening socket callbacks on the
+ * o2net_wq.  teardown detaches the callbacks before destroying the workqueue.
+ * quorum work is queued as sock containers are shutdown.. stop_listening
+ * tears down all the node's sock containers, preventing future shutdowns
+ * and queued quorum work, before canceling delayed quorum work and
+ * destroying the work queue.
+ */
+static struct workqueue_struct *o2net_wq;
+static struct work_struct o2net_listen_work;
+
+static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
+#define O2NET_HB_PRI 0x1
+
+static struct o2net_handshake *o2net_hand;
+static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
+
+static int o2net_sys_err_translations[O2NET_ERR_MAX] =
+		{[O2NET_ERR_NONE]	= 0,
+		 [O2NET_ERR_NO_HNDLR]	= -ENOPROTOOPT,
+		 [O2NET_ERR_OVERFLOW]	= -EOVERFLOW,
+		 [O2NET_ERR_DIED]	= -EHOSTDOWN,};
+
+/* can't quite avoid *all* internal declarations :/ */
+static void o2net_sc_connect_completed(void *arg);
+static void o2net_rx_until_empty(void *arg);
+static void o2net_shutdown_sc(void *arg);
+static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_sc_send_keep_req(void *arg);
+static void o2net_idle_timer(unsigned long data);
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+
+static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
+{
+	int trans;
+	BUG_ON(err >= O2NET_ERR_MAX);
+	trans = o2net_sys_err_translations[err];
+
+	/* Just in case we mess up the translation table above */
+	BUG_ON(err != O2NET_ERR_NONE && trans == 0);
+	return trans;
+}
+
+static struct o2net_node * o2net_nn_from_num(u8 node_num)
+{
+	BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
+	return &o2net_nodes[node_num];
+}
+
+static u8 o2net_num_from_nn(struct o2net_node *nn)
+{
+	BUG_ON(nn == NULL);
+	return nn - o2net_nodes;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
+{
+	int ret = 0;
+
+	do {
+		if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
+			ret = -EAGAIN;
+			break;
+		}
+		spin_lock(&nn->nn_lock);
+		ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
+		if (ret == 0)
+			list_add_tail(&nsw->ns_node_item,
+				      &nn->nn_status_list);
+		spin_unlock(&nn->nn_lock);
+	} while (ret == -EAGAIN);
+
+	if (ret == 0)  {
+		init_waitqueue_head(&nsw->ns_wq);
+		nsw->ns_sys_status = O2NET_ERR_NONE;
+		nsw->ns_status = 0;
+	}
+
+	return ret;
+}
+
+static void o2net_complete_nsw_locked(struct o2net_node *nn,
+				      struct o2net_status_wait *nsw,
+				      enum o2net_system_error sys_status,
+				      s32 status)
+{
+	assert_spin_locked(&nn->nn_lock);
+
+	if (!list_empty(&nsw->ns_node_item)) {
+		list_del_init(&nsw->ns_node_item);
+		nsw->ns_sys_status = sys_status;
+		nsw->ns_status = status;
+		idr_remove(&nn->nn_status_idr, nsw->ns_id);
+		wake_up(&nsw->ns_wq);
+	}
+}
+
+static void o2net_complete_nsw(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw,
+			       u64 id, enum o2net_system_error sys_status,
+			       s32 status)
+{
+	spin_lock(&nn->nn_lock);
+	if (nsw == NULL) {
+		if (id > INT_MAX)
+			goto out;
+
+		nsw = idr_find(&nn->nn_status_idr, id);
+		if (nsw == NULL)
+			goto out;
+	}
+
+	o2net_complete_nsw_locked(nn, nsw, sys_status, status);
+
+out:
+	spin_unlock(&nn->nn_lock);
+	return;
+}
+
+static void o2net_complete_nodes_nsw(struct o2net_node *nn)
+{
+	struct list_head *iter, *tmp;
+	unsigned int num_kills = 0;
+	struct o2net_status_wait *nsw;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	list_for_each_safe(iter, tmp, &nn->nn_status_list) {
+		nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+		o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
+		num_kills++;
+	}
+
+	mlog(0, "completed %d messages for node %u\n", num_kills,
+	     o2net_num_from_nn(nn));
+}
+
+static int o2net_nsw_completed(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw)
+{
+	int completed;
+	spin_lock(&nn->nn_lock);
+	completed = list_empty(&nsw->ns_node_item);
+	spin_unlock(&nn->nn_lock);
+	return completed;
+}
+
+/* ------------------------------------------------------------ */
+
+static void sc_kref_release(struct kref *kref)
+{
+	struct o2net_sock_container *sc = container_of(kref,
+					struct o2net_sock_container, sc_kref);
+	sclog(sc, "releasing\n");
+
+	if (sc->sc_sock) {
+		sock_release(sc->sc_sock);
+		sc->sc_sock = NULL;
+	}
+
+	o2nm_node_put(sc->sc_node);
+	sc->sc_node = NULL;
+
+	kfree(sc);
+}
+
+static void sc_put(struct o2net_sock_container *sc)
+{
+	sclog(sc, "put\n");
+	kref_put(&sc->sc_kref, sc_kref_release);
+}
+static void sc_get(struct o2net_sock_container *sc)
+{
+	sclog(sc, "get\n");
+	kref_get(&sc->sc_kref);
+}
+static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
+{
+	struct o2net_sock_container *sc, *ret = NULL;
+	struct page *page = NULL;
+
+	page = alloc_page(GFP_NOFS);
+	sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
+	if (sc == NULL || page == NULL)
+		goto out;
+
+	kref_init(&sc->sc_kref);
+	o2nm_node_get(node);
+	sc->sc_node = node;
+
+	INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
+	INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
+	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
+	INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
+
+	init_timer(&sc->sc_idle_timeout);
+	sc->sc_idle_timeout.function = o2net_idle_timer;
+	sc->sc_idle_timeout.data = (unsigned long)sc;
+
+	sclog(sc, "alloced\n");
+
+	ret = sc;
+	sc->sc_page = page;
+	sc = NULL;
+	page = NULL;
+
+out:
+	if (page)
+		__free_page(page);
+	kfree(sc);
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
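+/* work queued against an sc pins it: we take a ref before queueing and
+ * drop it again if the work was already pending; the work functions
+ * drop their ref (see e.g. o2net_shutdown_sc()) when they complete */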
+static void o2net_sc_queue_work(struct o2net_sock_container *sc,
+				struct work_struct *work)
+{
+	sc_get(sc);
+	if (!queue_work(o2net_wq, work))
+		sc_put(sc);
+}
+static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
+					struct work_struct *work,
+					int delay)
+{
+	sc_get(sc);
+	if (!queue_delayed_work(o2net_wq, work, delay))
+		sc_put(sc);
+}
+static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
+					 struct work_struct *work)
+{
+	if (cancel_delayed_work(work))
+		sc_put(sc);
+}
+
+static void o2net_set_nn_state(struct o2net_node *nn,
+			       struct o2net_sock_container *sc,
+			       unsigned valid, int err)
+{
+	int was_valid = nn->nn_sc_valid;
+	int was_err = nn->nn_persistent_error;
+	struct o2net_sock_container *old_sc = nn->nn_sc;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	/* the node num comparison and single connect/accept path should stop
+	 * a non-null sc from being overwritten with another */
+	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
+	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
+	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
+
+	/* we won't reconnect after our valid conn goes away for
+	 * this hb iteration.. here so it shows up in the logs */
+	if (was_valid && !valid && err == 0)
+		err = -ENOTCONN;
+
+	mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
+	     o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
+	     nn->nn_persistent_error, err);
+
+	nn->nn_sc = sc;
+	nn->nn_sc_valid = valid ? 1 : 0;
+	nn->nn_persistent_error = err;
+
+	/* mirrors o2net_tx_can_proceed() */
+	if (nn->nn_persistent_error || nn->nn_sc_valid)
+		wake_up(&nn->nn_sc_wq);
+
+	if (!was_err && nn->nn_persistent_error) {
+		o2quo_conn_err(o2net_num_from_nn(nn));
+		queue_delayed_work(o2net_wq, &nn->nn_still_up,
+				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+	}
+
+	if (was_valid && !valid) {
+		mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
+		     SC_NODEF_ARGS(old_sc));
+		o2net_complete_nodes_nsw(nn);
+	}
+
+	if (!was_valid && valid) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		/* this is a bit of a hack.  we only try reconnecting
+		 * when heartbeating starts until we get a connection.
+		 * if that connection then dies we don't try reconnecting.
+		 * the only way to start connecting again is to down
+		 * heartbeat and bring it back up. */
+		cancel_delayed_work(&nn->nn_connect_expired);
+		mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", 
+		     o2nm_this_node() > sc->sc_node->nd_num ?
+		     	"connected to" : "accepted connection from",
+		     SC_NODEF_ARGS(sc));
+	}
+
+	/* trigger the connecting worker func as long as we're not valid,
+	 * it will back off if it shouldn't connect.  This can be called
+	 * from node config teardown and so needs to be careful about
+	 * the work queue actually being up. */
+	if (!valid && o2net_wq) {
+		unsigned long delay;
+		/* delay if we're within a RECONNECT_DELAY of the
+		 * last attempt */
+		delay = (nn->nn_last_connect_attempt +
+			 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			- jiffies;
+		if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			delay = 0;
+		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
+		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+	}
+
+	/* keep track of the nn's sc ref for the caller */
+	if ((old_sc == NULL) && sc)
+		sc_get(sc);
+	if (old_sc && (old_sc != sc)) {
+		o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
+		sc_put(old_sc);
+	}
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	read_lock(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		struct o2net_sock_container *sc = sk->sk_user_data;
+		sclog(sc, "data_ready hit\n");
+		do_gettimeofday(&sc->sc_tv_data_ready);
+		o2net_sc_queue_work(sc, &sc->sc_rx_work);
+		ready = sc->sc_data_ready;
+	} else {
+		ready = sk->sk_data_ready;
+	}
+	read_unlock(&sk->sk_callback_lock);
+
+	ready(sk, bytes);
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_state_change(struct sock *sk)
+{
+	void (*state_change)(struct sock *sk);
+	struct o2net_sock_container *sc;
+
+	read_lock(&sk->sk_callback_lock);
+	sc = sk->sk_user_data;
+	if (sc == NULL) {
+		state_change = sk->sk_state_change;
+		goto out;
+	}
+
+	sclog(sc, "state_change to %d\n", sk->sk_state);
+
+	state_change = sc->sc_state_change;
+
+	switch(sk->sk_state) {
+		/* ignore connecting sockets as they make progress */
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			break;
+		case TCP_ESTABLISHED:
+			o2net_sc_queue_work(sc, &sc->sc_connect_work);
+			break;
+		default:
+			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+			break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+	state_change(sk);
+}
+
+/*
+ * we register callbacks so we can queue work on events before calling
+ * the original callbacks.  our callbacks are careful to test user_data
+ * to discover when they've raced with o2net_unregister_callbacks().
+ */
+static void o2net_register_callbacks(struct sock *sk,
+				     struct o2net_sock_container *sc)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+
+	/* accepted sockets inherit the old listen socket data ready */
+	if (sk->sk_data_ready == o2net_listen_data_ready) {
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+
+	BUG_ON(sk->sk_user_data != NULL);
+	sk->sk_user_data = sc;
+	sc_get(sc);
+
+	sc->sc_data_ready = sk->sk_data_ready;
+	sc->sc_state_change = sk->sk_state_change;
+	sk->sk_data_ready = o2net_data_ready;
+	sk->sk_state_change = o2net_state_change;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int o2net_unregister_callbacks(struct sock *sk,
+			           struct o2net_sock_container *sc)
+{
+	int ret = 0;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data == sc) {
+		ret = 1;
+		sk->sk_user_data = NULL;
+		sk->sk_data_ready = sc->sc_data_ready;
+		sk->sk_state_change = sc->sc_state_change;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	return ret;
+}
+
+/*
+ * this is a little helper that is called by callers who have seen a problem
+ * with an sc and want to detach it from the nn if someone hasn't already beaten
+ * them to it.  if an error is given then the shutdown will be persistent
+ * and pending transmits will be canceled.
+ */
+static void o2net_ensure_shutdown(struct o2net_node *nn,
+			           struct o2net_sock_container *sc,
+				   int err)
+{
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc == sc)
+		o2net_set_nn_state(nn, NULL, 0, err);
+	spin_unlock(&nn->nn_lock);
+}
+
+/*
+ * This work queue function performs the blocking parts of socket shutdown.  A
+ * few paths lead here.  set_nn_state will trigger this callback if it sees an
+ * sc detached from the nn.  state_change will also trigger this callback
+ * directly when it sees errors.  In that case we need to call set_nn_state
+ * ourselves as state_change couldn't get the nn_lock and call set_nn_state
+ * itself.
+ */
+static void o2net_shutdown_sc(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	sclog(sc, "shutting down\n");
+
+	/* drop the callbacks ref and call shutdown only once */
+	if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
+		/* we shouldn't flush as we're in the thread, the
+		 * races with pending sc work structs are harmless */
+		del_timer_sync(&sc->sc_idle_timeout);
+		o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+		sc_put(sc);
+		sc->sc_sock->ops->shutdown(sc->sc_sock,
+					   RCV_SHUTDOWN|SEND_SHUTDOWN);
+	}
+
+	/* not fatal so failed connects before the other guy has our
+	 * heartbeat can be retried */
+	o2net_ensure_shutdown(nn, sc, 0);
+	sc_put(sc);
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
+			     u32 key)
+{
+	int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
+
+	if (ret == 0)
+		ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
+
+	return ret;
+}
+
+static struct o2net_msg_handler *
+o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
+			  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &o2net_handler_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2net_msg_handler *nmh, *ret = NULL;
+	int cmp;
+
+	while (*p) {
+		parent = *p;
+		nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+		cmp = o2net_handler_cmp(nmh, msg_type, key);
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = nmh;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+static void o2net_handler_kref_release(struct kref *kref)
+{
+	struct o2net_msg_handler *nmh;
+	nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
+
+	kfree(nmh);
+}
+
+static void o2net_handler_put(struct o2net_msg_handler *nmh)
+{
+	kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+}
+
+/* max_len is protection for the handler func.  incoming messages won't
+ * be given to the handler if their payload is longer than the max. */
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   struct list_head *unreg_list)
+{
+	struct o2net_msg_handler *nmh = NULL;
+	struct rb_node **p, *parent;
+	int ret = 0;
+
+	if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "max_len for message handler out of range: %u\n",
+			max_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!msg_type) {
+		mlog(0, "no message type provided: %u, %p\n", msg_type, func);
+		ret = -EINVAL;
+		goto out;
+
+	}
+	if (!func) {
+		mlog(0, "no message handler provided: %u, %p\n",
+		       msg_type, func);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
+	if (nmh == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nmh->nh_func = func;
+	nmh->nh_func_data = data;
+	nmh->nh_msg_type = msg_type;
+	nmh->nh_max_len = max_len;
+	nmh->nh_key = key;
+	/* the tree and list get this ref.. they're both removed in
+	 * unregister when this ref is dropped */
+	kref_init(&nmh->nh_kref);
+	INIT_LIST_HEAD(&nmh->nh_unregister_item);
+
+	write_lock(&o2net_handler_lock);
+	if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
+		ret = -EEXIST;
+	else {
+	        rb_link_node(&nmh->nh_node, parent, p);
+		rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
+		list_add_tail(&nmh->nh_unregister_item, unreg_list);
+
+		mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
+		     func, msg_type, key);
+		/* we've had some trouble with handlers seemingly vanishing. */
+		mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
+							  &parent) == NULL,
+			        "couldn't find handler we *just* registered "
+				"for type %u key %08x\n", msg_type, key);
+	}
+	write_unlock(&o2net_handler_lock);
+
+out:
+	if (ret)
+		kfree(nmh);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_register_handler);
+
+void o2net_unregister_handler_list(struct list_head *list)
+{
+	struct list_head *pos, *n;
+	struct o2net_msg_handler *nmh;
+
+	write_lock(&o2net_handler_lock);
+	list_for_each_safe(pos, n, list) {
+		nmh = list_entry(pos, struct o2net_msg_handler,
+				 nh_unregister_item);
+		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
+		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
+		rb_erase(&nmh->nh_node, &o2net_handler_tree);
+		list_del_init(&nmh->nh_unregister_item);
+		kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+	}
+	write_unlock(&o2net_handler_lock);
+}
+EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
+
+static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
+{
+	struct o2net_msg_handler *nmh;
+
+	read_lock(&o2net_handler_lock);
+	nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
+	if (nmh)
+		kref_get(&nmh->nh_kref);
+	read_unlock(&o2net_handler_lock);
+
+	return nmh;
+}
+
+/* ------------------------------------------------------------ */
+
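+/* sock_recvmsg() and sock_sendmsg() check their iovecs against the
+ * current address limit, so these helpers widen it with
+ * set_fs(get_ds()) for the duration of the call to let kernel-space
+ * buffers through */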
+static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct kvec vec = {
+		.iov_len = len,
+		.iov_base = data,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&vec,
+		.msg_flags = MSG_DONTWAIT,
+	};
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
+			      size_t veclen, size_t total)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct msghdr msg = {
+		.msg_iov = (struct iovec *)vec,
+		.msg_iovlen = veclen,
+	};
+
+	if (sock == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ret = sock_sendmsg(sock, &msg, total);
+	set_fs(oldfs);
+	if (ret != total) {
+		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
+		     total);
+		if (ret >= 0)
+			ret = -EPIPE; /* should be smarter, I bet */
+		goto out;
+	}
+
+	ret = 0;
+out:
+	if (ret < 0)
+		mlog(0, "returning error: %d\n", ret);
+	return ret;
+}
+
+static void o2net_sendpage(struct o2net_sock_container *sc,
+			   void *kmalloced_virt,
+			   size_t size)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	ssize_t ret;
+
+
+	ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
+					 virt_to_page(kmalloced_virt),
+					 (long)kmalloced_virt & ~PAGE_MASK,
+					 size, MSG_DONTWAIT);
+	if (ret != size) {
+		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 
+		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+}
+
+static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
+{
+	memset(msg, 0, sizeof(struct o2net_msg));
+	msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
+	msg->data_len = cpu_to_be16(data_len);
+	msg->msg_type = cpu_to_be16(msg_type);
+	msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
+	msg->status = 0;
+	msg->key = cpu_to_be32(key);
+}
+
+static int o2net_tx_can_proceed(struct o2net_node *nn,
+			        struct o2net_sock_container **sc_ret,
+				int *error)
+{
+	int ret = 0;
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_persistent_error) {
+		ret = 1;
+		*sc_ret = NULL;
+		*error = nn->nn_persistent_error;
+	} else if (nn->nn_sc_valid) {
+		kref_get(&nn->nn_sc->sc_kref);
+
+		ret = 1;
+		*sc_ret = nn->nn_sc;
+		*error = 0;
+	}
+	spin_unlock(&nn->nn_lock);
+
+	return ret;
+}
+
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
+			   size_t caller_veclen, u8 target_node, int *status)
+{
+	int ret, error = 0;
+	struct o2net_msg *msg = NULL;
+	size_t veclen, caller_bytes = 0;
+	struct kvec *vec = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn = o2net_nn_from_num(target_node);
+	struct o2net_status_wait nsw = {
+		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
+	};
+
+	if (o2net_wq == NULL) {
+		mlog(0, "attempt to tx without o2netd running\n");
+		ret = -ESRCH;
+		goto out;
+	}
+
+	if (caller_veclen == 0) {
+		mlog(0, "bad kvec array length\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
+	if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "total payload len %zu too large\n", caller_bytes);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (target_node == o2nm_this_node()) {
+		ret = -ELOOP;
+		goto out;
+	}
+
+	ret = wait_event_interruptible(nn->nn_sc_wq,
+				       o2net_tx_can_proceed(nn, &sc, &error));
+	if (!ret && error)
+		ret = error;
+	if (ret)
+		goto out;
+
+	veclen = caller_veclen + 1;
+	vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
+	if (vec == NULL) {
+		mlog(0, "failed to allocate a %zu element kvec!\n", veclen);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
+	if (!msg) {
+		mlog(0, "failed to allocate an o2net_msg!\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	o2net_init_msg(msg, caller_bytes, msg_type, key);
+
+	vec[0].iov_len = sizeof(struct o2net_msg);
+	vec[0].iov_base = msg;
+	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
+
+	ret = o2net_prep_nsw(nn, &nsw);
+	if (ret)
+		goto out;
+
+	msg->msg_num = cpu_to_be32(nsw.ns_id);
+
+	/* finally, convert the message header to network byte-order
+	 * and send */
+	ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
+				 sizeof(struct o2net_msg) + caller_bytes);
+	msglog(msg, "sending returned %d\n", ret);
+	if (ret < 0) {
+		mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
+		goto out;
+	}
+
+	/* wait on other node's handler */
+	wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+
+	/* Note that we avoid overwriting the callers status return
+	 * variable if a system error was reported on the other
+	 * side. Callers beware. */
+	ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
+	if (status && !ret)
+		*status = nsw.ns_status;
+
+	mlog(0, "woken, returning system status %d, user status %d\n",
+	     ret, nsw.ns_status);
+out:
+	if (sc)
+		sc_put(sc);
+	kfree(vec);
+	kfree(msg);
+	o2net_complete_nsw(nn, &nsw, 0, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_send_message_vec);
+
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status)
+{
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+	return o2net_send_message_vec(msg_type, key, &vec, 1,
+				      target_node, status);
+}
+EXPORT_SYMBOL_GPL(o2net_send_message);
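The vectored send above relies on a classic scatter/gather trick: the header is prepended as an extra kvec slot so header and payload leave in a single transmission without copying the payload. A hedged userspace analogue using writev(), all names illustrative:

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

static ssize_t send_with_header(int fd, void *hdr, size_t hdrlen,
				const struct iovec *caller_vec,
				int caller_veclen)
{
	int veclen = caller_veclen + 1;
	struct iovec *vec = calloc(veclen, sizeof(*vec));
	ssize_t ret;

	if (vec == NULL)
		return -1;
	vec[0].iov_base = hdr;		/* header always rides in slot 0 */
	vec[0].iov_len = hdrlen;
	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(*vec));
	ret = writev(fd, vec, veclen);	/* one syscall, payload not copied */
	free(vec);
	return ret;
}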
+
+static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
+				   enum o2net_system_error syserr, int err)
+{
+	struct kvec vec = {
+		.iov_base = hdr,
+		.iov_len = sizeof(struct o2net_msg),
+	};
+
+	BUG_ON(syserr >= O2NET_ERR_MAX);
+
+	/* leave other fields intact from the incoming message, msg_num
+	 * in particular */
+	hdr->sys_status = cpu_to_be32(syserr);
+	hdr->status = cpu_to_be32(err);
+	hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); /* twiddle the magic */
+	hdr->data_len = 0;
+
+	msglog(hdr, "about to send status magic %d\n", err);
+	/* hdr has been in network byte order this whole time */
+	return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
+}
+
+/* this returns -errno if the header was unknown or too large, etc.
+ * after this is called the buffer is reused for the next message */
+static int o2net_process_message(struct o2net_sock_container *sc,
+				 struct o2net_msg *hdr)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	int ret = 0, handler_status;
+	enum  o2net_system_error syserr;
+	struct o2net_msg_handler *nmh = NULL;
+
+	msglog(hdr, "processing message\n");
+
+	o2net_sc_postpone_idle(sc);
+
+	switch (be16_to_cpu(hdr->magic)) {
+		case O2NET_MSG_STATUS_MAGIC:
+			/* special type for returning message status */
+			o2net_complete_nsw(nn, NULL,
+					   be32_to_cpu(hdr->msg_num),
+					   be32_to_cpu(hdr->sys_status),
+					   be32_to_cpu(hdr->status));
+			goto out;
+		case O2NET_MSG_KEEP_REQ_MAGIC:
+			o2net_sendpage(sc, o2net_keep_resp,
+				       sizeof(*o2net_keep_resp));
+			goto out;
+		case O2NET_MSG_KEEP_RESP_MAGIC:
+			goto out;
+		case O2NET_MSG_MAGIC:
+			break;
+		default:
+			msglog(hdr, "bad magic\n");
+			ret = -EINVAL;
+			goto out;
+	}
+
+	/* find a handler for it */
+	handler_status = 0;
+	nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
+				be32_to_cpu(hdr->key));
+	if (!nmh) {
+		mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
+		     be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
+		syserr = O2NET_ERR_NO_HNDLR;
+		goto out_respond;
+	}
+
+	syserr = O2NET_ERR_NONE;
+
+	if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
+		syserr = O2NET_ERR_OVERFLOW;
+
+	if (syserr != O2NET_ERR_NONE)
+		goto out_respond;
+
+	do_gettimeofday(&sc->sc_tv_func_start);
+	sc->sc_msg_key = be32_to_cpu(hdr->key);
+	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
+	handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
+					     be16_to_cpu(hdr->data_len),
+					nmh->nh_func_data);
+	do_gettimeofday(&sc->sc_tv_func_stop);
+
+out_respond:
+	/* this destroys the hdr, so don't use it after this */
+	ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
+				      handler_status);
+	hdr = NULL;
+	mlog(0, "sending handler status %d, syserr %d returned %d\n",
+	     handler_status, syserr, ret);
+
+out:
+	if (nmh)
+		o2net_handler_put(nmh);
+	return ret;
+}
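Dispatch hinges entirely on the 16-bit magic: status replies, keepalive traffic, and real messages are separated before any handler lookup happens. A small userspace sketch of that demux (the constants mirror the ones defined in tcp_internal.h; the helper itself is illustrative):

#include <stdint.h>
#include <arpa/inet.h>

enum {
	MSG_MAGIC	= 0xfa55,	/* a normal message */
	STATUS_MAGIC	= 0xfa56,	/* completes a waiting sender */
	KEEP_REQ_MAGIC	= 0xfa57,	/* answered with a keepalive resp */
	KEEP_RESP_MAGIC	= 0xfa58,	/* nothing to do */
};

/* 0: process as a normal message, 1: consumed internally,
 * -1: unknown magic, drop the connection */
static int demux_magic(uint16_t wire_magic)
{
	switch (ntohs(wire_magic)) {
	case STATUS_MAGIC:
	case KEEP_REQ_MAGIC:
	case KEEP_RESP_MAGIC:
		return 1;
	case MSG_MAGIC:
		return 0;
	default:
		return -1;
	}
}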
+
+static int o2net_check_handshake(struct o2net_sock_container *sc)
+{
+	struct o2net_handshake *hand = page_address(sc->sc_page);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
+		     "version %llu but %llu is required, disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     (unsigned long long)be64_to_cpu(hand->protocol_version),
+		     O2NET_PROTOCOL_VERSION);
+
+		/* don't bother reconnecting if it's the wrong version. */
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	sc->sc_handshake_ok = 1;
+
+	spin_lock(&nn->nn_lock);
+	/* set valid and queue the idle timers only if it hasn't been
+	 * shut down already */
+	if (nn->nn_sc == sc) {
+		o2net_sc_postpone_idle(sc);
+		o2net_set_nn_state(nn, sc, 1, 0);
+	}
+	spin_unlock(&nn->nn_lock);
+
+	/* shift everything up as though it wasn't there */
+	sc->sc_page_off -= sizeof(struct o2net_handshake);
+	if (sc->sc_page_off)
+		memmove(hand, hand + 1, sc->sc_page_off);
+
+	return 0;
+}
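The memmove at the end is worth noting: once the fixed-size handshake at the front of the rx buffer has been validated, the remaining bytes are shifted down so the message parser never sees it. A userspace sketch of that consume-a-prefix step (names illustrative):

#include <stddef.h>
#include <string.h>

/* caller guarantees used >= prefix; returns the new fill level */
static size_t consume_prefix(char *buf, size_t used, size_t prefix)
{
	used -= prefix;
	if (used)
		memmove(buf, buf + prefix, used);
	return used;
}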
+
+/* this demuxes the queued rx bytes into header or payload bits and calls
+ * handlers as each full message is read off the socket.  it returns -error,
+ * == 0 eof, or > 0 for progress made. */
+static int o2net_advance_rx(struct o2net_sock_container *sc)
+{
+	struct o2net_msg *hdr;
+	int ret = 0;
+	void *data;
+	size_t datalen;
+
+	sclog(sc, "receiving\n");
+	do_gettimeofday(&sc->sc_tv_advance_start);
+
+	/* do we need more header? */
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0) {
+			sc->sc_page_off += ret;
+
+			/* this working relies on the handshake being
+			 * smaller than the normal message header */
+			if (sc->sc_page_off >= sizeof(struct o2net_handshake) &&
+			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
+				ret = -EPROTO;
+				goto out;
+			}
+
+			/* the length is only validated here.. we can
+			 * only get here once as we cross from
+			 * being under to over */
+			if (sc->sc_page_off == sizeof(struct o2net_msg)) {
+				hdr = page_address(sc->sc_page);
+				if (be16_to_cpu(hdr->data_len) >
+				    O2NET_MAX_PAYLOAD_BYTES)
+					ret = -EOVERFLOW;
+			}
+		}
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		/* oof, still don't have a header */
+		goto out;
+	}
+
+	/* the header stays in network byte order; its length was
+	 * validated above when we first read it */
+	hdr = page_address(sc->sc_page);
+
+	msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
+
+	/* do we need more payload? */
+	if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
+		/* need more payload */
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
+			  sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0)
+			sc->sc_page_off += ret;
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
+		/* we can only get here once, the first time we read
+		 * the payload.. so set ret to progress if the handler
+		 * works out. after calling this the message is toast */
+		ret = o2net_process_message(sc, hdr);
+		if (ret == 0)
+			ret = 1;
+		sc->sc_page_off = 0;
+	}
+
+out:
+	sclog(sc, "ret = %d\n", ret);
+	do_gettimeofday(&sc->sc_tv_advance_stop);
+	return ret;
+}
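Because the socket can return short reads at any point, progress is tracked in sc_page_off and the function simply resumes where it left off: header first, then the payload length learned from the header. A hedged userspace sketch of the same two-phase parser, assuming the 24-byte header laid out in tcp.h (all names illustrative):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>

struct rx_state {
	char	buf[4096];
	size_t	off;		/* bytes accumulated so far */
};

#define HDR_LEN	24		/* size of the fixed wire header */

static int rx_advance(int fd, struct rx_state *rx)
{
	size_t want;
	ssize_t n;

	if (rx->off < HDR_LEN) {
		want = HDR_LEN - rx->off;	/* still collecting header */
	} else {
		uint16_t dlen_be, dlen;

		memcpy(&dlen_be, rx->buf + 2, sizeof(dlen_be));
		dlen = ntohs(dlen_be);
		if (dlen > sizeof(rx->buf) - HDR_LEN)
			return -1;		/* oversized payload */
		if (rx->off == HDR_LEN + dlen) {
			/* full message: dispatch to a handler here */
			rx->off = 0;
			return 1;
		}
		want = HDR_LEN + dlen - rx->off;
	}

	n = recv(fd, rx->buf + rx->off, want, 0);
	if (n <= 0)
		return (int)n;			/* 0 == eof, < 0 == error */
	rx->off += (size_t)n;
	return 1;				/* made progress */
}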
+
+/* this work func is triggered by data ready.  it reads until it can read no
+ * more.  it interprets 0, eof, as fatal.  if data_ready hits while we're doing
+ * our work the work struct will be marked and we'll be called again. */
+static void o2net_rx_until_empty(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+	int ret;
+
+	do {
+		ret = o2net_advance_rx(sc);
+	} while (ret > 0);
+
+	if (ret <= 0 && ret != -EAGAIN) {
+		struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+		sclog(sc, "saw error %d, closing\n", ret);
+		/* not permanent so read failed handshake can retry */
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+
+	sc_put(sc);
+}
+
+static int o2net_set_nodelay(struct socket *sock)
+{
+	int ret, val = 1;
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Dear unsuspecting programmer,
+	 *
+	 * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
+	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
+	 * silently turn into SO_DEBUG.
+	 *
+	 * Yours,
+	 * Keeper of hilariously fragile interfaces.
+	 */
+	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				    (char __user *)&val, sizeof(val));
+
+	set_fs(oldfs);
+	return ret;
+}
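For contrast, the userspace API is far less fragile: setsockopt() takes the level explicitly, so TCP_NODELAY cannot be misrouted the way the comment above warns. A minimal sketch:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int set_nodelay(int fd)
{
	int val = 1;

	/* disable Nagle so small cluster messages aren't batched */
	return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
}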
+
+/* ------------------------------------------------------------ */
+
+/* called when a connect completes and after a sock is accepted.  the
+ * rx path will see the response and mark the sc valid */
+static void o2net_sc_connect_completed(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+
+	mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
+              (unsigned long long)O2NET_PROTOCOL_VERSION,
+	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
+
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+	sc_put(sc);
+}
+
+/* this is called as a work_struct func. */
+static void o2net_sc_send_keep_req(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+
+	o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
+	sc_put(sc);
+}
+
+/* socket shutdown does a del_timer_sync against this as it tears down.
+ * we can't start this timer until we've got to the point in sc buildup
+ * where shutdown is going to be involved */
+static void o2net_idle_timer(unsigned long data)
+{
+	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct timeval now;
+
+	do_gettimeofday(&now);
+
+	mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
+	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
+	mlog(ML_NOTICE, "here are some times that might help debug the "
+	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
+	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
+	     sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
+	     now.tv_sec, now.tv_usec,
+	     sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
+	     sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
+	     sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
+	     sc->sc_msg_key, sc->sc_msg_type,
+	     sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
+	     sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
+
+	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+				    O2NET_KEEPALIVE_DELAY_SECS * HZ);
+	do_gettimeofday(&sc->sc_tv_timer);
+	mod_timer(&sc->sc_idle_timeout,
+		  jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
+}
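The pattern here is a deadline that is pushed back on every sign of life, so only a genuinely quiet connection ever fires the idle timer. A userspace sketch of the same idea using monotonic time instead of a kernel timer (names and the 10-second constant mirror the code above but are illustrative):

#include <time.h>

#define IDLE_TIMEOUT_SECS 10

static time_t idle_deadline;

/* call on every data_ready / message processed */
static void postpone_idle(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	idle_deadline = now.tv_sec + IDLE_TIMEOUT_SECS;
}

/* a periodic checker shuts the connection down once this is true */
static int connection_is_idle(void)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return now.tv_sec >= idle_deadline;
}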
+
+/* this work func is kicked whenever a path sets the nn state which doesn't
+ * have valid set.  This includes seeing hb come up, losing a connection,
+ * having a connect attempt fail, etc. This centralizes the logic which decides
+ * if a connect attempt should be made or if we should give up and all future
+ * transmit attempts should fail */
+static void o2net_start_connect(void *arg)
+{
+	struct o2net_node *nn = arg;
+	struct o2net_sock_container *sc = NULL;
+	struct o2nm_node *node = NULL;
+	struct socket *sock = NULL;
+	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
+	int ret = 0;
+
+	/* if we're greater we initiate tx, otherwise we accept */
+	if (o2nm_this_node() <= o2net_num_from_nn(nn))
+		goto out;
+
+	/* watch for racing with tearing a node down */
+	node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
+	if (node == NULL) {
+		ret = 0;
+		goto out;
+	}
+
+	spin_lock(&nn->nn_lock);
+	/* see if we already have one pending or have given up */
+	if (nn->nn_sc || nn->nn_persistent_error)
+		arg = NULL;
+	spin_unlock(&nn->nn_lock);
+	if (arg == NULL) /* *shrug*, needed some indicator */
+		goto out;
+
+	nn->nn_last_connect_attempt = jiffies;
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		mlog(0, "couldn't allocate sc\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(0, "can't create socket: %d\n", ret);
+		goto out;
+	}
+	sc->sc_sock = sock; /* freed by sc_kref_release */
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_port = (__force u16)htons(0); /* any port */
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
+			      sizeof(myaddr));
+	if (ret) {
+		mlog(0, "bind failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = o2net_set_nodelay(sc->sc_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+
+	spin_lock(&nn->nn_lock);
+	/* handshake completion will set nn->nn_sc_valid */
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	remoteaddr.sin_family = AF_INET;
+	remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
+	remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
+
+	ret = sc->sc_sock->ops->connect(sc->sc_sock,
+					(struct sockaddr *)&remoteaddr,
+					sizeof(remoteaddr),
+					O_NONBLOCK);
+	if (ret == -EINPROGRESS)
+		ret = 0;
+
+out:
+	if (ret) {
+		mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
+		     "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+		/* 0 err so that another will be queued and attempted
+		 * from set_nn_state */
+		if (sc)
+			o2net_ensure_shutdown(nn, sc, 0);
+	}
+	if (sc)
+		sc_put(sc);
+	if (node)
+		o2nm_node_put(node);
+
+	return;
+}
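The connect is deliberately nonblocking: -EINPROGRESS is treated as success, and completion is reported later through the socket callbacks registered above. A userspace sketch of the same pattern, where EINPROGRESS plays the same role (names illustrative):

#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int start_connect(const char *ip, uint16_t port)
{
	struct sockaddr_in remote;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	fcntl(fd, F_SETFL, O_NONBLOCK);

	memset(&remote, 0, sizeof(remote));
	remote.sin_family = AF_INET;
	remote.sin_port = htons(port);
	inet_pton(AF_INET, ip, &remote.sin_addr);

	if (connect(fd, (struct sockaddr *)&remote, sizeof(remote)) < 0 &&
	    errno != EINPROGRESS) {
		close(fd);		/* immediate, real failure */
		return -1;
	}
	return fd;	/* in flight: poll the fd for writability */
}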
+
+static void o2net_connect_expired(void *arg)
+{
+	struct o2net_node *nn = arg;
+
+	spin_lock(&nn->nn_lock);
+	if (!nn->nn_sc_valid) {
+		mlog(ML_ERROR, "no connection established with node %u after "
+		     "%u seconds, giving up and returning errors.\n",
+		     o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
+
+		o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	}
+	spin_unlock(&nn->nn_lock);
+}
+
+static void o2net_still_up(void *arg)
+{
+	struct o2net_node *nn = arg;
+
+	o2quo_hb_still_up(o2net_num_from_nn(nn));
+}
+
+/* ------------------------------------------------------------ */
+
+void o2net_disconnect_node(struct o2nm_node *node)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
+
+	/* don't reconnect until it's heartbeating again */
+	spin_lock(&nn->nn_lock);
+	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	spin_unlock(&nn->nn_lock);
+
+	if (o2net_wq) {
+		cancel_delayed_work(&nn->nn_connect_expired);
+		cancel_delayed_work(&nn->nn_connect_work);
+		cancel_delayed_work(&nn->nn_still_up);
+		flush_workqueue(o2net_wq);
+	}
+}
+
+static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
+				  void *data)
+{
+	o2quo_hb_down(node_num);
+
+	if (node_num != o2nm_this_node())
+		o2net_disconnect_node(node);
+}
+
+static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
+				void *data)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node_num);
+
+	o2quo_hb_up(node_num);
+
+	/* ensure an immediate connect attempt */
+	nn->nn_last_connect_attempt = jiffies -
+		(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
+
+	if (node_num != o2nm_this_node()) {
+		/* heartbeat doesn't work unless a local node number is
+		 * configured and doing so brings up the o2net_wq, so we can
+		 * use it.. */
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
+				   O2NET_IDLE_TIMEOUT_SECS * HZ);
+
+		/* believe it or not, the accept path and node heartbeat
+		 * checks can succeed for this node before we get here.. so
+		 * only use set_nn_state to clear the persistent error
+		 * if that hasn't already happened */
+		spin_lock(&nn->nn_lock);
+		if (nn->nn_persistent_error)
+			o2net_set_nn_state(nn, NULL, 0, 0);
+		spin_unlock(&nn->nn_lock);
+	}
+}
+
+void o2net_unregister_hb_callbacks(void)
+{
+	int ret;
+
+	ret = o2hb_unregister_callback(&o2net_hb_up);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+		     "callback!\n", ret);
+
+	ret = o2hb_unregister_callback(&o2net_hb_down);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+		     "callback!\n", ret);
+}
+
+int o2net_register_hb_callbacks(void)
+{
+	int ret;
+
+	o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
+			    o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
+	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
+			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
+
+	ret = o2hb_register_callback(&o2net_hb_up);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2net_hb_down);
+
+	if (ret)
+		o2net_unregister_hb_callbacks();
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_accept_one(struct socket *sock)
+{
+	int ret, slen;
+	struct sockaddr_in sin;
+	struct socket *new_sock = NULL;
+	struct o2nm_node *node = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn;
+
+	BUG_ON(sock == NULL);
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	new_sock->sk->sk_allocation = GFP_ATOMIC;
+
+	ret = o2net_set_nodelay(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	slen = sizeof(sin);
+	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
+				       &slen, 1);
+	if (ret < 0)
+		goto out;
+
+	node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+	if (node == NULL) {
+		mlog(ML_NOTICE, "attempt to connect from unknown node at "
+		     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (o2nm_this_node() > node->nd_num) {
+		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
+		     "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port), node->nd_num);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* this happens all the time when the other node sees our heartbeat
+	 * and tries to connect before we see their heartbeat */
+	if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
+		mlog(ML_CONN, "attempt to connect from node '%s' at "
+		     "%u.%u.%u.%u:%d but it isn't heartbeating\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nn = o2net_nn_from_num(node->nd_num);
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc)
+		ret = -EBUSY;
+	else
+		ret = 0;
+	spin_unlock(&nn->nn_lock);
+	if (ret) {
+		mlog(ML_NOTICE, "attempt to connect from node '%s' at "
+		     "%u.%u.%u.%u:%d but it already has an open connection\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		goto out;
+	}
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sc->sc_sock = new_sock;
+	new_sock = NULL;
+
+	spin_lock(&nn->nn_lock);
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+	o2net_sc_queue_work(sc, &sc->sc_rx_work);
+
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+
+out:
+	if (new_sock)
+		sock_release(new_sock);
+	if (node)
+		o2nm_node_put(node);
+	if (sc)
+		sc_put(sc);
+	return ret;
+}
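A userspace sketch of the accept step for comparison; the peer address it returns is what the checks above (known node, node-number ordering, heartbeat) would then validate. accept4() here is a Linux extension; everything is illustrative:

#define _GNU_SOURCE		/* accept4() */
#include <sys/socket.h>
#include <netinet/in.h>

static int accept_one(int listen_fd, struct sockaddr_in *peer)
{
	socklen_t slen = sizeof(*peer);

	/* returns the new fd, or -1 with errno == EAGAIN when no
	 * connection is pending */
	return accept4(listen_fd, (struct sockaddr *)peer, &slen,
		       SOCK_NONBLOCK);
}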
+
+static void o2net_accept_many(void *arg)
+{
+	struct socket *sock = arg;
+	while (o2net_accept_one(sock) == 0)
+		cond_resched();
+}
+
+static void o2net_listen_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	read_lock(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (ready == NULL) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	/* ->sk_data_ready is also called for a newly established child socket
+	 * before it has been accepted and the acceptor has set up their
+	 * data_ready.. we only want to queue listen work for our listening
+	 * socket */
+	if (sk->sk_state == TCP_LISTEN) {
+		mlog(ML_TCP, "bytes: %d\n", bytes);
+		queue_work(o2net_wq, &o2net_listen_work);
+	}
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+	ready(sk, bytes);
+}
+
+static int o2net_open_listening_sock(__be16 port)
+{
+	struct socket *sock = NULL;
+	int ret;
+	struct sockaddr_in sin = {
+		.sin_family = AF_INET,
+		.sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
+		.sin_port = (__force u16)port,
+	};
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+		goto out;
+	}
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = o2net_listen_data_ready;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	o2net_listen_sock = sock;
+	INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
+
+	sock->sk->sk_reuse = 1;
+	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
+		     ntohs(port), ret);
+		goto out;
+	}
+
+	ret = sock->ops->listen(sock, 64);
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
+		     ntohs(port), ret);
+	}
+
+out:
+	if (ret) {
+		o2net_listen_sock = NULL;
+		if (sock)
+			sock_release(sock);
+	}
+	return ret;
+}
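A userspace sketch of the same listener setup, mirroring the reuse-bind-listen sequence and the backlog of 64 used above (names illustrative):

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int open_listener(uint16_t port)
{
	struct sockaddr_in sin;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(port);

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
	    listen(fd, 64) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}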
+
+/*
+ * called from node manager when we should bring up our network listening
+ * socket.  node manager handles all the serialization to only call this
+ * once and to match it with o2net_stop_listening().  note,
+ * o2nm_this_node() doesn't work yet as we're being called while it
+ * is being set up.
+ */
+int o2net_start_listening(struct o2nm_node *node)
+{
+	int ret = 0;
+
+	BUG_ON(o2net_wq != NULL);
+	BUG_ON(o2net_listen_sock != NULL);
+
+	mlog(ML_KTHREAD, "starting o2net thread...\n");
+	o2net_wq = create_singlethread_workqueue("o2net");
+	if (o2net_wq == NULL) {
+		mlog(ML_ERROR, "unable to launch o2net thread\n");
+		return -ENOMEM; /* ? */
+	}
+
+	ret = o2net_open_listening_sock(node->nd_ipv4_port);
+	if (ret) {
+		destroy_workqueue(o2net_wq);
+		o2net_wq = NULL;
+	} else
+		o2quo_conn_up(node->nd_num);
+
+	return ret;
+}
+
+/* again, o2nm_this_node() doesn't work here as we're involved in
+ * tearing it down */
+void o2net_stop_listening(struct o2nm_node *node)
+{
+	struct socket *sock = o2net_listen_sock;
+	size_t i;
+
+	BUG_ON(o2net_wq == NULL);
+	BUG_ON(o2net_listen_sock == NULL);
+
+	/* stop the listening socket from generating work */
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready = sock->sk->sk_user_data;
+	sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		/* don't shadow the node argument used below */
+		struct o2nm_node *peer = o2nm_get_node_by_num(i);
+		if (peer) {
+			o2net_disconnect_node(peer);
+			o2nm_node_put(peer);
+		}
+	}
+
+	/* finish all work and tear down the work queue */
+	mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
+	destroy_workqueue(o2net_wq);
+	o2net_wq = NULL;
+
+	sock_release(o2net_listen_sock);
+	o2net_listen_sock = NULL;
+
+	o2quo_conn_err(node->nd_num);
+}
+
+/* ------------------------------------------------------------ */
+
+int o2net_init(void)
+{
+	unsigned long i;
+
+	o2quo_init();
+
+	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
+	o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
+		kfree(o2net_hand);
+		kfree(o2net_keep_req);
+		kfree(o2net_keep_resp);
+		return -ENOMEM;
+	}
+
+	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
+	o2net_hand->connector_id = cpu_to_be64(1);
+
+	o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
+	o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		struct o2net_node *nn = o2net_nn_from_num(i);
+
+		spin_lock_init(&nn->nn_lock);
+		INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
+		INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
+		INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
+		/* until we see hb from a node we'll return -ENOTCONN */
+		nn->nn_persistent_error = -ENOTCONN;
+		init_waitqueue_head(&nn->nn_sc_wq);
+		idr_init(&nn->nn_status_idr);
+		INIT_LIST_HEAD(&nn->nn_status_list);
+	}
+
+	return 0;
+}
+
+void o2net_exit(void)
+{
+	o2quo_exit();
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+}

+ 113 - 0
fs/ocfs2/cluster/tcp.h

@@ -0,0 +1,113 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_TCP_H
+#define O2CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#include <linux/tcp.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+struct o2net_msg
+{
+	__be16 magic;
+	__be16 data_len;
+	__be16 msg_type;
+	__be16 pad1;
+	__be32 sys_status;
+	__be32 status;
+	__be32 key;
+	__be32 msg_num;
+	__u8  buf[0];
+};
+
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
+
+#define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
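A hedged compile-time check of the layout this define depends on: the header packs to 24 bytes with no padding, so a 4096-byte receive page leaves 4072 payload bytes. The struct below is an illustrative mirror of o2net_msg, not part of the header file:

#include <stddef.h>

struct check_msg {		/* illustrative mirror of o2net_msg */
	unsigned short	magic, data_len, msg_type, pad1;
	unsigned int	sys_status, status, key, msg_num;
};

_Static_assert(sizeof(struct check_msg) == 24, "header is 24 bytes");
_Static_assert(offsetof(struct check_msg, data_len) == 2, "len follows magic");
_Static_assert(4096 - sizeof(struct check_msg) == 4072, "max payload bytes");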
+
+/* TODO: figure this out.... */
+static inline int o2net_link_down(int err, struct socket *sock)
+{
+	if (sock) {
+		if (sock->sk->sk_state != TCP_ESTABLISHED &&
+		    sock->sk->sk_state != TCP_CLOSE_WAIT)
+			return 1;
+	}
+
+	if (err >= 0)
+		return 0;
+	switch (err) {
+		/* it isn't clear which other errors indicate link death */
+		case -ERESTARTSYS:
+		case -EBADF:
+		/* When the server has died, an ICMP port unreachable
+		 * message prompts ECONNREFUSED. */
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+			return 1;
+	}
+	return 0;
+}
+
+enum {
+	O2NET_DRIVER_UNINITED,
+	O2NET_DRIVER_READY,
+};
+
+int o2net_init_tcp_sock(struct inode *inode);
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status);
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
+			   size_t veclen, u8 target_node, int *status);
+int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
+			    struct inode *group);
+
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   struct list_head *unreg_list);
+void o2net_unregister_handler_list(struct list_head *list);
+
+struct o2nm_node;
+int o2net_register_hb_callbacks(void);
+void o2net_unregister_hb_callbacks(void);
+int o2net_start_listening(struct o2nm_node *node);
+void o2net_stop_listening(struct o2nm_node *node);
+void o2net_disconnect_node(struct o2nm_node *node);
+
+int o2net_init(void);
+void o2net_exit(void);
+int o2net_proc_init(struct proc_dir_entry *parent);
+void o2net_proc_exit(struct proc_dir_entry *parent);
+
+#endif /* O2CLUSTER_TCP_H */

+ 174 - 0
fs/ocfs2/cluster/tcp_internal.h

@@ -0,0 +1,174 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_TCP_INTERNAL_H
+#define O2CLUSTER_TCP_INTERNAL_H
+
+#define O2NET_MSG_MAGIC           ((u16)0xfa55)
+#define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
+#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
+
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
+
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
+#define O2NET_KEEPALIVE_DELAY_SECS	5
+#define O2NET_IDLE_TIMEOUT_SECS		10
+
+/*
+ * This version number represents quite a lot, unfortunately.  It not
+ * only represents the raw network message protocol on the wire but also
+ * locking semantics of the file system using the protocol.  It should
+ * be somewhere else, I'm sure, but right now it isn't.
+ *
+ * New in version 2:
+ * 	- full 64 bit i_size in the metadata lock lvbs
+ * 	- introduction of "rw" lock and pushing meta/data locking down
+ */
+#define O2NET_PROTOCOL_VERSION 2ULL
+struct o2net_handshake {
+	__be64	protocol_version;
+	__be64	connector_id;
+};
+
+struct o2net_node {
+	/* this is never called from int/bh */
+	spinlock_t			nn_lock;
+
+	/* set the moment an sc is allocated and a connect is started */
+	struct o2net_sock_container	*nn_sc;
+	/* _valid is only set after the handshake passes and tx can happen */
+	unsigned			nn_sc_valid:1;
+	/* if this is set tx just returns it */
+	int				nn_persistent_error;
+
+	/* threads waiting for an sc to arrive wait on the wq for generation
+	 * to increase.  it is increased when a connecting socket succeeds
+	 * or fails or when an accepted socket is attached. */
+	wait_queue_head_t		nn_sc_wq;
+
+	struct idr			nn_status_idr;
+	struct list_head		nn_status_list;
+
+	/* connects are attempted from when heartbeat comes up until either hb
+	 * goes down, the node is unconfigured, no connect attempts succeed
+	 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds.  connect_work
+	 * is queued from set_nn_state both from hb up and from itself if a
+	 * connect attempt fails and so can be self-arming.  shutdown is
+	 * careful to first mark the nn such that no connects will be attempted
+	 * before canceling delayed connect work and flushing the queue. */
+	struct work_struct		nn_connect_work;
+	unsigned long			nn_last_connect_attempt;
+
+	/* this is queued as nodes come up and is canceled when a connection is
+	 * established.  this expiring gives up on the node and errors out
+	 * transmits */
+	struct work_struct		nn_connect_expired;
+
+	/* after we give up on a socket we wait a while before deciding
+	 * that it is still heartbeating and that we should do some
+	 * quorum work */
+	struct work_struct		nn_still_up;
+};
+
+struct o2net_sock_container {
+	struct kref		sc_kref;
+	/* the next two are valid for the lifetime of the sc */
+	struct socket		*sc_sock;
+	struct o2nm_node	*sc_node;
+
+	/* all of these sc work structs hold refs on the sc while they are
+	 * queued.  they should not be able to ref a freed sc.  the teardown
+	 * race is with o2net_wq destruction in o2net_stop_listening() */
+
+	/* rx and connect work are generated from socket callbacks.  sc
+	 * shutdown removes the callbacks and then flushes the work queue */
+	struct work_struct	sc_rx_work;
+	struct work_struct	sc_connect_work;
+	/* shutdown work is triggered in two ways.  the simple way is
+	 * for a code path to call ensure_shutdown, which takes a lock,
+	 * removes the sc from the nn, and queues the work.  in this case the
+	 * work is single-shot.  the work is also queued from a sock
+	 * callback, though, and in this case the work will find the sc
+	 * still on the nn and will call ensure_shutdown itself.. this
+	 * ends up triggering the shutdown work again, though nothing
+	 * will be done in that second iteration.  so work queue teardown
+	 * has to be careful to remove the sc from the nn before waiting
+	 * on the work queue so that the shutdown work doesn't remove the
+	 * sc and rearm itself.
+	 */
+	struct work_struct	sc_shutdown_work;
+
+	struct timer_list	sc_idle_timeout;
+	struct work_struct	sc_keepalive_work;
+
+	unsigned		sc_handshake_ok:1;
+
+	struct page 		*sc_page;
+	size_t			sc_page_off;
+
+	/* original handlers for the sockets */
+	void			(*sc_state_change)(struct sock *sk);
+	void			(*sc_data_ready)(struct sock *sk, int bytes);
+
+	struct timeval 		sc_tv_timer;
+	struct timeval 		sc_tv_data_ready;
+	struct timeval 		sc_tv_advance_start;
+	struct timeval 		sc_tv_advance_stop;
+	struct timeval 		sc_tv_func_start;
+	struct timeval 		sc_tv_func_stop;
+	u32			sc_msg_key;
+	u16			sc_msg_type;
+};
+
+struct o2net_msg_handler {
+	struct rb_node		nh_node;
+	u32			nh_max_len;
+	u32			nh_msg_type;
+	u32			nh_key;
+	o2net_msg_handler_func	*nh_func;
+	void			*nh_func_data;
+	struct kref		nh_kref;
+	struct list_head	nh_unregister_item;
+};
+
+enum o2net_system_error {
+	O2NET_ERR_NONE = 0,
+	O2NET_ERR_NO_HNDLR,
+	O2NET_ERR_OVERFLOW,
+	O2NET_ERR_DIED,
+	O2NET_ERR_MAX
+};
+
+struct o2net_status_wait {
+	enum o2net_system_error	ns_sys_status;
+	s32			ns_status;
+	int			ns_id;
+	wait_queue_head_t	ns_wq;
+	struct list_head	ns_node_item;
+};
+
+#endif /* O2CLUSTER_TCP_INTERNAL_H */

+ 42 - 0
fs/ocfs2/cluster/ver.c

@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "ver.h"
+
+#define CLUSTER_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
+
+void cluster_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(CLUSTER_BUILD_VERSION);

+ 31 - 0
fs/ocfs2/cluster/ver.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_VER_H
+#define O2CLUSTER_VER_H
+
+void cluster_print_version(void);
+
+#endif /* O2CLUSTER_VER_H */

+ 91 - 0
fs/ocfs2/dcache.c

@@ -0,0 +1,91 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.c
+ *
+ * dentry cache handling code
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+
+#define MLOG_MASK_PREFIX ML_DCACHE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "file.h"
+#include "inode.h"
+
+static int ocfs2_dentry_revalidate(struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	int ret = 0;    /* if all else fails, just return false */
+	struct ocfs2_super *osb;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	/* Never trust a negative dentry - force a new lookup. */
+	if (inode == NULL) {
+		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
+		     dentry->d_name.name);
+		goto bail;
+	}
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!osb);
+
+	if (inode != osb->root_inode) {
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		/* did we or someone else delete this inode? */
+		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			mlog(0, "inode (%"MLFu64") deleted, returning false\n",
+			     OCFS2_I(inode)->ip_blkno);
+			goto bail;
+		}
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		if (!inode->i_nlink) {
+			mlog(0, "Inode %"MLFu64" orphaned, returning false "
+			     "dir = %d\n", OCFS2_I(inode)->ip_blkno,
+			     S_ISDIR(inode->i_mode));
+			goto bail;
+		}
+	}
+
+	ret = 1;
+
+bail:
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct dentry_operations ocfs2_dentry_ops = {
+	.d_revalidate		= ocfs2_dentry_revalidate,
+};

+ 31 - 0
fs/ocfs2/dcache.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DCACHE_H
+#define OCFS2_DCACHE_H
+
+extern struct dentry_operations ocfs2_dentry_ops;
+
+#endif /* OCFS2_DCACHE_H */

+ 618 - 0
fs/ocfs2/dir.c

@@ -0,0 +1,618 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c
+ *
+ * Creates, reads, walks and deletes directory-nodes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static unsigned char ocfs2_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh);
+/*
+ * ocfs2_readdir() - iterate the directory for the getdents family of
+ * syscalls, calling filldir on each live entry.
+ */
+int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned long offset, blk;
+	int i, num, stored;
+	struct buffer_head * bh, * tmp;
+	struct ocfs2_dir_entry * de;
+	int err;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block * sb = inode->i_sb;
+	int have_disk_lock = 0;
+
+	mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	stored = 0;
+	bh = NULL;
+
+	error = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		stored = error;
+		goto bail;
+	}
+	have_disk_lock = 1;
+
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+
+	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
+		blk = (filp->f_pos) >> sb->s_blocksize_bits;
+		bh = ocfs2_bread(inode, blk, &err, 0);
+		if (!bh) {
+			mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
+				       "at offset %lld\n",
+			     OCFS2_I(inode)->ip_blkno,
+			     filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+		/*
+		 * Do the readahead (8k)
+		 */
+		if (!offset) {
+			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+			     i > 0; i--) {
+				tmp = ocfs2_bread(inode, ++blk, &err, 1);
+				if (tmp)
+					brelse(tmp);
+			}
+		}
+
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+
+		while (!error && filp->f_pos < i_size_read(inode)
+		       && offset < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
+			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+				/* On error, skip the f_pos to the
+				   next block. */
+				filp->f_pos = (filp->f_pos |
+					       (sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				goto bail;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				unsigned long version = filp->f_version;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (de->file_type < OCFS2_FT_MAX)
+					d_type = ocfs2_filetype_table[de->file_type];
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						ino_from_blkno(sb, le64_to_cpu(de->inode)),
+						d_type);
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored++;
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse(bh);
+	}
+
+	stored = 0;
+bail:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(inode, 0);
+
+	mlog_exit(stored);
+
+	return stored;
+}
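Directory blocks here are ext3-style chains of variable-length records: each entry's rec_len gives the offset of the next, inode 0 marks an empty slot, and an undersized rec_len is the corruption signal ocfs2_check_dir_entry() guards against. A userspace sketch of that walk, with illustrative types; alignment and on-disk byte order are ignored for brevity:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct dirent_rec {		/* illustrative; on-disk is little-endian */
	uint64_t inode;		/* 0 marks an empty slot */
	uint16_t rec_len;	/* distance to the next record */
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];	/* name_len bytes, not NUL-terminated */
};

#define MIN_REC_LEN 16		/* smallest legal record (1-char name) */

static int walk_dir_block(const char *block, size_t blocksize)
{
	size_t off = 0;

	while (off < blocksize) {
		const struct dirent_rec *de =
			(const struct dirent_rec *)(block + off);

		if (de->rec_len < MIN_REC_LEN ||
		    off + de->rec_len > blocksize)
			return -1;	/* corrupt block */
		if (de->inode)
			printf("%.*s\n", de->name_len, de->name);
		off += de->rec_len;
	}
	return 0;
}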
+
+/*
+ * NOTE: this should always be called with parent dir i_sem taken.
+ */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent)
+{
+	int status = -ENOENT;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
+		   "inode=%p)\n",
+		   osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
+
+	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
+	if (!*dirent_bh || !*dirent) {
+		status = -ENOENT;
+		goto leave;
+	}
+
+	*blkno = le64_to_cpu((*dirent)->inode);
+
+	status = 0;
+leave:
+	if (status < 0) {
+		*dirent = NULL;
+		if (*dirent_bh) {
+			brelse(*dirent_bh);
+			*dirent_bh = NULL;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Check for a name within a directory.
+ *
+ * Return 0 if the name does not exist
+ * Return -EEXIST if the directory contains the name
+ *
+ * Callers should have i_sem + a cluster lock on dir
+ */
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen)
+{
+	int ret;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+
+	mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
+		   namelen, name);
+
+	ret = -EEXIST;
+	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
+	if (dirent_bh)
+		goto bail;
+
+	ret = 0;
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ocfs2_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ocfs2_dir_entry * de, * de1;
+	struct super_block * sb;
+	int err;
+
+	sb = inode->i_sb;
+	if ((i_size_read(inode) <
+	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
+	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no data block\n",
+		     OCFS2_I(inode)->ip_blkno);
+		return 1;
+	}
+
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	de1 = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
+			!le64_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no `.' or `..'\n",
+		     OCFS2_I(inode)->ip_blkno);
+		brelse(bh);
+		return 1;
+	}
+	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
+	while (offset < i_size_read(inode)) {
+		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
+			brelse(bh);
+			bh = ocfs2_bread(inode,
+					 offset >> sb->s_blocksize_bits, &err, 0);
+			if (!bh) {
+				mlog(ML_ERROR, "directory #%"MLFu64" contains "
+					       "a hole at offset %lu\n",
+				     OCFS2_I(inode)->ip_blkno, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+			brelse(bh);
+			return 1;
+		}
+		if (le64_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	}
+	brelse(bh);
+	return 1;
+}
+
+/* returns a bh of the 1st new block in the allocation. */
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh)
+{
+	int status;
+	int extend;
+	u64 p_blkno;
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+	if (extend) {
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
+		BUG_ON(status == -EAGAIN);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
+						   (sb->s_blocksize_bits - 9)),
+					     1, &p_blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_bh = sb_getblk(sb, p_blkno);
+	if (!*new_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* assumes you already have a cluster lock on the directory. */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	int credits, num_free_extents;
+	loff_t dir_i_size;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+
+	mlog_entry_void();
+
+	dir_i_size = i_size_read(dir);
+	mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
+	     OCFS2_I(dir)->ip_blkno, dir_i_size);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* dir->i_size is always block aligned. */
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!num_free_extents) {
+			status = ocfs2_reserve_new_metadata(osb, handle,
+							    fe, &meta_ac);
+			if (status < 0) {
+				if (status != -ENOSPC)
+					mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+	} else {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				     data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
+	status = ocfs2_journal_access(handle, dir, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;
+	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	dir_i_size += dir->i_sb->s_blocksize;
+	i_size_write(dir, dir_i_size);
+	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Search the dir for a good spot, extending it if necessary. The
+ * block containing an appropriate record is returned in ret_de_bh.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head * bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb;
+	int status;
+
+	mlog_entry_void();
+
+	mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
+	     namelen, OCFS2_I(dir)->ip_blkno);
+
+	BUG_ON(!S_ISDIR(dir->i_mode));
+	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+
+	sb = dir->i_sb;
+
+	if (!namelen) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bh = ocfs2_bread(dir, 0, &status, 0);
+	if (!bh) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+			brelse(bh);
+			bh = NULL;
+
+			if (i_size_read(dir) <= offset) {
+				status = ocfs2_extend_dir(osb,
+							  dir,
+							  parent_fe_bh,
+							  &bh);
+				if (status < 0) {
+					mlog_errno(status);
+					goto bail;
+				}
+				BUG_ON(!bh);
+				*ret_de_bh = bh;
+				get_bh(*ret_de_bh);
+				goto bail;
+			}
+			bh = ocfs2_bread(dir,
+					 offset >> sb->s_blocksize_bits,
+					 &status,
+					 0);
+			if (!bh) {
+				mlog_errno(status);
+				goto bail;
+			}
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);
+			status = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}

+ 54 - 0
fs/ocfs2/dir.h

@@ -0,0 +1,54 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DIR_H
+#define OCFS2_DIR_H
+
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen);
+int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent);
+int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh);
+struct ocfs2_alloc_context;
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh);
+#endif /* OCFS2_DIR_H */

+ 8 - 0
fs/ocfs2/dlm/Makefile

@@ -0,0 +1,8 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+
+ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
+	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o

+ 214 - 0
fs/ocfs2/dlm/dlmapi.h

@@ -0,0 +1,214 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmapi.h
+ *
+ * externally exported dlm interfaces
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMAPI_H
+#define DLMAPI_H
+
+struct dlm_lock;
+struct dlm_ctxt;
+
+/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
+enum dlm_status {
+	DLM_NORMAL = 0,           /*  0: request in progress */
+	DLM_GRANTED,              /*  1: request granted */
+	DLM_DENIED,               /*  2: request denied */
+	DLM_DENIED_NOLOCKS,       /*  3: request denied, out of system resources */
+	DLM_WORKING,              /*  4: async request in progress */
+	DLM_BLOCKED,              /*  5: lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /*  6: lock request blocked by an orphan lock */
+	DLM_DENIED_GRACE_PERIOD,  /*  7: topological change in progress */
+	DLM_SYSERR,               /*  8: system error */
+	DLM_NOSUPPORT,            /*  9: unsupported */
+	DLM_CANCELGRANT,          /* 10: can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* 11: bad lockid */
+	DLM_SYNC,                 /* 12: synchronous request granted */
+	DLM_BADTYPE,              /* 13: bad resource type */
+	DLM_BADRESOURCE,          /* 14: bad resource handle */
+	DLM_MAXHANDLES,           /* 15: no more resource handles */
+	DLM_NOCLINFO,             /* 16: can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* 17: can't contact lock manager */
+	DLM_NOPURGED,             /* 18: can't contact purge daemon */
+	DLM_BADARGS,              /* 19: bad api args */
+	DLM_VOID,                 /* 20: no status */
+	DLM_NOTQUEUED,            /* 21: NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* 22: invalid resource name length */
+	DLM_CVTUNGRANT,           /* 23: attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* 24: invalid lock mode specified */
+	DLM_VALNOTVALID,          /* 25: value block has been invalidated */
+	DLM_REJECTED,             /* 26: request rejected, unrecognized client */
+	DLM_ABORT,                /* 27: blocked lock request cancelled */
+	DLM_CANCEL,               /* 28: conversion request cancelled */
+	DLM_IVRESHANDLE,          /* 29: invalid resource handle */
+	DLM_DEADLOCK,             /* 30: deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* 31: failed to allocate AST */
+	DLM_FORWARD,              /* 32: request must wait for primary's response */
+	DLM_TIMEOUT,              /* 33: timeout value for lock has expired */
+	DLM_IVGROUPID,            /* 34: invalid group specification */
+	DLM_VERS_CONFLICT,        /* 35: version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* 36: Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */
+	DLM_NO_CONTROL_DEVICE,    /* 38: Cannot set options on opened device */
+
+	DLM_RECOVERING,           /* 39: extension, allows caller to fail a lock
+				     request if it is being recovered */
+	DLM_MIGRATING,            /* 40: extension, allows caller to fail a lock
+				     request if it is being migrated */
+	DLM_MAXSTATS,             /* 41: upper limit for return code validation */
+};
+
+/* for pretty-printing dlm_status error messages */
+const char *dlm_errmsg(enum dlm_status err);
+/* for pretty-printing dlm_status error names */
+const char *dlm_errname(enum dlm_status err);
+
+/* Eventually the DLM will use standard errno values, but in the
+ * meantime this lets us track dlm errors as they bubble up. When we
+ * bring its error reporting into line with the rest of the stack,
+ * these can just be replaced with calls to mlog_errno. */
+#define dlm_error(st) do {						\
+	if ((st) != DLM_RECOVERING &&					\
+	    (st) != DLM_MIGRATING &&					\
+	    (st) != DLM_FORWARD)					\
+		mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st)));	\
+} while (0)
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+#define DLM_LVB_LEN  64
+
+/* Callers are only allowed access to the lvb and status members of
+ * this struct. */
+struct dlm_lockstatus {
+	enum dlm_status status;
+	u32 flags;
+	struct dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];
+};
+
+/* Valid lock modes. */
+#define LKM_IVMODE      (-1)            /* invalid mode */
+#define LKM_NLMODE      0               /* null lock */
+#define LKM_CRMODE      1               /* concurrent read    unsupported */
+#define LKM_CWMODE      2               /* concurrent write   unsupported */
+#define LKM_PRMODE      3               /* protected read */
+#define LKM_PWMODE      4               /* protected write    unsupported */
+#define LKM_EXMODE      5               /* exclusive */
+#define LKM_MAXMODE     5
+#define LKM_MODEMASK    0xff
+
+/* Flags passed to dlmlock and dlmunlock:
+ * reserved: flags used by the "real" dlm
+ * only a few are supported by this dlm
+ * (U) = unsupported by ocfs2 dlm */
+#define LKM_ORPHAN       0x00000010  /* this lock is orphanable (U) */
+#define LKM_PARENTABLE   0x00000020  /* this lock was orphaned (U) */
+#define LKM_BLOCK        0x00000040  /* blocking lock request (U) */
+#define LKM_LOCAL        0x00000080  /* local lock request */
+#define LKM_VALBLK       0x00000100  /* lock value block request */
+#define LKM_NOQUEUE      0x00000200  /* non blocking request */
+#define LKM_CONVERT      0x00000400  /* conversion request */
+#define LKM_NODLCKWT     0x00000800  /* this lock won't deadlock (U) */
+#define LKM_UNLOCK       0x00001000  /* deallocate this lock */
+#define LKM_CANCEL       0x00002000  /* cancel conversion request */
+#define LKM_DEQALL       0x00004000  /* remove all locks held by proc (U) */
+#define LKM_INVVALBLK    0x00008000  /* invalidate lock value block */
+#define LKM_SYNCSTS      0x00010000  /* return synchronous status if possible (U) */
+#define LKM_TIMEOUT      0x00020000  /* lock request contains timeout (U) */
+#define LKM_SNGLDLCK     0x00040000  /* request can self-deadlock (U) */
+#define LKM_FINDLOCAL    0x00080000  /* find local lock request (U) */
+#define LKM_PROC_OWNED   0x00100000  /* owned by process, not group (U) */
+#define LKM_XID          0x00200000  /* use transaction id for deadlock (U) */
+#define LKM_XID_CONFLICT 0x00400000  /* do not allow lock inheritance (U) */
+#define LKM_FORCE        0x00800000  /* force unlock flag */
+#define LKM_REVVALBLK    0x01000000  /* temporary solution: re-validate
+					lock value block (U) */
+/* unused */
+#define LKM_UNUSED1      0x00000001  /* unused */
+#define LKM_UNUSED2      0x00000002  /* unused */
+#define LKM_UNUSED3      0x00000004  /* unused */
+#define LKM_UNUSED4      0x00000008  /* unused */
+#define LKM_UNUSED5      0x02000000  /* unused */
+#define LKM_UNUSED6      0x04000000  /* unused */
+#define LKM_UNUSED7      0x08000000  /* unused */
+
+/* ocfs2 extensions: internal only
+ * should never be used by caller */
+#define LKM_MIGRATION    0x10000000  /* extension: lockres is to be migrated
+					to another node */
+#define LKM_PUT_LVB      0x20000000  /* extension: lvb is being passed
+					should be applied to lockres */
+#define LKM_GET_LVB      0x40000000  /* extension: lvb should be copied
+					from lockres when lock is granted */
+#define LKM_RECOVERY     0x80000000  /* extension: flag for recovery lock
+					used to avoid recovery rwsem */
+
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm,
+			int mode,
+			struct dlm_lockstatus *lksb,
+			int flags,
+			const char *name,
+			dlm_astlockfunc_t *ast,
+			void *data,
+			dlm_bastlockfunc_t *bast);
+
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
+			  struct dlm_lockstatus *lksb,
+			  int flags,
+			  dlm_astunlockfunc_t *unlockast,
+			  void *data);
+
+struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm);
+
+void dlm_print_one_lock(struct dlm_lock *lockid);
+
+typedef void (dlm_eviction_func)(int, void *);
+struct dlm_eviction_cb {
+	struct list_head        ec_item;
+	dlm_eviction_func       *ec_func;
+	void                    *ec_data;
+};
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data);
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb);
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
+
+#endif /* DLMAPI_H */
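
Taken together, the prototypes above describe a small asynchronous API: a caller joins a domain, issues dlmlock() with completion callbacks, and learns of the grant through the ast rather than the return value. A hedged sketch of that round trip; the names ex_lock, ex_ast, ex_bast, ex_unlock_ast, take_and_drop and "my-lock-name" are invented for illustration, and error handling is pared down:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/completion.h>

#include "dlmapi.h"

struct ex_lock {
	struct dlm_lockstatus lksb;
	struct completion granted;
	struct completion dropped;
};

static void ex_ast(void *astdata)
{
	struct ex_lock *el = astdata;
	/* el->lksb.status now holds the grant status (DLM_NORMAL on success) */
	complete(&el->granted);
}

static void ex_bast(void *astdata, int blocked_type)
{
	/* another node wants a mode incompatible with ours (blocked_type);
	 * a real caller would arrange to downconvert or unlock soon */
}

static void ex_unlock_ast(void *astdata, enum dlm_status status)
{
	complete(&((struct ex_lock *)astdata)->dropped);
}

static int take_and_drop(const char *domain, u32 key)
{
	struct dlm_ctxt *dlm;
	struct ex_lock el;
	enum dlm_status status;

	dlm = dlm_register_domain(domain, key);
	if (IS_ERR(dlm))
		return PTR_ERR(dlm);

	memset(&el, 0, sizeof(el));
	init_completion(&el.granted);
	init_completion(&el.dropped);

	status = dlmlock(dlm, LKM_EXMODE, &el.lksb, 0, "my-lock-name",
			 ex_ast, &el, ex_bast);
	if (status != DLM_NORMAL)
		goto out;
	wait_for_completion(&el.granted);

	status = dlmunlock(dlm, &el.lksb, 0, ex_unlock_ast, &el);
	if (status == DLM_NORMAL)
		wait_for_completion(&el.dropped);
out:
	dlm_unregister_domain(dlm);
	return status == DLM_NORMAL ? 0 : -EIO;
}

Note that the bast is only advisory: the dlm never revokes a lock on its own, it merely tells the holder that someone is blocked behind it.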

+ 466 - 0
fs/ocfs2/dlm/dlmast.c

@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmast.c
+ *
+ * AST and BAST functionality for local and remote nodes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+#include "cluster/endian.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock);
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+/* Should be called as an ast gets queued to see if the new
+ * lock level will obsolete a pending bast.
+ * For example, if dlm_thread queued a bast for an EX lock that
+ * was blocking another EX, but before sending the bast the
+ * lock owner downconverted to NL, the bast is now obsolete.
+ * Only the ast should be sent.
+ * This is needed because the lock and convert paths can queue
+ * asts out-of-band (not waiting for dlm_thread) in order to
+ * allow for LKM_NOQUEUE to get immediate responses. */
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	assert_spin_locked(&dlm->ast_lock);
+	assert_spin_locked(&lock->spinlock);
+
+	if (lock->ml.highest_blocked == LKM_IVMODE)
+		return 0;
+	BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
+
+	if (lock->bast_pending &&
+	    list_empty(&lock->bast_list))
+		/* old bast already sent, ok */
+		return 0;
+
+	if (lock->ml.type == LKM_EXMODE)
+		/* EX blocks anything left, any bast still valid */
+		return 0;
+	else if (lock->ml.type == LKM_NLMODE)
+		/* NL blocks nothing, no reason to send any bast, cancel it */
+		return 1;
+	else if (lock->ml.highest_blocked != LKM_EXMODE)
+		/* PR only blocks EX */
+		return 1;
+
+	return 0;
+}
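+
+/* Editor's note -- the decision above, condensed (a sketch, not part of the
+ * original source).  "type" is the lock's newly granted mode and "blocked"
+ * is the highest mode the pending bast was queued against:
+ *
+ *	type == EX                -> keep the bast (EX still blocks everything)
+ *	type == NL                -> cancel it (NL blocks nothing)
+ *	type == PR, blocked == EX -> keep it (PR still blocks EX)
+ *	type == PR, blocked <  EX -> cancel it (PR no longer blocks PR)
+ */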
+
+static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	assert_spin_locked(&dlm->ast_lock);
+	if (!list_empty(&lock->ast_list)) {
+		mlog(ML_ERROR, "ast list not empty!!  pending=%d, newlevel=%d\n",
+		     lock->ast_pending, lock->ml.type);
+		BUG();
+	}
+	BUG_ON(!list_empty(&lock->ast_list));
+	if (lock->ast_pending)
+		mlog(0, "lock has an ast getting flushed right now\n");
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+
+	/* check to see if this ast obsoletes the bast */
+	if (dlm_should_cancel_bast(dlm, lock)) {
+		struct dlm_lock_resource *res = lock->lockres;
+		mlog(0, "%s: cancelling bast for %.*s\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		lock->bast_pending = 0;
+		list_del_init(&lock->bast_list);
+		lock->ml.highest_blocked = LKM_IVMODE;
+		/* removing lock from list, remove a ref.  guaranteed
+		 * this won't be the last ref because of the get above,
+		 * so res->spinlock will not be taken here */
+		dlm_lock_put(lock);
+		/* free up the reserved bast that we are cancelling.
+		 * guaranteed that this will not be the last reserved
+		 * ast because *both* an ast and a bast were reserved 
+		 * to get to this point.  the res->spinlock will not be
+		 * taken here */
+		dlm_lockres_release_ast(dlm, res);
+	}
+	list_add_tail(&lock->ast_list, &dlm->pending_asts);
+	lock->ast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_ast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+	assert_spin_locked(&dlm->ast_lock);
+
+	BUG_ON(!list_empty(&lock->bast_list));
+	if (lock->bast_pending)
+		mlog(0, "lock has a bast getting flushed right now\n");
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+	list_add_tail(&lock->bast_list, &dlm->pending_basts);
+	lock->bast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_bast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock)
+{
+	struct dlm_lockstatus *lksb = lock->lksb;
+	BUG_ON(!lksb);
+
+	/* only updates if this node masters the lockres */
+	if (res->owner == dlm->node_num) {
+
+		spin_lock(&res->spinlock);
+		/* check the lksb flags for the direction */
+		if (lksb->flags & DLM_LKSB_GET_LVB) {
+			mlog(0, "getting lvb from lockres for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
+		} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
+			mlog(0, "setting lvb from lockres for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		}
+		spin_unlock(&res->spinlock);
+	}
+
+	/* reset any lvb flags on the lksb */
+	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+}
+
+void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	dlm_astlockfunc_t *fn;
+	struct dlm_lockstatus *lksb;
+
+	mlog_entry_void();
+
+	lksb = lock->lksb;
+	fn = lock->ast;
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	dlm_update_lvb(dlm, res, lock);
+	(*fn)(lock->astdata);
+}
+
+
+int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	int ret;
+	struct dlm_lockstatus *lksb;
+	int lksbflags;
+
+	mlog_entry_void();
+
+	lksb = lock->lksb;
+	BUG_ON(lock->ml.node == dlm->node_num);
+
+	lksbflags = lksb->flags;
+	dlm_update_lvb(dlm, res, lock);
+
+	/* lock request came from another node
+	 * go do the ast over there */
+	ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
+	return ret;
+}
+
+void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		       struct dlm_lock *lock, int blocked_type)
+{
+	dlm_bastlockfunc_t *fn = lock->bast;
+
+	mlog_entry_void();
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	(*fn)(lock->astdata, blocked_type);
+}
+
+
+
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	int ret;
+	unsigned int locklen;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
+	char *name;
+	struct list_head *iter, *head=NULL;
+	u64 cookie;
+	u32 flags;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = past->name;
+	locklen = past->namelen;
+	cookie = be64_to_cpu(past->cookie);
+	flags = be32_to_cpu(past->flags);
+
+	if (locklen > DLM_LOCKID_NAME_MAX) {
+		ret = DLM_IVBUFLEN;
+		mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+		goto leave;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		ret = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+		  (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	if (past->type != DLM_AST &&
+	    past->type != DLM_BAST) {
+		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
+		     "name=%.*s\n", past->type, cookie, locklen, name);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	res = dlm_lookup_lockres(dlm, name, locklen);
+	if (!res) {
+		mlog(ML_ERROR, "got %sast for unknown lockres! "
+			       "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
+		     past->type == DLM_AST ? "" : "b",
+		     cookie, locklen, name, locklen);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	/* cannot get a proxy ast message if this node owns it */
+	BUG_ON(res->owner == dlm->node_num);
+
+	mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "responding with DLM_RECOVERING!\n");
+		ret = DLM_RECOVERING;
+		goto unlock_out;
+	}
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "responding with DLM_MIGRATING!\n");
+		ret = DLM_MIGRATING;
+		goto unlock_out;
+	}
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each(iter, head) {
+		lock = list_entry (iter, struct dlm_lock, list);
+		if (be64_to_cpu(lock->ml.cookie) == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else
+		head = &res->granted;
+
+	list_for_each(iter, head) {
+		lock = list_entry (iter, struct dlm_lock, list);
+		if (be64_to_cpu(lock->ml.cookie) == cookie)
+			goto do_ast;
+	}
+
+	mlog(ML_ERROR, "got %sast for unknown lock!  cookie=%"MLFu64", "
+		       "name=%.*s, namelen=%u\n",
+	     past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
+
+	ret = DLM_NORMAL;
+unlock_out:
+	spin_unlock(&res->spinlock);
+	goto leave;
+
+do_ast:
+	ret = DLM_NORMAL;
+	if (past->type == DLM_AST) {
+		/* do not alter lock refcount.  switching lists. */
+		list_del_init(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		mlog(0, "ast: adding to granted list... type=%d, "
+			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+		if (lock->ml.convert_type != LKM_IVMODE) {
+			lock->ml.type = lock->ml.convert_type;
+			lock->ml.convert_type = LKM_IVMODE;
+		} else {
+			// should already be there....
+		}
+
+		lock->lksb->status = DLM_NORMAL;
+
+		/* if we requested the lvb, fetch it into our lksb now */
+		if (flags & LKM_GET_LVB) {
+			BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
+			memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	if (past->type == DLM_AST)
+		dlm_do_local_ast(dlm, res, lock);
+	else
+		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
+
+leave:
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+	return ret;
+}
+
+
+
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock, int msg_type,
+			   int blocked_type, int flags)
+{
+	int ret = 0;
+	struct dlm_proxy_ast past;
+	struct kvec vec[2];
+	size_t veclen = 1;
+	int status;
+
+	mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
+		   res->lockname.len, res->lockname.name, lock->ml.node,
+		   msg_type, blocked_type);
+
+	memset(&past, 0, sizeof(struct dlm_proxy_ast));
+	past.node_idx = dlm->node_num;
+	past.type = msg_type;
+	past.blocked_type = blocked_type;
+	past.namelen = res->lockname.len;
+	memcpy(past.name, res->lockname.name, past.namelen);
+	past.cookie = lock->ml.cookie;
+
+	vec[0].iov_len = sizeof(struct dlm_proxy_ast);
+	vec[0].iov_base = &past;
+	if (flags & DLM_LKSB_GET_LVB) {
+		mlog(0, "returning requested LVB data\n");
+		be32_add_cpu(&past.flags, LKM_GET_LVB);
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
+				     lock->ml.node, &status);
+	if (ret < 0)
+		mlog_errno(ret);
+	else {
+		if (status == DLM_RECOVERING) {
+			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
+			     "node is dead!\n", lock->ml.node);
+			BUG();
+		} else if (status == DLM_MIGRATING) {
+			mlog(ML_ERROR, "sent AST to node %u, it returned "
+			     "DLM_MIGRATING!\n", lock->ml.node);
+			BUG();
+		} else if (status != DLM_NORMAL) {
+			mlog(ML_ERROR, "AST to node %u returned %d!\n",
+			     lock->ml.node, status);
+			/* ignore it */
+		}
+		ret = 0;
+	}
+	return ret;
+}
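
The sender above packs the fixed dlm_proxy_ast header and the optional 64-byte LVB as two kvec segments, so the wire length alone implies whether an LVB rode along. The receiving handler trusts the flags field rather than checking len; a minimal sketch of the length validation one could layer on (the helper name is invented for this sketch):

#include <linux/types.h>

#include "dlmapi.h"	/* LKM_PUT_LVB, LKM_GET_LVB, DLM_LVB_LEN */
#include "dlmcommon.h"	/* struct dlm_proxy_ast */

static inline int dlm_proxy_ast_len_ok(u32 len, u32 flags)
{
	u32 expect = sizeof(struct dlm_proxy_ast);

	if (flags & (LKM_PUT_LVB | LKM_GET_LVB))
		expect += DLM_LVB_LEN;	/* the second kvec segment */
	return len >= expect;
}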

+ 884 - 0
fs/ocfs2/dlm/dlmcommon.h

@@ -0,0 +1,884 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMCOMMON_H
+#define DLMCOMMON_H
+
+#include <linux/kref.h>
+
+#define DLM_HB_NODE_DOWN_PRI     (0xf000000)
+#define DLM_HB_NODE_UP_PRI       (0x8000000)
+
+#define DLM_LOCKID_NAME_MAX    32
+
+#define DLM_DOMAIN_NAME_MAX_LEN    255
+#define DLM_LOCK_RES_OWNER_UNKNOWN     O2NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
+#define DLM_THREAD_MS                  200   // flush at least every 200 ms
+
+#define DLM_HASH_BITS     7
+#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
+
+enum dlm_ast_type {
+	DLM_AST = 0,
+	DLM_BAST,
+	DLM_ASTUNLOCK
+};
+
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+			 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+			 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN   9
+
+static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
+{
+	if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
+	    memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0)
+		return 1;
+	return 0;
+}
+
+#define DLM_RECO_STATE_ACTIVE  0x0001
+
+struct dlm_recovery_ctxt
+{
+	struct list_head resources;
+	struct list_head received;
+	struct list_head node_data;
+	u8  new_master;
+	u8  dead_node;
+	u16 state;
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	wait_queue_head_t event;
+};
+
+enum dlm_ctxt_state {
+	DLM_CTXT_NEW = 0,
+	DLM_CTXT_JOINED,
+	DLM_CTXT_IN_SHUTDOWN,
+	DLM_CTXT_LEAVING,
+};
+
+struct dlm_ctxt
+{
+	struct list_head list;
+	struct list_head *resources;
+	struct list_head dirty_list;
+	struct list_head purge_list;
+	struct list_head pending_asts;
+	struct list_head pending_basts;
+	unsigned int purge_count;
+	spinlock_t spinlock;
+	spinlock_t ast_lock;
+	char *name;
+	u8 node_num;
+	u32 key;
+	u8  joining_node;
+	wait_queue_head_t dlm_join_events;
+	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct dlm_recovery_ctxt reco;
+	spinlock_t master_lock;
+	struct list_head master_list;
+	struct list_head mle_hb_events;
+
+	/* these give a really vague idea of the system load */
+	atomic_t local_resources;
+	atomic_t remote_resources;
+	atomic_t unknown_resources;
+
+	/* NOTE: Next three are protected by dlm_domain_lock */
+	struct kref dlm_refs;
+	enum dlm_ctxt_state dlm_state;
+	unsigned int num_joins;
+
+	struct o2hb_callback_func dlm_hb_up;
+	struct o2hb_callback_func dlm_hb_down;
+	struct task_struct *dlm_thread_task;
+	struct task_struct *dlm_reco_thread_task;
+	wait_queue_head_t dlm_thread_wq;
+	wait_queue_head_t dlm_reco_thread_wq;
+	wait_queue_head_t ast_wq;
+	wait_queue_head_t migration_wq;
+
+	struct work_struct dispatched_work;
+	struct list_head work_list;
+	spinlock_t work_lock;
+	struct list_head dlm_domain_handlers;
+	struct list_head	dlm_eviction_callbacks;
+};
+
+/* these keventd work queue items are for less-frequently
+ * called functions that cannot be directly called from the
+ * net message handlers for some reason, usually because
+ * they need to send net messages of their own. */
+void dlm_dispatch_work(void *data);
+
+struct dlm_lock_resource;
+struct dlm_work_item;
+
+typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
+
+struct dlm_request_all_locks_priv
+{
+	u8 reco_master;
+	u8 dead_node;
+};
+
+struct dlm_mig_lockres_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 real_master;
+};
+
+struct dlm_assert_master_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 request_from;
+	u32 flags;
+	unsigned ignore_higher:1;
+};
+
+
+struct dlm_work_item
+{
+	struct list_head list;
+	dlm_workfunc_t *func;
+	struct dlm_ctxt *dlm;
+	void *data;
+	union {
+		struct dlm_request_all_locks_priv ral;
+		struct dlm_mig_lockres_priv ml;
+		struct dlm_assert_master_priv am;
+	} u;
+};
+
+static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
+				      struct dlm_work_item *i,
+				      dlm_workfunc_t *f, void *data)
+{
+	memset(i, 0, sizeof(*i));
+	i->func = f;
+	INIT_LIST_HEAD(&i->list);
+	i->data = data;
+	i->dlm = dlm;  /* must have already done a dlm_grab on this! */
+}
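+
+/* Editor's note -- a sketch (not part of the original source) of how a
+ * message handler queues such an item; my_workfunc is hypothetical, the
+ * rest are the fields above.  keventd later runs dispatched_work, which
+ * calls dlm_dispatch_work to drain work_list and invoke item->func:
+ *
+ *	dlm_grab(dlm);			// hold a ref until the work runs
+ *	dlm_init_work_item(dlm, item, my_workfunc, NULL);
+ *	spin_lock(&dlm->work_lock);
+ *	list_add_tail(&item->list, &dlm->work_list);
+ *	spin_unlock(&dlm->work_lock);
+ *	schedule_work(&dlm->dispatched_work);
+ */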
+
+
+
+static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
+					  u8 node)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	dlm->joining_node = node;
+	wake_up(&dlm->dlm_join_events);
+}
+
+#define DLM_LOCK_RES_UNINITED             0x00000001
+#define DLM_LOCK_RES_RECOVERING           0x00000002
+#define DLM_LOCK_RES_READY                0x00000004
+#define DLM_LOCK_RES_DIRTY                0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS          0x00000010
+#define DLM_LOCK_RES_MIGRATING            0x00000020
+
+#define DLM_PURGE_INTERVAL_MS   (8 * 1000)
+
+struct dlm_lock_resource
+{
+	/* WARNING: Please see the comment in dlm_init_lockres before
+	 * adding fields here. */
+	struct list_head list;
+	struct kref      refs;
+
+	/* please keep these next 3 in this order
+	 * some funcs want to iterate over all lists */
+	struct list_head granted;
+	struct list_head converting;
+	struct list_head blocked;
+
+	struct list_head dirty;
+	struct list_head recovering; // dlm_recovery_ctxt.resources list
+
+	/* unused lock resources have their last_used stamped and are
+	 * put on a list for the dlm thread to run. */
+	struct list_head purge;
+	unsigned long    last_used;
+
+	unsigned migration_pending:1;
+	atomic_t asts_reserved;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	u8  owner;              //node which owns the lock resource, or unknown
+	u16 state;
+	struct qstr lockname;
+	char lvb[DLM_LVB_LEN];
+};
+
+struct dlm_migratable_lock
+{
+	__be64 cookie;
+
+	/* these 3 are just padding for the in-memory structure, but
+	 * list and flags are actually used when sent over the wire */
+	__be16 pad1;
+	u8 list;  // 0=granted, 1=converting, 2=blocked
+	u8 flags;
+
+	s8 type;
+	s8 convert_type;
+	s8 highest_blocked;
+	u8 node;
+};  // 16 bytes
+
+struct dlm_lock
+{
+	struct dlm_migratable_lock ml;
+
+	struct list_head list;
+	struct list_head ast_list;
+	struct list_head bast_list;
+	struct dlm_lock_resource *lockres;
+	spinlock_t spinlock;
+	struct kref lock_refs;
+
+	// ast and bast must be callable while holding a spinlock!
+	dlm_astlockfunc_t *ast;
+	dlm_bastlockfunc_t *bast;
+	void *astdata;
+	struct dlm_lockstatus *lksb;
+	unsigned ast_pending:1,
+		 bast_pending:1,
+		 convert_pending:1,
+		 lock_pending:1,
+		 cancel_pending:1,
+		 unlock_pending:1,
+		 lksb_kernel_allocated:1;
+};
+
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+
+enum dlm_lockres_list {
+	DLM_GRANTED_LIST = 0,
+	DLM_CONVERTING_LIST,
+	DLM_BLOCKED_LIST
+};
+
+static inline struct list_head *
+dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
+{
+	struct list_head *ret = NULL;
+	if (idx == DLM_GRANTED_LIST)
+		ret = &res->granted;
+	else if (idx == DLM_CONVERTING_LIST)
+		ret = &res->converting;
+	else if (idx == DLM_BLOCKED_LIST)
+		ret = &res->blocked;
+	else
+		BUG();
+	return ret;
+}
+
+
+
+
+struct dlm_node_iter
+{
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int curnode;
+};
+
+
+enum {
+	DLM_MASTER_REQUEST_MSG    = 500,
+	DLM_UNUSED_MSG1,         /* 501 */
+	DLM_ASSERT_MASTER_MSG,	 /* 502 */
+	DLM_CREATE_LOCK_MSG,	 /* 503 */
+	DLM_CONVERT_LOCK_MSG,	 /* 504 */
+	DLM_PROXY_AST_MSG,	 /* 505 */
+	DLM_UNLOCK_LOCK_MSG,	 /* 506 */
+	DLM_UNUSED_MSG2,	 /* 507 */
+	DLM_MIGRATE_REQUEST_MSG, /* 508 */
+	DLM_MIG_LOCKRES_MSG, 	 /* 509 */
+	DLM_QUERY_JOIN_MSG,	 /* 510 */
+	DLM_ASSERT_JOINED_MSG,	 /* 511 */
+	DLM_CANCEL_JOIN_MSG,	 /* 512 */
+	DLM_EXIT_DOMAIN_MSG,	 /* 513 */
+	DLM_MASTER_REQUERY_MSG,	 /* 514 */
+	DLM_LOCK_REQUEST_MSG,	 /* 515 */
+	DLM_RECO_DATA_DONE_MSG,	 /* 516 */
+	DLM_BEGIN_RECO_MSG,	 /* 517 */
+	DLM_FINALIZE_RECO_MSG	 /* 518 */
+};
+
+struct dlm_reco_node_data
+{
+	int state;
+	u8 node_num;
+	struct list_head list;
+};
+
+enum {
+	DLM_RECO_NODE_DATA_DEAD = -1,
+	DLM_RECO_NODE_DATA_INIT = 0,
+	DLM_RECO_NODE_DATA_REQUESTING,
+	DLM_RECO_NODE_DATA_REQUESTED,
+	DLM_RECO_NODE_DATA_RECEIVING,
+	DLM_RECO_NODE_DATA_DONE,
+	DLM_RECO_NODE_DATA_FINALIZE_SENT,
+};
+
+
+enum {
+	DLM_MASTER_RESP_NO = 0,
+	DLM_MASTER_RESP_YES,
+	DLM_MASTER_RESP_MAYBE,
+	DLM_MASTER_RESP_ERROR
+};
+
+
+struct dlm_master_request
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_ASSERT_MASTER_MLE_CLEANUP      0x00000001
+#define DLM_ASSERT_MASTER_REQUERY          0x00000002
+#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
+struct dlm_assert_master
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_migrate_request
+{
+	u8 master;
+	u8 new_master;
+	u8 namelen;
+	u8 pad1;
+	__be32 pad2;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_master_requery
+{
+	u8 pad1;
+	u8 pad2;
+	u8 node_idx;
+	u8 namelen;
+	__be32 pad3;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MRES_RECOVERY   0x01
+#define DLM_MRES_MIGRATION  0x02
+#define DLM_MRES_ALL_DONE   0x04
+
+/*
+ * We would like to get one whole lockres into a single network
+ * message whenever possible.  Generally speaking, there will be
+ * at most one dlm_lock on a lockres for each node in the cluster,
+ * plus (infrequently) any additional locks coming in from userdlm.
+ *
+ * struct _dlm_lockres_page
+ * {
+ * 	dlm_migratable_lockres mres;
+ * 	dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
+ * 	u8 pad[DLM_MIG_LOCKRES_RESERVED];
+ * };
+ *
+ * from ../cluster/tcp.h
+ *    NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(net_msg))
+ *    (roughly 4080 bytes)
+ * and sizeof(dlm_migratable_lockres) = 112 bytes
+ * and sizeof(dlm_migratable_lock) = 16 bytes
+ *
+ * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
+ * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
+ *
+ *  (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
+ *     sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
+ *        NET_MAX_PAYLOAD_BYTES
+ *  (240 * 16) + 112 + 128 = 4080
+ *
+ * So a lockres would need more than 240 locks before it would
+ * use more than one network packet to recover.  Not too bad.
+ */
+#define DLM_MAX_MIGRATABLE_LOCKS   240
+
+struct dlm_migratable_lockres
+{
+	u8 master;
+	u8 lockname_len;
+	u8 num_locks;    // locks sent in this structure
+	u8 flags;
+	__be32 total_locks; // locks to be sent for this migration cookie
+	__be64 mig_cookie;  // cookie for this lockres migration
+			 // or zero if not needed
+	// 16 bytes
+	u8 lockname[DLM_LOCKID_NAME_MAX];
+	// 48 bytes
+	u8 lvb[DLM_LVB_LEN];
+	// 112 bytes
+	struct dlm_migratable_lock ml[0];  // 16 bytes each, begins at byte 112
+};
+#define DLM_MIG_LOCKRES_MAX_LEN  \
+	(sizeof(struct dlm_migratable_lockres) + \
+	 (sizeof(struct dlm_migratable_lock) * \
+	  DLM_MAX_MIGRATABLE_LOCKS) )
+
+/* from above, 128 bytes
+ * for some undetermined future use */
+#define DLM_MIG_LOCKRES_RESERVED   (NET_MAX_PAYLOAD_BYTES - \
+				    DLM_MIG_LOCKRES_MAX_LEN)
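+
+/* Editor's note -- the sizing argument above can be pinned down at compile
+ * time; a sketch using the stock BUILD_BUG_ON macro from <linux/kernel.h>
+ * (not part of the original source):
+ *
+ *	BUILD_BUG_ON(sizeof(struct dlm_migratable_lock) != 16);
+ *	BUILD_BUG_ON(sizeof(struct dlm_migratable_lockres) != 112);
+ *	BUILD_BUG_ON(DLM_MIG_LOCKRES_MAX_LEN > NET_MAX_PAYLOAD_BYTES);
+ */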
+
+struct dlm_create_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_convert_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_CONVERT_LOCK_MAX_LEN  (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
+
+struct dlm_unlock_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	__be16 pad1;
+	u8 node_idx;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_UNLOCK_LOCK_MAX_LEN  (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
+
+struct dlm_proxy_ast
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_PROXY_AST_MAX_LEN  (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
+
+#define DLM_MOD_KEY (0x666c6172)
+enum dlm_query_join_response {
+	JOIN_DISALLOW = 0,
+	JOIN_OK,
+	JOIN_OK_NO_MAP,
+};
+
+struct dlm_lock_request
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+struct dlm_reco_data_done
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+
+	/* unused for now */
+	/* eventually we can use this to attempt
+	 * lvb recovery based on each node's info */
+	u8 reco_lvb[DLM_LVB_LEN];
+};
+
+struct dlm_begin_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+
+struct dlm_query_join_request
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_assert_joined
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_cancel_join
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_exit_domain
+{
+	u8 node_idx;
+	u8 pad1[3];
+};
+
+struct dlm_finalize_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+static inline enum dlm_status
+__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
+{
+	enum dlm_status status = DLM_NORMAL;
+
+	assert_spin_locked(&res->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		status = DLM_RECOVERING;
+	else if (res->state & DLM_LOCK_RES_MIGRATING)
+		status = DLM_MIGRATING;
+	else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
+		status = DLM_FORWARD;
+
+	return status;
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb);
+void dlm_lock_get(struct dlm_lock *lock);
+void dlm_lock_put(struct dlm_lock *lock);
+
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res);
+
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock);
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock);
+
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+
+int dlm_launch_thread(struct dlm_ctxt *dlm);
+void dlm_complete_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+
+void dlm_put(struct dlm_ctxt *dlm);
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
+
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res);
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res);
+void dlm_purge_lockres(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *lockres);
+void dlm_lockres_get(struct dlm_lock_resource *res);
+void dlm_lockres_put(struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm,
+			  struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+						const char *name,
+						unsigned int len);
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					      const char *name,
+					      unsigned int len);
+
+int dlm_is_host_down(int errno);
+void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res,
+			      u8 owner);
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+						 const char *lockid,
+						 int flags);
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+					  const char *name,
+					  unsigned int namelen);
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_do_local_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+int dlm_do_remote_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+void dlm_do_local_bast(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res,
+		       struct dlm_lock *lock,
+		       int blocked_type);
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
+			   struct dlm_lock_resource *res,
+			   struct dlm_lock *lock,
+			   int msg_type,
+			   int blocked_type, int flags);
+static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock,
+				      int blocked_type)
+{
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
+				      blocked_type, 0);
+}
+
+static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_lock *lock,
+				     int flags)
+{
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
+				      0, flags);
+}
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+
+u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+
+
+int dlm_nm_init(struct dlm_ctxt *dlm);
+int dlm_heartbeat_init(struct dlm_ctxt *dlm);
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+int dlm_migrate_lockres(struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			u8 target);
+int dlm_finish_migration(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 u8 old_master);
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res);
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
+
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher,
+			       u8 request_from,
+			       u32 flags);
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to,
+			 u8 flags);
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res);
+
+/* will exit holding res->spinlock, but may drop in function */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
+void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
+
+/* will exit holding res->spinlock, but may drop in function */
+static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
+{
+	__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
+				    	  DLM_LOCK_RES_RECOVERING|
+					  DLM_LOCK_RES_MIGRATING));
+}
+
+
+int dlm_init_mle_cache(void);
+void dlm_destroy_mle_cache(void);
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+void dlm_clean_master_list(struct dlm_ctxt *dlm,
+			   u8 dead_node);
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+
+static inline const char * dlm_lock_mode_name(int mode)
+{
+	switch (mode) {
+		case LKM_EXMODE:
+			return "EX";
+		case LKM_PRMODE:
+			return "PR";
+		case LKM_NLMODE:
+			return "NL";
+	}
+	return "UNKNOWN";
+}
+
+
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	/* NO_LOCK compatible with all */
+	if (request == LKM_NLMODE ||
+	    existing == LKM_NLMODE)
+		return 1;
+
+	/* EX incompatible with all non-NO_LOCK */
+	if (request == LKM_EXMODE)
+		return 0;
+
+	/* request must be PR, which is compatible with PR */
+	if (existing == LKM_PRMODE)
+		return 1;
+
+	return 0;
+}
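+
+/* Editor's note -- the function above encodes this compatibility matrix
+ * for the three supported modes (1 = compatible):
+ *
+ *	              existing
+ *	request      NL  PR  EX
+ *	   NL         1   1   1
+ *	   PR         1   1   0
+ *	   EX         1   0   0
+ */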
+
+static inline int dlm_lock_on_list(struct list_head *head,
+				   struct dlm_lock *lock)
+{
+	struct list_head *iter;
+	struct dlm_lock *tmplock;
+
+	list_for_each(iter, head) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (tmplock == lock)
+			return 1;
+	}
+	return 0;
+}
+
+
+static inline enum dlm_status dlm_err_to_dlm_status(int err)
+{
+	enum dlm_status ret;
+	if (err == -ENOMEM)
+		ret = DLM_SYSERR;
+	else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
+		ret = DLM_NOLOCKMGR;
+	else if (err == -EINVAL)
+		ret = DLM_BADPARAM;
+	else if (err == -ENAMETOOLONG)
+		ret = DLM_IVBUFLEN;
+	else
+		ret = DLM_BADARGS;
+	return ret;
+}
+
+
+static inline void dlm_node_iter_init(unsigned long *map,
+				      struct dlm_node_iter *iter)
+{
+	memcpy(iter->node_map, map, sizeof(iter->node_map));
+	iter->curnode = -1;
+}
+
+static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
+{
+	int bit;
+	bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
+	if (bit >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+	iter->curnode = bit;
+	return bit;
+}
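+
+/* Editor's note -- typical use of the iterator (a sketch mirroring how the
+ * domain code walks node maps): the bitmap is copied at init time, so the
+ * snapshot stays stable while messages are sent without dlm->spinlock held:
+ *
+ *	struct dlm_node_iter iter;
+ *	int node;
+ *
+ *	spin_lock(&dlm->spinlock);
+ *	dlm_node_iter_init(dlm->domain_map, &iter);
+ *	spin_unlock(&dlm->spinlock);
+ *	while ((node = dlm_node_iter_next(&iter)) >= 0) {
+ *		... send a message to node ...
+ *	}
+ */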
+
+
+
+#endif /* DLMCOMMON_H */

+ 530 - 0
fs/ocfs2/dlm/dlmconvert.c

@@ -0,0 +1,530 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.c
+ *
+ * underlying calls for lock conversion
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+/* NOTE: __dlmconvert_master is the only function in here that
+ * needs a spinlock held on entry (res->spinlock) and it is the
+ * only one that holds a lock on exit (res->spinlock).
+ * All other functions in here need no locks and drop all of
+ * the locks that they acquire. */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread);
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type);
+
+/*
+ * this is only called directly by dlmlock(), and only when the
+ * local node is the owner of the lockres
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: see __dlmconvert_master
+ */
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status;
+
+	spin_lock(&res->spinlock);
+	/* we are not in a network handler, this is fine */
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	status = __dlmconvert_master(dlm, res, lock, flags, type,
+				     &call_ast, &kick_thread);
+
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
+		dlm_error(status);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
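+
+/* Editor's note -- the reserve/release pairing above, condensed (a sketch):
+ * any path that might queue an ast first takes a reservation with
+ * __dlm_lockres_reserve_ast(), and every exit then runs exactly one of
+ * dlm_queue_ast() (dlm_thread releases the reservation after flushing the
+ * ast) or dlm_lockres_release_ast() (returns it unused), so asts_reserved
+ * always drains back to zero and migration can wait on it safely. */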
+
+/* performs lock conversion at the lockres master site
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         takes and drops lock->spinlock
+ *   held on exit:  res->spinlock
+ * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
+ *   call_ast: whether ast should be called for this lock
+ *   kick_thread: whether dlm_kick_thread should be called
+ */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread)
+{
+	enum dlm_status status = DLM_NORMAL;
+	struct list_head *iter;
+	struct dlm_lock *tmplock=NULL;
+
+	assert_spin_locked(&res->spinlock);
+
+	mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
+		   lock->ml.type, lock->ml.convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		mlog(ML_ERROR, "attempted to convert a lock with a lock "
+		     "conversion pending\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		mlog(ML_ERROR, "attempted to convert a lock not on grant "
+		     "queue\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	if (flags & LKM_VALBLK) {
+		switch (lock->ml.type) {
+			case LKM_EXMODE:
+				/* EX + LKM_VALBLK + convert == set lvb */
+				mlog(0, "will set lvb: converting %s->%s\n",
+				     dlm_lock_mode_name(lock->ml.type),
+				     dlm_lock_mode_name(type));
+				lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+				break;
+			case LKM_PRMODE:
+			case LKM_NLMODE:
+				/* refetch if new level is not NL */
+				if (type > LKM_NLMODE) {
+					mlog(0, "will fetch new value into "
+					     "lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					lock->lksb->flags |= DLM_LKSB_GET_LVB;
+				} else {
+					mlog(0, "will NOT fetch new value "
+					     "into lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					flags &= ~(LKM_VALBLK);
+				}
+				break;
+		}
+	}
+
+
+	/* in-place downconvert? */
+	if (type <= lock->ml.type)
+		goto grant;
+
+	/* upconvert from here on */
+	status = DLM_NORMAL;
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
+	     res->lockname.name, dlm_lock_mode_name(type));
+	/* immediately grant the new lock type */
+	lock->lksb->status = DLM_NORMAL;
+	if (lock->ml.node == dlm->node_num)
+		mlog(0, "doing in-place convert for nonlocal lock\n");
+	lock->ml.type = type;
+	status = DLM_NORMAL;
+	*call_ast = 1;
+	goto unlock_exit;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		mlog(0, "failed to convert NOQUEUE lock %.*s from "
+		     "%d to %d...\n", res->lockname.len, res->lockname.name,
+		     lock->ml.type, type);
+		status = DLM_NOTQUEUED;
+		goto unlock_exit;
+	}
+	mlog(0, "res %.*s, queueing...\n", res->lockname.len,
+	     res->lockname.name);
+
+	lock->ml.convert_type = type;
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+
+unlock_exit:
+	spin_unlock(&lock->spinlock);
+	if (status == DLM_DENIED) {
+		__dlm_print_one_lock_resource(res);
+	}
+	if (status == DLM_NORMAL)
+		*kick_thread = 1;
+	return status;
+}
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock)
+{
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+	lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+}
+
+/* messages the master site to do lock conversion
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
+ */
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	enum dlm_status status;
+
+	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
+	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "bailing out early since res is RECOVERING "
+		     "on secondary queue\n");
+		/* __dlm_print_one_lock_resource(res); */
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		__dlm_print_one_lock_resource(res);
+		mlog(ML_ERROR, "converting a remote lock that is already "
+		     "converting! (cookie=%"MLFu64", conv=%d)\n",
+		     lock->ml.cookie, lock->ml.convert_type);
+		status = DLM_DENIED;
+		goto bail;
+	}
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* move lock to local convert queue */
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+	lock->convert_pending = 1;
+	lock->ml.convert_type = type;
+
+	if (flags & LKM_VALBLK) {
+		if (lock->ml.type == LKM_EXMODE) {
+			flags |= LKM_PUT_LVB;
+			lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+		} else {
+			if (lock->ml.convert_type == LKM_NLMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	/* no locks held here.
+	 * need to wait for a reply as to whether it got queued or not. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->convert_pending = 0;
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_convert(res, lock);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+
+	/* TODO: should this be a wake_one? */
+	/* wake up any IN_PROGRESS waiters */
+	wake_up(&res->wq);
+
+	return status;
+}
+
+/* sends DLM_CONVERT_LOCK_MSG to master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, status from remote node
+ */
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type)
+{
+	struct dlm_convert_lock convert;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	memset(&convert, 0, sizeof(struct dlm_convert_lock));
+	convert.node_idx = dlm->node_num;
+	convert.requested_type = type;
+	convert.cookie = lock->ml.cookie;
+	convert.namelen = res->lockname.len;
+	convert.flags = cpu_to_be32(flags);
+	memcpy(convert.name, res->lockname.name, convert.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_convert_lock);
+	vec[0].iov_base = &convert;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
+					vec, veclen, res->owner, &status);
+	if (tmpret >= 0) {
+		/* successfully sent and received */
+		ret = status;  /* this is already a dlm_status */
+		if (ret == DLM_RECOVERING) {
+			mlog(0, "node %u returned DLM_RECOVERING from convert "
+			     "message!\n", res->owner);
+		} else if (ret == DLM_MIGRATING) {
+			mlog(0, "node %u returned DLM_MIGRATING from convert "
+			     "message!\n", res->owner);
+		} else if (ret == DLM_FORWARD) {
+			mlog(0, "node %u returned DLM_FORWARD from convert "
+			     "message!\n", res->owner);
+		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
+			dlm_error(ret);
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			ret = DLM_RECOVERING;
+			mlog(0, "node %u died so returning DLM_RECOVERING "
+			     "from convert message!\n", res->owner);
+		} else {
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+/* handler for DLM_CONVERT_LOCK_MSG on master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
+ *          status from __dlmconvert_master
+ */
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+	struct dlm_lockstatus *lksb;
+	enum dlm_status status = DLM_NORMAL;
+	u32 flags;
+	int call_ast = 0, kick_thread = 0;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
+		status = DLM_IVBUFLEN;
+		dlm_error(status);
+		goto leave;
+	}
+
+	flags = be32_to_cpu(cnv->flags);
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		status = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+	     (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	list_for_each(iter, &res->granted) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.cookie == cnv->cookie &&
+		    lock->ml.node == cnv->node_idx) {
+			dlm_lock_get(lock);
+			break;
+		}
+		lock = NULL;
+	}
+	spin_unlock(&res->spinlock);
+	if (!lock) {
+		status = DLM_IVLOCKID;
+		dlm_error(status);
+		goto leave;
+	}
+
+	/* found the lock */
+	lksb = lock->lksb;
+
+	/* see if caller needed to get/put lvb */
+	if (flags & LKM_PUT_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
+	} else if (flags & LKM_GET_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_GET_LVB;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status == DLM_NORMAL) {
+		__dlm_lockres_reserve_ast(res);
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+		status = __dlmconvert_master(dlm, res, lock, flags,
+					     cnv->requested_type,
+					     &call_ast, &kick_thread);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+	}
+
+leave:
+	if (!lock)
+		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+			       "cookie=%"MLFu64"\n",
+		     cnv->cookie);
+	else
+		dlm_lock_put(lock);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}

+ 35 - 0
fs/ocfs2/dlm/dlmconvert.h

@@ -0,0 +1,35 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMCONVERT_H
+#define DLMCONVERT_H
+
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+
+#endif
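
The two prototypes above split a single convert operation by resource ownership. As a hedged sketch (a hypothetical caller, not code from this commit, assuming only what dlmconvert.c already shows -- that res->owner carries the node number of the mastering node), the dispatch reduces to:

	/* Hypothetical sketch: dispatch a convert request on resource
	 * ownership.  dlmconvert_master() grants or queues locally;
	 * dlmconvert_remote() sends DLM_CONVERT_LOCK_MSG to the master
	 * and waits for its verdict. */
	static enum dlm_status sketch_convert(struct dlm_ctxt *dlm,
					      struct dlm_lock_resource *res,
					      struct dlm_lock *lock,
					      int flags, int type)
	{
		if (res->owner == dlm->node_num)
			return dlmconvert_master(dlm, res, lock, flags, type);
		return dlmconvert_remote(dlm, res, lock, flags, type);
	}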

+ 246 - 0
fs/ocfs2/dlm/dlmdebug.c

@@ -0,0 +1,246 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.c
+ *
+ * debug functionality for the dlm
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdebug.h"
+
+#include "dlmdomain.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+	       res->lockname.len, res->lockname.name,
+	       res->owner, res->state);
+	spin_lock(&res->spinlock);
+	__dlm_print_one_lock_resource(res);
+	spin_unlock(&res->spinlock);
+}
+
+static void dlm_print_lock_queue(const char *name, struct list_head *queue)
+{
+	struct list_head *iter;
+	struct dlm_lock *lock;
+
+	mlog(ML_NOTICE, "  %s queue:\n", name);
+	list_for_each(iter, queue) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		spin_lock(&lock->spinlock);
+		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
+		     "cookie=%"MLFu64", ast=(empty=%c,pend=%c), "
+		     "bast=(empty=%c,pend=%c)\n",
+		     lock->ml.type, lock->ml.convert_type, lock->ml.node,
+		     lock->ml.cookie,
+		     list_empty(&lock->ast_list) ? 'y' : 'n',
+		     lock->ast_pending ? 'y' : 'n',
+		     list_empty(&lock->bast_list) ? 'y' : 'n',
+		     lock->bast_pending ? 'y' : 'n');
+		spin_unlock(&lock->spinlock);
+	}
+}
+
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+
+	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+	     res->lockname.len, res->lockname.name,
+	     res->owner, res->state);
+	mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
+	     res->last_used, list_empty(&res->purge) ? "no" : "yes");
+	/* the same format is used for all three lock queues */
+	dlm_print_lock_queue("granted", &res->granted);
+	dlm_print_lock_queue("converting", &res->converting);
+	dlm_print_lock_queue("blocked", &res->blocked);
+}
+
+void dlm_print_one_lock(struct dlm_lock *lockid)
+{
+	dlm_print_one_lock_resource(lockid->lockres);
+}
+EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *iter;
+	struct list_head *bucket;
+	int i;
+
+	mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
+		  dlm->name, dlm->node_num, dlm->key);
+	if (!dlm || !dlm->name) {
+		mlog(ML_ERROR, "dlm=%p\n", dlm);
+		return;
+	}
+
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			dlm_print_one_lock_resource(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static const char *dlm_errnames[] = {
+	[DLM_NORMAL] =			"DLM_NORMAL",
+	[DLM_GRANTED] =			"DLM_GRANTED",
+	[DLM_DENIED] =			"DLM_DENIED",
+	[DLM_DENIED_NOLOCKS] =		"DLM_DENIED_NOLOCKS",
+	[DLM_WORKING] =			"DLM_WORKING",
+	[DLM_BLOCKED] =			"DLM_BLOCKED",
+	[DLM_BLOCKED_ORPHAN] =		"DLM_BLOCKED_ORPHAN",
+	[DLM_DENIED_GRACE_PERIOD] =	"DLM_DENIED_GRACE_PERIOD",
+	[DLM_SYSERR] =			"DLM_SYSERR",
+	[DLM_NOSUPPORT] =		"DLM_NOSUPPORT",
+	[DLM_CANCELGRANT] =		"DLM_CANCELGRANT",
+	[DLM_IVLOCKID] =		"DLM_IVLOCKID",
+	[DLM_SYNC] =			"DLM_SYNC",
+	[DLM_BADTYPE] =			"DLM_BADTYPE",
+	[DLM_BADRESOURCE] =		"DLM_BADRESOURCE",
+	[DLM_MAXHANDLES] =		"DLM_MAXHANDLES",
+	[DLM_NOCLINFO] =		"DLM_NOCLINFO",
+	[DLM_NOLOCKMGR] =		"DLM_NOLOCKMGR",
+	[DLM_NOPURGED] =		"DLM_NOPURGED",
+	[DLM_BADARGS] =			"DLM_BADARGS",
+	[DLM_VOID] =			"DLM_VOID",
+	[DLM_NOTQUEUED] =		"DLM_NOTQUEUED",
+	[DLM_IVBUFLEN] =		"DLM_IVBUFLEN",
+	[DLM_CVTUNGRANT] =		"DLM_CVTUNGRANT",
+	[DLM_BADPARAM] =		"DLM_BADPARAM",
+	[DLM_VALNOTVALID] =		"DLM_VALNOTVALID",
+	[DLM_REJECTED] =		"DLM_REJECTED",
+	[DLM_ABORT] =			"DLM_ABORT",
+	[DLM_CANCEL] =			"DLM_CANCEL",
+	[DLM_IVRESHANDLE] =		"DLM_IVRESHANDLE",
+	[DLM_DEADLOCK] =		"DLM_DEADLOCK",
+	[DLM_DENIED_NOASTS] =		"DLM_DENIED_NOASTS",
+	[DLM_FORWARD] =			"DLM_FORWARD",
+	[DLM_TIMEOUT] =			"DLM_TIMEOUT",
+	[DLM_IVGROUPID] =		"DLM_IVGROUPID",
+	[DLM_VERS_CONFLICT] =		"DLM_VERS_CONFLICT",
+	[DLM_BAD_DEVICE_PATH] =		"DLM_BAD_DEVICE_PATH",
+	[DLM_NO_DEVICE_PERMISSION] =	"DLM_NO_DEVICE_PERMISSION",
+	[DLM_NO_CONTROL_DEVICE] =	"DLM_NO_CONTROL_DEVICE",
+	[DLM_RECOVERING] =		"DLM_RECOVERING",
+	[DLM_MIGRATING] =		"DLM_MIGRATING",
+	[DLM_MAXSTATS] =		"DLM_MAXSTATS",
+};
+
+static const char *dlm_errmsgs[] = {
+	[DLM_NORMAL] = 			"request in progress",
+	[DLM_GRANTED] = 		"request granted",
+	[DLM_DENIED] = 			"request denied",
+	[DLM_DENIED_NOLOCKS] = 		"request denied, out of system resources",
+	[DLM_WORKING] = 		"async request in progress",
+	[DLM_BLOCKED] = 		"lock request blocked",
+	[DLM_BLOCKED_ORPHAN] = 		"lock request blocked by an orphan lock",
+	[DLM_DENIED_GRACE_PERIOD] = 	"topological change in progress",
+	[DLM_SYSERR] = 			"system error",
+	[DLM_NOSUPPORT] = 		"unsupported",
+	[DLM_CANCELGRANT] = 		"can't cancel convert: already granted",
+	[DLM_IVLOCKID] = 		"bad lockid",
+	[DLM_SYNC] = 			"synchronous request granted",
+	[DLM_BADTYPE] = 		"bad resource type",
+	[DLM_BADRESOURCE] = 		"bad resource handle",
+	[DLM_MAXHANDLES] = 		"no more resource handles",
+	[DLM_NOCLINFO] = 		"can't contact cluster manager",
+	[DLM_NOLOCKMGR] = 		"can't contact lock manager",
+	[DLM_NOPURGED] = 		"can't contact purge daemon",
+	[DLM_BADARGS] = 		"bad api args",
+	[DLM_VOID] = 			"no status",
+	[DLM_NOTQUEUED] = 		"NOQUEUE was specified and request failed",
+	[DLM_IVBUFLEN] = 		"invalid resource name length",
+	[DLM_CVTUNGRANT] = 		"attempted to convert ungranted lock",
+	[DLM_BADPARAM] = 		"invalid lock mode specified",
+	[DLM_VALNOTVALID] = 		"value block has been invalidated",
+	[DLM_REJECTED] = 		"request rejected, unrecognized client",
+	[DLM_ABORT] = 			"blocked lock request cancelled",
+	[DLM_CANCEL] = 			"conversion request cancelled",
+	[DLM_IVRESHANDLE] = 		"invalid resource handle",
+	[DLM_DEADLOCK] = 		"deadlock recovery refused this request",
+	[DLM_DENIED_NOASTS] = 		"failed to allocate AST",
+	[DLM_FORWARD] = 		"request must wait for primary's response",
+	[DLM_TIMEOUT] = 		"timeout value for lock has expired",
+	[DLM_IVGROUPID] = 		"invalid group specification",
+	[DLM_VERS_CONFLICT] = 		"version conflicts prevent request handling",
+	[DLM_BAD_DEVICE_PATH] = 	"Locks device does not exist or path wrong",
+	[DLM_NO_DEVICE_PERMISSION] = 	"Client has insufficient perms for device",
+	[DLM_NO_CONTROL_DEVICE] = 	"Cannot set options on opened device",
+	[DLM_RECOVERING] = 		"lock resource being recovered",
+	[DLM_MIGRATING] = 		"lock resource being migrated",
+	[DLM_MAXSTATS] = 		"invalid error number",
+};
+
+const char *dlm_errmsg(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errmsgs[DLM_MAXSTATS];
+	return dlm_errmsgs[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errmsg);
+
+const char *dlm_errname(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errnames[DLM_MAXSTATS];
+	return dlm_errnames[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errname);
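
Both tables are indexed by enum dlm_status, and the two accessors clamp anything out of range to the DLM_MAXSTATS slot, so a caller can print any status without range checks of its own. A minimal usage sketch (a hypothetical caller, not from this commit):

	/* Hypothetical sketch: turn a dlm_status into readable log output. */
	static void sketch_report(enum dlm_status st)
	{
		if (st != DLM_NORMAL)
			printk(KERN_ERR "dlm status %s: %s\n",
			       dlm_errname(st), dlm_errmsg(st));
	}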

+ 30 - 0
fs/ocfs2/dlm/dlmdebug.h

@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
+
+#endif

+ 1469 - 0
fs/ocfs2/dlm/dlmdomain.c

@@ -0,0 +1,1469 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.c
+ *
+ * defines domain join / leave apis
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmdebug.h"
+#include "dlmdomain.h"
+
+#include "dlmver.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
+#include "cluster/masklog.h"
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, obey this ordering:
+ *    dlm_domain_lock
+ *    struct dlm_ctxt->spinlock
+ *    struct dlm_lock_resource->spinlock
+ *    struct dlm_ctxt->master_lock
+ *    struct dlm_ctxt->ast_lock
+ *    dlm_master_list_entry->spinlock
+ *    dlm_lock->spinlock
+ *
+ */
+
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(dlm_domains);
+static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
+
+#define DLM_DOMAIN_BACKOFF_MS 200
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
+
+void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+{
+	list_del_init(&lockres->list);
+	dlm_lockres_put(lockres);
+}
+
+void __dlm_insert_lockres(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res)
+{
+	struct list_head *bucket;
+	struct qstr *q;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	q = &res->lockname;
+	q->hash = full_name_hash(q->name, q->len);
+	bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+
+	/* get a reference for our hashtable */
+	dlm_lockres_get(res);
+
+	list_add_tail(&res->list, bucket);
+}
+
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					 const char *name,
+					 unsigned int len)
+{
+	unsigned int hash;
+	struct list_head *iter;
+	struct dlm_lock_resource *tmpres=NULL;
+	struct list_head *bucket;
+
+	mlog_entry("%.*s\n", len, name);
+
+	assert_spin_locked(&dlm->spinlock);
+
+	hash = full_name_hash(name, len);
+
+	bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+
+	/* check for pre-existing lock */
+	list_for_each(iter, bucket) {
+		tmpres = list_entry(iter, struct dlm_lock_resource, list);
+		if (tmpres->lockname.len == len &&
+		    memcmp(tmpres->lockname.name, name, len) == 0) {
+			dlm_lockres_get(tmpres);
+			break;
+		}
+
+		tmpres = NULL;
+	}
+	return tmpres;
+}
+
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+				    const char *name,
+				    unsigned int len)
+{
+	struct dlm_lock_resource *res;
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, len);
+	spin_unlock(&dlm->spinlock);
+	return res;
+}
+
+static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
+{
+	struct dlm_ctxt *tmp = NULL;
+	struct list_head *iter;
+
+	assert_spin_locked(&dlm_domain_lock);
+
+	/* tmp->name here is always NULL terminated,
+	 * but domain may not be! */
+	list_for_each(iter, &dlm_domains) {
+		tmp = list_entry (iter, struct dlm_ctxt, list);
+		if (strlen(tmp->name) == len &&
+		    memcmp(tmp->name, domain, len)==0)
+			break;
+		tmp = NULL;
+	}
+
+	return tmp;
+}
+
+/* For null terminated domain strings ONLY */
+static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
+{
+	assert_spin_locked(&dlm_domain_lock);
+
+	return __dlm_lookup_domain_full(domain, strlen(domain));
+}
+
+
+/* returns true on one of two conditions:
+ * 1) the domain does not exist
+ * 2) the domain exists and its state is "joined" */
+static int dlm_wait_on_domain_helper(const char *domain)
+{
+	int ret = 0;
+	struct dlm_ctxt *tmp = NULL;
+
+	spin_lock(&dlm_domain_lock);
+
+	tmp = __dlm_lookup_domain(domain);
+	if (!tmp)
+		ret = 1;
+	else if (tmp->dlm_state == DLM_CTXT_JOINED)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+	return ret;
+}
+
+static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
+{
+	if (dlm->resources)
+		free_page((unsigned long) dlm->resources);
+
+	if (dlm->name)
+		kfree(dlm->name);
+
+	kfree(dlm);
+}
+
+/* A little strange - this function will be called while holding
+ * dlm_domain_lock and is expected to be holding it on the way out. We
+ * will, however, drop and reacquire it multiple times */
+static void dlm_ctxt_release(struct kref *kref)
+{
+	struct dlm_ctxt *dlm;
+
+	dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
+
+	BUG_ON(dlm->num_joins);
+	BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
+
+	/* we may still be in the list if we hit an error during join. */
+	list_del_init(&dlm->list);
+
+	spin_unlock(&dlm_domain_lock);
+
+	mlog(0, "freeing memory from domain %s\n", dlm->name);
+
+	wake_up(&dlm_domain_events);
+
+	dlm_free_ctxt_mem(dlm);
+
+	spin_lock(&dlm_domain_lock);
+}
+
+void dlm_put(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm_domain_lock);
+	kref_put(&dlm->dlm_refs, dlm_ctxt_release);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_get(struct dlm_ctxt *dlm)
+{
+	kref_get(&dlm->dlm_refs);
+}
+
+/* given a questionable reference to a dlm object, gets a reference if
+ * it can find it in the list, otherwise returns NULL in which case
+ * you shouldn't trust your pointer. */
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
+{
+	struct list_head *iter;
+	struct dlm_ctxt *target = NULL;
+
+	spin_lock(&dlm_domain_lock);
+
+	list_for_each(iter, &dlm_domains) {
+		target = list_entry (iter, struct dlm_ctxt, list);
+
+		if (target == dlm) {
+			__dlm_get(target);
+			break;
+		}
+
+		target = NULL;
+	}
+
+	spin_unlock(&dlm_domain_lock);
+
+	return target;
+}
+
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm_domain_lock);
+	ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
+		(dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain_handlers(dlm);
+	dlm_complete_thread(dlm);
+	dlm_complete_recovery_thread(dlm);
+
+	/* We've left the domain. Now we can take ourselves out of the
+	 * list and allow the kref stuff to help us free the
+	 * memory. */
+	spin_lock(&dlm_domain_lock);
+	list_del_init(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+
+	/* Wake up anyone waiting for us to remove this domain */
+	wake_up(&dlm_domain_events);
+}
+
+static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+{
+	int i;
+	struct dlm_lock_resource *res;
+
+	mlog(0, "Migrating locks from domain %s\n", dlm->name);
+restart:
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		while (!list_empty(&dlm->resources[i])) {
+			res = list_entry(dlm->resources[i].next,
+				     struct dlm_lock_resource, list);
+			/* need reference when manually grabbing lockres */
+			dlm_lockres_get(res);
+			/* this should unhash the lockres
+			 * and exit with dlm->spinlock */
+			mlog(0, "purging res=%p\n", res);
+			if (dlm_lockres_is_dirty(dlm, res)) {
+				/* HACK!  this should absolutely go.
+				 * need to figure out why some empty
+				 * lockreses are still marked dirty */
+				mlog(ML_ERROR, "lockres %.*s dirty!\n",
+				     res->lockname.len, res->lockname.name);
+
+				spin_unlock(&dlm->spinlock);
+				dlm_kick_thread(dlm, res);
+				wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+				dlm_lockres_put(res);
+				goto restart;
+			}
+			dlm_purge_lockres(dlm, res);
+			dlm_lockres_put(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+}
+
+static int dlm_no_joining_node(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm->spinlock);
+	ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
+	spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
+{
+	/* Yikes, a double spinlock! I need domain_lock for the dlm
+	 * state and the dlm spinlock for join state... Sorry! */
+again:
+	spin_lock(&dlm_domain_lock);
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "Node %d is joining, we wait on it.\n",
+			  dlm->joining_node);
+		spin_unlock(&dlm->spinlock);
+		spin_unlock(&dlm_domain_lock);
+
+		wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
+		goto again;
+	}
+
+	dlm->dlm_state = DLM_CTXT_LEAVING;
+	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_print_nodes(struct dlm_ctxt *dlm)
+{
+	int node = -1;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
+
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, " node %d\n", node);
+	}
+}
+
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned int node;
+	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+	mlog_entry("%p %u %p", msg, len, data);
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	node = exit_msg->node_idx;
+
+	mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
+
+	spin_lock(&dlm->spinlock);
+	clear_bit(node, dlm->domain_map);
+	__dlm_print_nodes(dlm);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, node, 0);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+
+	return 0;
+}
+
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_exit_domain leave_msg;
+
+	mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
+		  node, dlm->name, dlm->node_num);
+
+	memset(&leave_msg, 0, sizeof(leave_msg));
+	leave_msg.node_idx = dlm->node_num;
+
+	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
+				    &leave_msg, sizeof(leave_msg), node,
+				    NULL);
+
+	mlog(0, "status return %d from o2net_send_message\n", status);
+
+	return status;
+}
+
+
+static void dlm_leave_domain(struct dlm_ctxt *dlm)
+{
+	int node, clear_node, status;
+
+	/* At this point we've migrated away all our locks and won't
+	 * accept mastership of new ones. The dlm is responsible for
+	 * almost nothing now. We make sure not to confuse any joining
+	 * nodes and then commence shutdown procedure. */
+
+	spin_lock(&dlm->spinlock);
+	/* Clear ourselves from the domain map */
+	clear_bit(dlm->node_num, dlm->domain_map);
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     0)) < O2NM_MAX_NODES) {
+		/* Drop the dlm spinlock. This is safe wrt the domain_map.
+		 * -nodes cannot be added now as the
+		 *   query_join handler knows to respond with OK_NO_MAP
+		 * -we catch the right network errors if a node is
+		 *   removed from the map while we're sending him the
+		 *   exit message. */
+		spin_unlock(&dlm->spinlock);
+
+		clear_node = 1;
+
+		status = dlm_send_one_domain_exit(dlm, node);
+		if (status < 0 &&
+		    status != -ENOPROTOOPT &&
+		    status != -ENOTCONN) {
+			mlog(ML_NOTICE, "Error %d sending domain exit message "
+			     "to node %d\n", status, node);
+
+			/* Not sure what to do here but let's sleep for
+			 * a bit in case this was a transient
+			 * error... */
+			msleep(DLM_DOMAIN_BACKOFF_MS);
+			clear_node = 0;
+		}
+
+		spin_lock(&dlm->spinlock);
+		/* If we're not clearing the node bit then we intend
+		 * to loop back around to try again. */
+		if (clear_node)
+			clear_bit(node, dlm->domain_map);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+int dlm_joined(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+
+	if (dlm->dlm_state == DLM_CTXT_JOINED)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+
+	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+		ret = 1;
+
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm)
+{
+	int leave = 0;
+
+	spin_lock(&dlm_domain_lock);
+	BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
+	BUG_ON(!dlm->num_joins);
+
+	dlm->num_joins--;
+	if (!dlm->num_joins) {
+		/* We mark it "in shutdown" now so new register
+		 * requests wait until we've completely left the
+		 * domain. Don't use DLM_CTXT_LEAVING yet as we still
+		 * want new domain joins to communicate with us at
+		 * least until we've completed migration of our
+		 * resources. */
+		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
+		leave = 1;
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	if (leave) {
+		mlog(0, "shutting down domain %s\n", dlm->name);
+
+		/* We changed dlm state, notify the thread */
+		dlm_kick_thread(dlm, NULL);
+
+		dlm_migrate_all_locks(dlm);
+		dlm_mark_domain_leaving(dlm);
+		dlm_leave_domain(dlm);
+		dlm_complete_dlm_shutdown(dlm);
+	}
+	dlm_put(dlm);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_domain);
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_query_join_request *query;
+	enum dlm_query_join_response response;
+	struct dlm_ctxt *dlm = NULL;
+
+	query = (struct dlm_query_join_request *) msg->buf;
+
+	mlog(0, "node %u wants to join domain %s\n", query->node_idx,
+		  query->domain);
+
+	/*
+	 * If heartbeat doesn't consider the node live, tell it
+	 * to back off and try again.  This gives heartbeat a chance
+	 * to catch up.
+	 */
+	if (!o2hb_check_node_heartbeating(query->node_idx)) {
+		mlog(0, "node %u is not in our live map yet\n",
+		     query->node_idx);
+
+		response = JOIN_DISALLOW;
+		goto respond;
+	}
+
+	response = JOIN_OK_NO_MAP;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+	/* Once the dlm ctxt is marked as leaving then we don't want
+	 * to be put in someone's domain map. */
+	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+		spin_lock(&dlm->spinlock);
+
+		if (dlm->dlm_state == DLM_CTXT_NEW &&
+		    dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* If this is a brand new context and we
+			 * haven't started our join process yet, then
+			 * the other node won the race. */
+			response = JOIN_OK_NO_MAP;
+		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* Disallow parallel joins. */
+			response = JOIN_DISALLOW;
+		} else {
+			/* Alright, we're fully a part of this domain,
+			 * so we keep some state as to who's joining
+			 * and indicate to him what needs to be fixed
+			 * up. */
+			response = JOIN_OK;
+			__dlm_set_joining_node(dlm, query->node_idx);
+		}
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+respond:
+	mlog(0, "We respond with %u\n", response);
+
+	return response;
+}
+
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_assert_joined *assert;
+	struct dlm_ctxt *dlm = NULL;
+
+	assert = (struct dlm_assert_joined *) msg->buf;
+
+	mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
+		  assert->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
+	/* XXX should we consider no dlm ctxt an error? */
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Alright, this node has officially joined our
+		 * domain. Set him in the map and clean up our
+		 * leftover join state. */
+		BUG_ON(dlm->joining_node != assert->node_idx);
+		set_bit(assert->node_idx, dlm->domain_map);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		__dlm_print_nodes(dlm);
+
+		/* notify anything attached to the heartbeat events */
+		dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_cancel_join *cancel;
+	struct dlm_ctxt *dlm = NULL;
+
+	cancel = (struct dlm_cancel_join *) msg->buf;
+
+	mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
+		  cancel->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
+
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Yikes, this guy wants to cancel his join. No
+		 * problem, we simply clean up our join state. */
+		BUG_ON(dlm->joining_node != cancel->node_idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_cancel_join cancel_msg;
+
+	memset(&cancel_msg, 0, sizeof(cancel_msg));
+	cancel_msg.node_idx = dlm->node_num;
+	cancel_msg.name_len = strlen(dlm->name);
+	memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
+
+	status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+				    &cancel_msg, sizeof(cancel_msg), node,
+				    NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	return status;
+}
+
+/* map_size should be in bytes. */
+static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
+				 unsigned long *node_map,
+				 unsigned int map_size)
+{
+	int status, tmpstat;
+	unsigned int node;
+
+	if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
+			 sizeof(unsigned long))) {
+		mlog(ML_ERROR,
+		     "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
+		     map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
+		return -EINVAL;
+	}
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		tmpstat = dlm_send_one_join_cancel(dlm, node);
+		if (tmpstat) {
+			mlog(ML_ERROR, "Error return %d cancelling join on "
+			     "node %d\n", tmpstat, node);
+			if (!status)
+				status = tmpstat;
+		}
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int dlm_request_join(struct dlm_ctxt *dlm,
+			    int node,
+			    enum dlm_query_join_response *response)
+{
+	int status, retval;
+	struct dlm_query_join_request join_msg;
+
+	mlog(0, "querying node %d\n", node);
+
+	memset(&join_msg, 0, sizeof(join_msg));
+	join_msg.node_idx = dlm->node_num;
+	join_msg.name_len = strlen(dlm->name);
+	memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+
+	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
+				    sizeof(join_msg), node, &retval);
+	if (status < 0 && status != -ENOPROTOOPT) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* -ENOPROTOOPT from the net code means the other side isn't
+	 * listening for our message type -- that's fine, it means
+	 * its dlm isn't up, so we can consider it a 'yes' but not
+	 * joined into the domain. */
+	if (status == -ENOPROTOOPT) {
+		status = 0;
+		*response = JOIN_OK_NO_MAP;
+	} else if (retval == JOIN_DISALLOW ||
+		   retval == JOIN_OK ||
+		   retval == JOIN_OK_NO_MAP) {
+		*response = retval;
+	} else {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
+		     node);
+	}
+
+	mlog(0, "status %d, node %d response is %d\n", status, node,
+		  *response);
+
+bail:
+	return status;
+}
+
+static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_assert_joined assert_msg;
+
+	mlog(0, "Sending join assert to node %u\n", node);
+
+	memset(&assert_msg, 0, sizeof(assert_msg));
+	assert_msg.node_idx = dlm->node_num;
+	assert_msg.name_len = strlen(dlm->name);
+	memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
+
+	status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+				    &assert_msg, sizeof(assert_msg), node,
+				    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
+				  unsigned long *node_map)
+{
+	int status, node, live;
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		do {
+			/* It is very important that this message be
+			 * received so we spin until either the node
+			 * has died or it gets the message. */
+			status = dlm_send_one_join_assert(dlm, node);
+
+			spin_lock(&dlm->spinlock);
+			live = test_bit(node, dlm->live_nodes_map);
+			spin_unlock(&dlm->spinlock);
+
+			if (status) {
+				mlog(ML_ERROR, "Error return %d asserting "
+				     "join on node %d\n", status, node);
+
+				/* give us some time between errors... */
+				if (live)
+					msleep(DLM_DOMAIN_BACKOFF_MS);
+			}
+		} while (status && live);
+	}
+}
+
+struct domain_join_ctxt {
+	unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+static int dlm_should_restart_join(struct dlm_ctxt *dlm,
+				   struct domain_join_ctxt *ctxt,
+				   enum dlm_query_join_response response)
+{
+	int ret;
+
+	if (response == JOIN_DISALLOW) {
+		mlog(0, "Latest response of disallow -- should restart\n");
+		return 1;
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* For now, we restart the process if the node maps have
+	 * changed at all */
+	ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
+		     sizeof(dlm->live_nodes_map));
+	spin_unlock(&dlm->spinlock);
+
+	if (ret)
+		mlog(0, "Node maps changed -- should restart\n");
+
+	return ret;
+}
+
+static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
+{
+	int status = 0, tmpstat, node;
+	struct domain_join_ctxt *ctxt;
+	enum dlm_query_join_response response;
+
+	mlog_entry("%p", dlm);
+
+	ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
+	if (!ctxt) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* group sem locking should work for us here -- we're already
+	 * registered for heartbeat events so filling this should be
+	 * atomic wrt getting those handlers called. */
+	o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+
+	spin_lock(&dlm->spinlock);
+	memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
+
+	__dlm_set_joining_node(dlm, dlm->node_num);
+
+	spin_unlock(&dlm->spinlock);
+
+	node = -1;
+	while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		status = dlm_request_join(dlm, node, &response);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Ok, either we got a response or the node doesn't have a
+		 * dlm up. */
+		if (response == JOIN_OK)
+			set_bit(node, ctxt->yes_resp_map);
+
+		if (dlm_should_restart_join(dlm, ctxt, response)) {
+			status = -EAGAIN;
+			goto bail;
+		}
+	}
+
+	mlog(0, "Yay, done querying nodes!\n");
+
+	/* Yay, everyone agrees we can join the domain. My domain is
+	 * made up of all nodes that were put in the
+	 * yes_resp_map. Copy that into our domain map and send a join
+	 * assert message to clean up everyone else's state. */
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->domain_map, ctxt->yes_resp_map,
+	       sizeof(ctxt->yes_resp_map));
+	set_bit(dlm->node_num, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
+
+	/* Joined state *must* be set before the joining node
+	 * information, otherwise the query_join handler may read no
+	 * current joiner but a state of NEW and tell joining nodes
+	 * we're not in the domain. */
+	spin_lock(&dlm_domain_lock);
+	dlm->dlm_state = DLM_CTXT_JOINED;
+	dlm->num_joins++;
+	spin_unlock(&dlm_domain_lock);
+
+bail:
+	spin_lock(&dlm->spinlock);
+	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	if (!status)
+		__dlm_print_nodes(dlm);
+	spin_unlock(&dlm->spinlock);
+
+	if (ctxt) {
+		/* Do we need to send a cancel message to any nodes? */
+		if (status < 0) {
+			tmpstat = dlm_send_join_cancels(dlm,
+							ctxt->yes_resp_map,
+							sizeof(ctxt->yes_resp_map));
+			if (tmpstat < 0)
+				mlog_errno(tmpstat);
+		}
+		kfree(ctxt);
+	}
+
+	mlog(0, "returning %d\n", status);
+	return status;
+}
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
+{
+	o2hb_unregister_callback(&dlm->dlm_hb_up);
+	o2hb_unregister_callback(&dlm->dlm_hb_down);
+	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
+}
+
+static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	mlog(0, "registering handlers.\n");
+
+	o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
+			    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	status = o2hb_register_callback(&dlm->dlm_hb_down);
+	if (status)
+		goto bail;
+
+	o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+			    dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	status = o2hb_register_callback(&dlm->dlm_hb_up);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_master_request),
+					dlm_master_request_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
+					sizeof(struct dlm_assert_master),
+					dlm_assert_master_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
+					sizeof(struct dlm_create_lock),
+					dlm_create_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
+					DLM_CONVERT_LOCK_MAX_LEN,
+					dlm_convert_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					DLM_UNLOCK_LOCK_MAX_LEN,
+					dlm_unlock_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
+					DLM_PROXY_AST_MAX_LEN,
+					dlm_proxy_ast_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
+					sizeof(struct dlm_exit_domain),
+					dlm_exit_domain_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_migrate_request),
+					dlm_migrate_request_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
+					DLM_MIG_LOCKRES_MAX_LEN,
+					dlm_mig_lockres_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
+					sizeof(struct dlm_master_requery),
+					dlm_master_requery_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_lock_request),
+					dlm_request_all_locks_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
+					sizeof(struct dlm_reco_data_done),
+					dlm_reco_data_done_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
+					sizeof(struct dlm_begin_reco),
+					dlm_begin_reco_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
+					sizeof(struct dlm_finalize_reco),
+					dlm_finalize_reco_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+bail:
+	if (status)
+		dlm_unregister_domain_handlers(dlm);
+
+	return status;
+}
+
+static int dlm_join_domain(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	BUG_ON(!dlm);
+
+	mlog(0, "Join domain %s\n", dlm->name);
+
+	status = dlm_register_domain_handlers(dlm);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_recovery_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	do {
+		unsigned int backoff;
+		status = dlm_try_to_join_domain(dlm);
+
+		/* If we're racing another node to the join, then we
+		 * need to back off temporarily and let them
+		 * complete. */
+		if (status == -EAGAIN) {
+			if (signal_pending(current)) {
+				status = -ERESTARTSYS;
+				goto bail;
+			}
+
+			/*
+			 * <chip> After you!
+			 * <dale> No, after you!
+			 * <chip> I insist!
+			 * <dale> But you first!
+			 * ...
+			 */
+			backoff = (unsigned int)(jiffies & 0x3);
+			backoff *= DLM_DOMAIN_BACKOFF_MS;
+			mlog(0, "backoff %d\n", backoff);
+			msleep(backoff);
+		}
+	} while (status == -EAGAIN);
+
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	wake_up(&dlm_domain_events);
+
+	if (status) {
+		dlm_unregister_domain_handlers(dlm);
+		dlm_complete_thread(dlm);
+		dlm_complete_recovery_thread(dlm);
+	}
+
+	return status;
+}
+
+static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
+				u32 key)
+{
+	int i;
+	struct dlm_ctxt *dlm = NULL;
+
+	dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
+	if (!dlm) {
+		mlog_errno(-ENOMEM);
+		goto leave;
+	}
+
+	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		mlog_errno(-ENOMEM);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->resources) {
+		mlog_errno(-ENOMEM);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+	memset(dlm->resources, 0, PAGE_SIZE);
+
+	for (i=0; i<DLM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&dlm->resources[i]);
+
+	strcpy(dlm->name, domain);
+	dlm->key = key;
+	dlm->node_num = o2nm_this_node();
+
+	spin_lock_init(&dlm->spinlock);
+	spin_lock_init(&dlm->master_lock);
+	spin_lock_init(&dlm->ast_lock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.received);
+	INIT_LIST_HEAD(&dlm->reco.node_data);
+	INIT_LIST_HEAD(&dlm->purge_list);
+	INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+	dlm->reco.state = 0;
+
+	INIT_LIST_HEAD(&dlm->pending_asts);
+	INIT_LIST_HEAD(&dlm->pending_basts);
+
+	mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
+		  dlm->recovery_map, &(dlm->recovery_map[0]));
+
+	memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
+	memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
+	memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+
+	dlm->dlm_thread_task = NULL;
+	dlm->dlm_reco_thread_task = NULL;
+	init_waitqueue_head(&dlm->dlm_thread_wq);
+	init_waitqueue_head(&dlm->dlm_reco_thread_wq);
+	init_waitqueue_head(&dlm->reco.event);
+	init_waitqueue_head(&dlm->ast_wq);
+	init_waitqueue_head(&dlm->migration_wq);
+	INIT_LIST_HEAD(&dlm->master_list);
+	INIT_LIST_HEAD(&dlm->mle_hb_events);
+
+	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
+	init_waitqueue_head(&dlm->dlm_join_events);
+
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	atomic_set(&dlm->local_resources, 0);
+	atomic_set(&dlm->remote_resources, 0);
+	atomic_set(&dlm->unknown_resources, 0);
+
+	spin_lock_init(&dlm->work_lock);
+	INIT_LIST_HEAD(&dlm->work_list);
+	INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
+
+	kref_init(&dlm->dlm_refs);
+	dlm->dlm_state = DLM_CTXT_NEW;
+
+	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
+
+	mlog(0, "context init: refcount %u\n",
+		  atomic_read(&dlm->dlm_refs.refcount));
+
+leave:
+	return dlm;
+}
+
+/*
+ * dlm_register_domain: one-time setup per "domain"
+ */
+struct dlm_ctxt * dlm_register_domain(const char *domain,
+			       u32 key)
+{
+	int ret;
+	struct dlm_ctxt *dlm = NULL;
+	struct dlm_ctxt *new_ctxt = NULL;
+
+	if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		mlog(ML_ERROR, "domain name length too long\n");
+		goto leave;
+	}
+
+	if (!o2hb_check_local_node_heartbeating()) {
+		mlog(ML_ERROR, "the local node has not been configured, or is "
+		     "not heartbeating\n");
+		ret = -EPROTO;
+		goto leave;
+	}
+
+	mlog(0, "register called for domain \"%s\"\n", domain);
+
+retry:
+	dlm = NULL;
+	if (signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	spin_lock(&dlm_domain_lock);
+
+	dlm = __dlm_lookup_domain(domain);
+	if (dlm) {
+		if (dlm->dlm_state != DLM_CTXT_JOINED) {
+			spin_unlock(&dlm_domain_lock);
+
+			mlog(0, "This ctxt is not joined yet!\n");
+			wait_event_interruptible(dlm_domain_events,
+						 dlm_wait_on_domain_helper(
+							 domain));
+			goto retry;
+		}
+
+		__dlm_get(dlm);
+		dlm->num_joins++;
+
+		spin_unlock(&dlm_domain_lock);
+
+		ret = 0;
+		goto leave;
+	}
+
+	/* doesn't exist */
+	if (!new_ctxt) {
+		spin_unlock(&dlm_domain_lock);
+
+		new_ctxt = dlm_alloc_ctxt(domain, key);
+		if (new_ctxt)
+			goto retry;
+
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	/* a little variable switch-a-roo here... */
+	dlm = new_ctxt;
+	new_ctxt = NULL;
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	ret = dlm_join_domain(dlm);
+	if (ret) {
+		mlog_errno(ret);
+		dlm_put(dlm);
+		goto leave;
+	}
+
+	ret = 0;
+leave:
+	if (new_ctxt)
+		dlm_free_ctxt_mem(new_ctxt);
+
+	if (ret < 0)
+		dlm = ERR_PTR(ret);
+
+	return dlm;
+}
+EXPORT_SYMBOL_GPL(dlm_register_domain);
+
+static LIST_HEAD(dlm_join_handlers);
+
+static void dlm_unregister_net_handlers(void)
+{
+	o2net_unregister_handler_list(&dlm_join_handlers);
+}
+
+static int dlm_register_net_handlers(void)
+{
+	int status = 0;
+
+	status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_query_join_request),
+					dlm_query_join_handler,
+					NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_assert_joined),
+					dlm_assert_joined_handler,
+					NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_cancel_join),
+					dlm_cancel_join_handler,
+					NULL, &dlm_join_handlers);
+
+bail:
+	if (status < 0)
+		dlm_unregister_net_handlers();
+
+	return status;
+}
+
+/* Domain eviction callback handling.
+ *
+ * The file system requires notification of node death *before* the
+ * dlm completes its recovery work, otherwise it may be able to
+ * acquire locks on resources requiring recovery. Since the dlm can
+ * evict a node from its domain *before* heartbeat fires, a similar
+ * mechanism is required. */
+
+/* Eviction is not expected to happen often, so a per-domain lock is
+ * not necessary. Eviction callbacks are allowed to sleep for short
+ * periods of time. */
+static DECLARE_RWSEM(dlm_callback_sem);
+
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num)
+{
+	struct list_head *iter;
+	struct dlm_eviction_cb *cb;
+
+	down_read(&dlm_callback_sem);
+	list_for_each(iter, &dlm->dlm_eviction_callbacks) {
+		cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
+
+		cb->ec_func(node_num, cb->ec_data);
+	}
+	up_read(&dlm_callback_sem);
+}
+
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data)
+{
+	INIT_LIST_HEAD(&cb->ec_item);
+	cb->ec_func = f;
+	cb->ec_data = data;
+}
+EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
+
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
+
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_del_init(&cb->ec_item);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
+
+static int __init dlm_init(void)
+{
+	int status;
+
+	dlm_print_version();
+
+	status = dlm_init_mle_cache();
+	if (status)
+		return -1;
+
+	status = dlm_register_net_handlers();
+	if (status) {
+		dlm_destroy_mle_cache();
+		return -1;
+	}
+
+	return 0;
+}
+
+static void __exit dlm_exit (void)
+{
+	dlm_unregister_net_handlers();
+	dlm_destroy_mle_cache();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(dlm_init);
+module_exit(dlm_exit);
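
dlm_register_domain(), dlm_unregister_domain() and the eviction-callback helpers are the exported surface of this file. The sketch below shows how a consumer such as a file system might wire them together; it is hypothetical and not part of the commit, and MY_KEY and my_handle_eviction() are invented names for illustration.

	/* Hypothetical consumer sketch: join a domain and hear about node
	 * eviction before dlm recovery completes.  MY_KEY and
	 * my_handle_eviction() are invented names. */
	static struct dlm_eviction_cb my_cb;

	static void my_handle_eviction(int node_num, void *data)
	{
		printk(KERN_NOTICE "node %d evicted from our domain\n",
		       node_num);
	}

	static struct dlm_ctxt *sketch_join(const char *domain)
	{
		struct dlm_ctxt *dlm;

		dlm = dlm_register_domain(domain, MY_KEY);
		if (IS_ERR(dlm))	/* ERR_PTR on failure, see above */
			return dlm;

		dlm_setup_eviction_cb(&my_cb, my_handle_eviction, NULL);
		dlm_register_eviction_cb(dlm, &my_cb);
		return dlm;
	}

	static void sketch_leave(struct dlm_ctxt *dlm)
	{
		dlm_unregister_eviction_cb(&my_cb);
		dlm_unregister_domain(dlm);	/* drops our join reference */
	}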

+ 36 - 0
fs/ocfs2/dlm/dlmdomain.h

@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDOMAIN_H
+#define DLMDOMAIN_H
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+
+int dlm_joined(struct dlm_ctxt *dlm);
+int dlm_shutting_down(struct dlm_ctxt *dlm);
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num);
+
+#endif

+ 640 - 0
fs/ocfs2/dlm/dlmfs.c

@@ -0,0 +1,640 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+
+#include <asm/uaccess.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#include "dlmfsver.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static struct super_operations dlmfs_ops;
+static struct file_operations dlmfs_file_operations;
+static struct inode_operations dlmfs_dir_inode_operations;
+static struct inode_operations dlmfs_root_inode_operations;
+static struct inode_operations dlmfs_file_inode_operations;
+static kmem_cache_t *dlmfs_inode_cache;
+
+struct workqueue_struct *user_dlm_worker;
+
+/*
+ * decodes a set of open flags into a valid lock level and a set of flags.
+ * returns < 0 if we have invalid flags
+ * flags which mean something to us:
+ * O_RDONLY -> PRMODE level
+ * O_WRONLY -> EXMODE level
+ *
+ * O_NONBLOCK -> LKM_NOQUEUE
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	if (open_flags & (O_WRONLY|O_RDWR))
+		*level = LKM_EXMODE;
+	else
+		*level = LKM_PRMODE;
+
+	*flags = 0;
+	if (open_flags & O_NONBLOCK)
+		*flags |= LKM_NOQUEUE;
+
+	return 0;
+}
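
Put differently, the open(2) mode *is* the lock request: O_RDONLY takes a
shared (PR) lock, O_WRONLY or O_RDWR takes an exclusive (EX) lock, and
O_NONBLOCK turns the open into a trylock. A hedged userspace sketch,
assuming dlmfs is mounted at /dlm (a conventional mount point, see the
mount example near the end of this file) and that the "mydomain" domain
directory already exists via mkdir(2); note from dlmfs_file_open() below
that a failed trylock surfaces as ETXTBSY:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* O_WRONLY -> LKM_EXMODE, O_NONBLOCK -> LKM_NOQUEUE (trylock);
	 * the lock file is created on first open */
	int fd = open("/dlm/mydomain/mylock",
		      O_CREAT | O_WRONLY | O_NONBLOCK, 0600);

	if (fd < 0) {
		if (errno == ETXTBSY)
			fprintf(stderr, "lock held elsewhere\n");
		else
			perror("open");
		return 1;
	}

	/* exclusive critical section goes here */

	close(fd);	/* dlmfs_file_release() drops the cluster lock */
	return 0;
}
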
+
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int status, level, flags;
+	struct dlmfs_filp_private *fp = NULL;
+	struct dlmfs_inode_private *ip;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
+	if (status < 0)
+		goto bail;
+
+	/* We don't want to honor O_APPEND at read/write time as it
+	 * doesn't make sense for LVB writes. */
+	file->f_flags &= ~O_APPEND;
+
+	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	if (!fp) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	fp->fp_lock_level = level;
+
+	ip = DLMFS_I(inode);
+
+	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
+	if (status < 0) {
+		/* this is a strange error to return here but I want
+		 * userspace to be able to distinguish a valid lock
+		 * request from one that simply couldn't be granted. */
+		if (flags & LKM_NOQUEUE && status == -EAGAIN)
+			status = -ETXTBSY;
+		kfree(fp);
+		goto bail;
+	}
+
+	file->private_data = fp;
+bail:
+	return status;
+}
+
+static int dlmfs_file_release(struct inode *inode,
+			      struct file *file)
+{
+	int level;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+	struct dlmfs_filp_private *fp =
+		(struct dlmfs_filp_private *) file->private_data;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "close called on inode %lu\n", inode->i_ino);
+
+	if (fp) {
+		level = fp->fp_lock_level;
+		if (level != LKM_IVMODE)
+			user_dlm_cluster_unlock(&ip->ip_lockres, level);
+
+		kfree(fp);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+static ssize_t dlmfs_file_read(struct file *filp,
+			       char __user *buf,
+			       size_t count,
+			       loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t readlen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	/* don't read past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		readlen = i_size_read(inode) - *ppos;
+	else
+		readlen = count;
+
+	lvb_buf = kmalloc(readlen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	user_dlm_read_lvb(inode, lvb_buf, readlen);
+	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+	readlen -= bytes_left;
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + readlen;
+
+	mlog(0, "read %zd bytes\n", readlen);
+	return readlen;
+}
+
+static ssize_t dlmfs_file_write(struct file *filp,
+				const char __user *buf,
+				size_t count,
+				loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t writelen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return -ENOSPC;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* don't write past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		writelen = i_size_read(inode) - *ppos;
+	else
+		writelen = count;
+
+	lvb_buf = kmalloc(writelen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	bytes_left = copy_from_user(lvb_buf, buf, writelen);
+	writelen -= bytes_left;
+	if (writelen)
+		user_dlm_write_lvb(inode, lvb_buf, writelen);
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + writelen;
+	mlog(0, "wrote %zd bytes\n", writelen);
+	return writelen;
+}
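
Reads and writes on an open lock file move the lock value block (LVB)
rather than ordinary file data: read() copies the current LVB out under
whatever lock level the open acquired, and write() stores new contents.
A short continuation of the sketch above, assuming fd came from an O_RDWR
open (EX mode, so both directions are permitted); the buffer size is an
assumption, since the file size is fixed at DLM_LVB_LEN by
dlmfs_get_inode() below but the constant's value is not spelled out in
this file:

char lvb[64];	/* assumed >= DLM_LVB_LEN; a shorter buffer just reads less */
ssize_t n;

n = read(fd, lvb, sizeof(lvb));		/* copy the current LVB out */
if (n > 0) {
	lvb[0]++;			/* mutate it while holding EX */
	lseek(fd, 0, SEEK_SET);		/* writes at EOF return -ENOSPC */
	n = write(fd, lvb, (size_t)n);	/* publish the new LVB */
}
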
+
+static void dlmfs_init_once(void *foo,
+			    kmem_cache_t *cachep,
+			    unsigned long flags)
+{
+	struct dlmfs_inode_private *ip =
+		(struct dlmfs_inode_private *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		ip->ip_dlm = NULL;
+		ip->ip_parent = NULL;
+
+		inode_init_once(&ip->ip_vfs_inode);
+	}
+}
+
+static struct inode *dlmfs_alloc_inode(struct super_block *sb)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
+	if (!ip)
+		return NULL;
+
+	return &ip->ip_vfs_inode;
+}
+
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
+}
+
+static void dlmfs_clear_inode(struct inode *inode)
+{
+	int status;
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return;
+
+	mlog(0, "inode %lu\n", inode->i_ino);
+
+	ip = DLMFS_I(inode);
+
+	if (S_ISREG(inode->i_mode)) {
+		status = user_dlm_destroy_lock(&ip->ip_lockres);
+		if (status < 0)
+			mlog_errno(status);
+		iput(ip->ip_parent);
+		goto clear_fields;
+	}
+
+	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+	/* we must be a directory. If required, let's unregister the
+	 * dlm context now. */
+	if (ip->ip_dlm)
+		user_dlm_unregister_context(ip->ip_dlm);
+clear_fields:
+	ip->ip_parent = NULL;
+	ip->ip_dlm = NULL;
+}
+
+static struct backing_dev_info dlmfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *dlmfs_get_root_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	int mode = S_IFDIR | 0755;
+	struct dlmfs_inode_private *ip;
+
+	if (inode) {
+		ip = DLMFS_I(inode);
+
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_nlink++;
+
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &dlmfs_root_inode_operations;
+	}
+
+	return inode;
+}
+
+static struct inode *dlmfs_get_inode(struct inode *parent,
+				     struct dentry *dentry,
+				     int mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode * inode = new_inode(sb);
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ip = DLMFS_I(inode);
+	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+
+	switch (mode & S_IFMT) {
+	default:
+		/* for now we don't support anything other than
+		 * directories and regular files. */
+		BUG();
+		break;
+	case S_IFREG:
+		inode->i_op = &dlmfs_file_inode_operations;
+		inode->i_fop = &dlmfs_file_operations;
+
+		i_size_write(inode,  DLM_LVB_LEN);
+
+		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
+
+		/* released at clear_inode time, this ensures that we
+		 * get to drop the dlm reference on each lock *before*
+		 * we call the unregister code for releasing parent
+		 * directories. */
+		ip->ip_parent = igrab(parent);
+		BUG_ON(!ip->ip_parent);
+		break;
+	case S_IFDIR:
+		inode->i_op = &dlmfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink ==
+		 * 2 (for "." entry) */
+		inode->i_nlink++;
+		break;
+	}
+
+	if (parent->i_mode & S_ISGID) {
+		inode->i_gid = parent->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done.
+ */
+/* SMP-safe */
+static int dlmfs_mkdir(struct inode * dir,
+		       struct dentry * dentry,
+		       int mode)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct qstr *domain = &dentry->d_name;
+	struct dlmfs_inode_private *ip;
+	struct dlm_ctxt *dlm;
+
+	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
+
+	/* verify that we have a proper domain */
+	if (domain->len >= O2NM_MAX_NAME_LEN) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid domain name for directory.\n");
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ip = DLMFS_I(inode);
+
+	dlm = user_dlm_register_context(domain);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
+		     status, domain->len, domain->name);
+		goto bail;
+	}
+	ip->ip_dlm = dlm;
+
+	dir->i_nlink++;
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	status = 0;
+bail:
+	if (status < 0)
+		iput(inode);
+	return status;
+}
+
+static int dlmfs_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int status = 0;
+	struct inode *inode;
+	struct qstr *name = &dentry->d_name;
+
+	mlog(0, "create %.*s\n", name->len, name->name);
+
+	/* verify name is valid and doesn't contain any dlm reserved
+	 * characters */
+	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
+	    name->name[0] == '$') {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
+		     name->name);
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+bail:
+	return status;
+}
+
+static int dlmfs_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	struct inode *inode = dentry->d_inode;
+
+	mlog(0, "unlink inode %lu\n", inode->i_ino);
+
+	/* if there are no current holders, or none that are waiting
+	 * to acquire a lock, this basically destroys our lockres. */
+	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
+	if (status < 0) {
+		mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
+		     dentry->d_name.len, dentry->d_name.name, status);
+		goto bail;
+	}
+	status = simple_unlink(dir, dentry);
+bail:
+	return status;
+}
+
+static int dlmfs_fill_super(struct super_block * sb,
+			    void * data,
+			    int silent)
+{
+	struct inode * inode;
+	struct dentry * root;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = DLMFS_MAGIC;
+	sb->s_op = &dlmfs_ops;
+	inode = dlmfs_get_root_inode(sb);
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static struct file_operations dlmfs_file_operations = {
+	.open		= dlmfs_file_open,
+	.release	= dlmfs_file_release,
+	.read		= dlmfs_file_read,
+	.write		= dlmfs_file_write,
+};
+
+static struct inode_operations dlmfs_dir_inode_operations = {
+	.create		= dlmfs_create,
+	.lookup		= simple_lookup,
+	.unlink		= dlmfs_unlink,
+};
+
+/* this way we can restrict mkdir to only the toplevel of the fs. */
+static struct inode_operations dlmfs_root_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= dlmfs_mkdir,
+	.rmdir		= simple_rmdir,
+};
+
+static struct super_operations dlmfs_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= dlmfs_alloc_inode,
+	.destroy_inode	= dlmfs_destroy_inode,
+	.clear_inode	= dlmfs_clear_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct inode_operations dlmfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
+
+static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+}
+
+static struct file_system_type dlmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ocfs2_dlmfs",
+	.get_sb		= dlmfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
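
With the type registered as "ocfs2_dlmfs" (a nodev filesystem whose
dentries are torn down by kill_litter_super()), reaching the mkdir/open
usage sketched earlier only requires a mount. A hedged C equivalent of the
usual one-liner; the /dlm mount point is a convention, nothing in this
file requires it:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of: mount -t ocfs2_dlmfs none /dlm */
	if (mount("none", "/dlm", "ocfs2_dlmfs", 0, NULL) < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
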
+
+static int __init init_dlmfs_fs(void)
+{
+	int status;
+	int cleanup_inode = 0, cleanup_worker = 0;
+
+	dlmfs_print_version();
+
+	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
+				sizeof(struct dlmfs_inode_private),
+				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+				dlmfs_init_once, NULL);
+	if (!dlmfs_inode_cache)
+		return -ENOMEM;
+	cleanup_inode = 1;
+
+	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	if (!user_dlm_worker) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_worker = 1;
+
+	status = register_filesystem(&dlmfs_fs_type);
+bail:
+	if (status) {
+		if (cleanup_inode)
+			kmem_cache_destroy(dlmfs_inode_cache);
+		if (cleanup_worker)
+			destroy_workqueue(user_dlm_worker);
+	} else
+		printk(KERN_INFO "OCFS2 User DLM kernel interface loaded\n");
+	return status;
+}
+
+static void __exit exit_dlmfs_fs(void)
+{
+	unregister_filesystem(&dlmfs_fs_type);
+
+	flush_workqueue(user_dlm_worker);
+	destroy_workqueue(user_dlm_worker);
+
+	if (kmem_cache_destroy(dlmfs_inode_cache))
+		printk(KERN_INFO "dlmfs_inode_cache: not all structures "
+		       "were freed\n");
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_dlmfs_fs);
+module_exit(exit_dlmfs_fs);

+ 42 - 0
fs/ocfs2/dlm/dlmfsver.c

@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmfsver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
+
+void dlmfs_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);

+ 31 - 0
fs/ocfs2/dlm/dlmfsver.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLMFS_VER_H
+#define DLMFS_VER_H
+
+void dlmfs_print_version(void);
+
+#endif /* DLMFS_VER_H */

+ 676 - 0
fs/ocfs2/dlm/dlmlock.c

@@ -0,0 +1,676 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmlock.c
+ *
+ * underlying calls for lock creation
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags);
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie);
+static void dlm_lock_release(struct kref *kref);
+static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+
+/* Tell us whether we can grant a new lock request.
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         none
+ *   held on exit:  none
+ * returns: 1 if the lock can be granted, 0 otherwise.
+ */
+static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
+				  struct dlm_lock *lock)
+{
+	struct list_head *iter;
+	struct dlm_lock *tmplock;
+
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	return 1;
+}
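
dlm_lock_compatible() itself is defined elsewhere in this patch
(dlmcommon.h), but for the three modes dlmlock() accepts further down it
reduces to the classic shared/exclusive table: NL is compatible with
everything, PR only with PR and NL, and EX only with NL. An equivalent
sketch, as an illustration rather than the in-tree body:

/* nonzero if a new lock of mode `request' can coexist with an
 * already-granted or converting lock of mode `existing' */
static int mode_compatible(int existing, int request)
{
	if (existing == LKM_NLMODE || request == LKM_NLMODE)
		return 1;		/* NL conflicts with nothing */
	if (existing == LKM_PRMODE && request == LKM_PRMODE)
		return 1;		/* readers share */
	return 0;			/* anything involving EX blocks */
}
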
+
+/* performs lock creation at the lockres master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOTQUEUED
+ */
+static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status = DLM_NORMAL;
+
+	mlog_entry("type=%d\n", lock->ml.type);
+
+	spin_lock(&res->spinlock);
+	/* if called from dlm_create_lock_handler, need to
+	 * ensure it will not sleep in dlm_wait_on_lockres */
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL &&
+	    lock->ml.node != dlm->node_num) {
+		/* erf.  state changed after lock was dropped. */
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		return status;
+	}
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+
+	if (dlm_can_grant_new_lock(res, lock)) {
+		mlog(0, "I can grant this lock right away\n");
+		/* got it right away */
+		lock->lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+
+		/* for the recovery lock, we can't allow the ast
+		 * to be queued since the dlmthread is already
+		 * frozen.  but the recovery lock is always locked
+		 * with LKM_NOQUEUE so we do not need the ast in
+		 * this special case */
+		if (!dlm_is_recovery_lock(res->lockname.name,
+					  res->lockname.len)) {
+			kick_thread = 1;
+			call_ast = 1;
+		}
+	} else {
+		/* for NOQUEUE request, unless we get the
+		 * lock right away, return DLM_NOTQUEUED */
+		if (flags & LKM_NOQUEUE)
+			status = DLM_NOTQUEUED;
+		else {
+			dlm_lock_get(lock);
+			list_add_tail(&lock->list, &res->blocked);
+			kick_thread = 1;
+		}
+	}
+
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	dlm_lockres_calc_usage(dlm, res);
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
+
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock)
+{
+	/* remove from local queue if it failed */
+	list_del_init(&lock->list);
+	lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+}
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_DENIED, DLM_RECOVERING, or net status
+ */
+static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	enum dlm_status status = DLM_DENIED;
+
+	mlog_entry("type=%d\n", lock->ml.type);
+	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+	     res->lockname.name, flags);
+
+	spin_lock(&res->spinlock);
+
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* add lock to local (secondary) queue */
+	dlm_lock_get(lock);
+	list_add_tail(&lock->list, &res->blocked);
+	lock->lock_pending = 1;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->lock_pending = 0;
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_lock(res, lock);
+		dlm_lock_put(lock);
+	}
+	spin_unlock(&res->spinlock);
+
+	dlm_lockres_calc_usage(dlm, res);
+
+	wake_up(&res->wq);
+	return status;
+}
+
+
+/* for remote lock creation.
+ * locking:
+ *   caller needs:  none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, or net status
+ */
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags)
+{
+	struct dlm_create_lock create;
+	int tmpret, status = 0;
+	enum dlm_status ret;
+
+	mlog_entry_void();
+
+	memset(&create, 0, sizeof(create));
+	create.node_idx = dlm->node_num;
+	create.requested_type = lock->ml.type;
+	create.cookie = lock->ml.cookie;
+	create.namelen = res->lockname.len;
+	create.flags = cpu_to_be32(flags);
+	memcpy(create.name, res->lockname.name, create.namelen);
+
+	tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
+				    sizeof(create), res->owner, &status);
+	if (tmpret >= 0) {
+		// successfully sent and received
+		ret = status;  // this is already a dlm_status
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			ret = DLM_RECOVERING;
+			mlog(0, "node %u died so returning DLM_RECOVERING "
+			     "from lock message!\n", res->owner);
+		} else {
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+void dlm_lock_get(struct dlm_lock *lock)
+{
+	kref_get(&lock->lock_refs);
+}
+
+void dlm_lock_put(struct dlm_lock *lock)
+{
+	kref_put(&lock->lock_refs, dlm_lock_release);
+}
+
+static void dlm_lock_release(struct kref *kref)
+{
+	struct dlm_lock *lock;
+
+	lock = container_of(kref, struct dlm_lock, lock_refs);
+
+	BUG_ON(!list_empty(&lock->list));
+	BUG_ON(!list_empty(&lock->ast_list));
+	BUG_ON(!list_empty(&lock->bast_list));
+	BUG_ON(lock->ast_pending);
+	BUG_ON(lock->bast_pending);
+
+	dlm_lock_detach_lockres(lock);
+
+	if (lock->lksb_kernel_allocated) {
+		mlog(0, "freeing kernel-allocated lksb\n");
+		kfree(lock->lksb);
+	}
+	kfree(lock);
+}
+
+/* associate a lock with its lockres, getting a ref on the lockres */
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res)
+{
+	dlm_lockres_get(res);
+	lock->lockres = res;
+}
+
+/* drop ref on lockres, if there is still one associated with lock */
+static void dlm_lock_detach_lockres(struct dlm_lock *lock)
+{
+	struct dlm_lock_resource *res;
+
+	res = lock->lockres;
+	if (res) {
+		lock->lockres = NULL;
+		mlog(0, "removing lock's lockres reference\n");
+		dlm_lockres_put(res);
+	}
+}
+
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie)
+{
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	INIT_LIST_HEAD(&newlock->bast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->ml.type = type;
+	newlock->ml.convert_type = LKM_IVMODE;
+	newlock->ml.highest_blocked = LKM_IVMODE;
+	newlock->ml.node = node;
+	newlock->ml.pad1 = 0;
+	newlock->ml.list = 0;
+	newlock->ml.flags = 0;
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->ml.cookie = cpu_to_be64(cookie);
+	newlock->ast_pending = 0;
+	newlock->bast_pending = 0;
+	newlock->convert_pending = 0;
+	newlock->lock_pending = 0;
+	newlock->unlock_pending = 0;
+	newlock->cancel_pending = 0;
+	newlock->lksb_kernel_allocated = 0;
+
+	kref_init(&newlock->lock_refs);
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb)
+{
+	struct dlm_lock *lock;
+	int kernel_allocated = 0;
+
+	lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+	if (!lock)
+		return NULL;
+
+	if (!lksb) {
+		/* zero memory only if kernel-allocated */
+		lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+		if (!lksb) {
+			kfree(lock);
+			return NULL;
+		}
+		kernel_allocated = 1;
+	}
+
+	dlm_init_lock(lock, type, node, cookie);
+	if (kernel_allocated)
+		lock->lksb_kernel_allocated = 1;
+	lock->lksb = lksb;
+	lksb->lockid = lock;
+	return lock;
+}
+
+/* handler for lock creation net message
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
+ */
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	char *name;
+	unsigned int namelen;
+
+	BUG_ON(!dlm);
+
+	mlog_entry_void();
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = create->name;
+	namelen = create->namelen;
+
+	status = DLM_IVBUFLEN;
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	status = DLM_SYSERR;
+	newlock = dlm_new_lock(create->requested_type,
+			       create->node_idx,
+			       be64_to_cpu(create->cookie), NULL);
+	if (!newlock) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	lksb = newlock->lksb;
+
+	if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
+		lksb->flags |= DLM_LKSB_GET_LVB;
+		mlog(0, "set DLM_LKSB_GET_LVB flag\n");
+	}
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, name, namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		mlog(0, "lockres recovering/migrating/in-progress\n");
+		goto leave;
+	}
+
+	dlm_lock_attach_lockres(newlock, res);
+
+	status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
+leave:
+	if (status != DLM_NORMAL)
+		if (newlock)
+			dlm_lock_put(newlock);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
+static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
+{
+	u64 tmpnode = node_num;
+
+	/* shift single byte of node num into top 8 bits */
+	tmpnode <<= 56;
+
+	spin_lock(&dlm_cookie_lock);
+	*cookie = (dlm_next_cookie | tmpnode);
+	if (++dlm_next_cookie & 0xff00000000000000ull) {
+		mlog(0, "This node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
+	spin_unlock(&dlm_cookie_lock);
+}
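
A cookie, then, is just an 8-bit node number packed above a 56-bit
per-node sequence counter, which is why the wrap check fires as soon as
the counter spills into the top byte. A worked example of the layout
(dlm_init_lock() byteswaps the result with cpu_to_be64() before it goes
on the wire):

/* node 3 issuing its first cookie:
 * dlm_get_next_cookie(3, &cookie) yields 0x0300000000000001 */
u64 cookie = ((u64)3 << 56) | 1;

u8  node = (u8)(cookie >> 56);		/* == 3 */
u64 seq  = cookie & ~(0xffull << 56);	/* == 1 */
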
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
+			struct dlm_lockstatus *lksb, int flags,
+			const char *name, dlm_astlockfunc_t *ast, void *data,
+			dlm_bastlockfunc_t *bast)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	int convert = 0, recovery = 0;
+
+	/* yes this function is a mess.
+	 * TODO: clean this up.  lots of common code in the
+	 *       lock and convert paths, especially in the retry blocks */
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
+		dlm_error(status);
+		goto error;
+	}
+
+	if (flags & ~LKM_VALID_FLAGS) {
+		dlm_error(status);
+		goto error;
+	}
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	if (recovery &&
+	    (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) {
+		dlm_error(status);
+		goto error;
+	}
+	if (convert && (flags & LKM_LOCAL)) {
+		mlog(ML_ERROR, "strange LOCAL convert request!\n");
+		goto error;
+	}
+
+	if (convert) {
+		/* CONVERT request */
+
+		/* if converting, must pass in a valid dlm_lock */
+		lock = lksb->lockid;
+		if (!lock) {
+			mlog(ML_ERROR, "NULL lock pointer in convert "
+			     "request\n");
+			goto error;
+		}
+
+		res = lock->lockres;
+		if (!res) {
+			mlog(ML_ERROR, "NULL lockres pointer in convert "
+			     "request\n");
+			goto error;
+		}
+		dlm_lockres_get(res);
+
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
+		 * static after the original lock call.  convert requests will
+		 * ensure that everything is the same, or return DLM_BADARGS.
+		 * this means that DLM_DENIED_NOASTS will never be returned.
+		 */
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			mlog(ML_ERROR, "new args:  lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lksb, ast, bast, data);
+			mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lock->lksb, lock->ast,
+			     lock->bast, lock->astdata);
+			goto error;
+		}
+retry_convert:
+		dlm_wait_for_recovery(dlm);
+
+		if (res->owner == dlm->node_num)
+			status = dlmconvert_master(dlm, res, lock, flags, mode);
+		else
+			status = dlmconvert_remote(dlm, res, lock, flags, mode);
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			/* for now, see how this works without sleeping
+			 * and just retry right away.  I suspect the reco
+			 * or migration will complete fast enough that
+			 * no waiting will be necessary */
+			mlog(0, "retrying convert with migration/recovery/"
+			     "in-progress\n");
+			msleep(100);
+			goto retry_convert;
+		}
+	} else {
+		u64 tmpcookie;
+
+		/* LOCK request */
+		status = DLM_BADARGS;
+		if (!name) {
+			dlm_error(status);
+			goto error;
+		}
+
+		status = DLM_IVBUFLEN;
+		if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
+			dlm_error(status);
+			goto error;
+		}
+
+		dlm_get_next_cookie(dlm->node_num, &tmpcookie);
+		lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
+		if (!lock) {
+			dlm_error(status);
+			goto error;
+		}
+
+		if (!recovery)
+			dlm_wait_for_recovery(dlm);
+
+		/* find or create the lock resource */
+		res = dlm_get_lock_resource(dlm, name, flags);
+		if (!res) {
+			status = DLM_IVLOCKID;
+			dlm_error(status);
+			goto error;
+		}
+
+		mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
+		mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
+
+		dlm_lock_attach_lockres(lock, res);
+		lock->ast = ast;
+		lock->bast = bast;
+		lock->astdata = data;
+
+retry_lock:
+		if (flags & LKM_VALBLK) {
+			mlog(0, "LKM_VALBLK passed by caller\n");
+
+			/* LVB requests for non PR, PW or EX locks are
+			 * ignored. */
+			if (mode < LKM_PRMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+
+		if (res->owner == dlm->node_num)
+			status = dlmlock_master(dlm, res, lock, flags);
+		else
+			status = dlmlock_remote(dlm, res, lock, flags);
+
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			mlog(0, "retrying lock with migration/"
+			     "recovery/in progress\n");
+			msleep(100);
+			dlm_wait_for_recovery(dlm);
+			goto retry_lock;
+		}
+
+		if (status != DLM_NORMAL) {
+			lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+			if (status != DLM_NOTQUEUED)
+				dlm_error(status);
+			goto error;
+		}
+	}
+
+error:
+	if (status != DLM_NORMAL) {
+		if (lock && !convert)
+			dlm_lock_put(lock);
+		// this is kind of unnecessary
+		lksb->status = status;
+	}
+
+	/* put lockres ref from the convert path
+	 * or from dlm_get_lock_resource */
+	if (res)
+		dlm_lockres_put(res);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmlock);
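
For reference, a minimal in-kernel caller of the export above. The ast and
bast signatures follow how this patch invokes them (one void * of astdata;
the bast additionally receives the blocked mode), but the exact typedefs
live in dlmapi.h rather than here, so treat them as an assumption:

static void my_ast(void *astdata)
{
	/* the request whose lksb is in *astdata has been granted;
	 * lksb->status carries the dlm_status */
}

static void my_bast(void *astdata, int blocked_type)
{
	/* another node wants a conflicting (blocked_type) lock;
	 * downconvert or unlock when convenient */
}

static enum dlm_status my_take_lock(struct dlm_ctxt *dlm,
				    struct dlm_lockstatus *lksb)
{
	memset(lksb, 0, sizeof(*lksb));

	/* the name must be 1..DLM_LOCKID_NAME_MAX bytes; adding
	 * LKM_NOQUEUE would make this a trylock that can return
	 * DLM_NOTQUEUED instead of blocking */
	return dlmlock(dlm, LKM_EXMODE, lksb, 0, "my_lock_name",
		       my_ast, lksb, my_bast);
}
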

+ 2664 - 0
fs/ocfs2/dlm/dlmmaster.c

@@ -0,0 +1,2664 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdebug.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
+#include "cluster/masklog.h"
+
+enum dlm_mle_type {
+	DLM_MLE_BLOCK,
+	DLM_MLE_MASTER,
+	DLM_MLE_MIGRATION
+};
+
+struct dlm_lock_name
+{
+	u8 len;
+	u8 name[DLM_LOCKID_NAME_MAX];
+};
+
+struct dlm_master_list_entry
+{
+	struct list_head list;
+	struct list_head hb_events;
+	struct dlm_ctxt *dlm;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	atomic_t woken;
+	struct kref mle_refs;
+	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	u8 master;
+	u8 new_master;
+	enum dlm_mle_type type;
+	struct o2hb_callback_func mle_hb_up;
+	struct o2hb_callback_func mle_hb_down;
+	union {
+		struct dlm_lock_resource *res;
+		struct dlm_lock_name name;
+	} u;
+};
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node,
+			      int idx);
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node,
+			    int idx);
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
+static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+				unsigned int namelen, void *nodemap,
+				u32 flags);
+
+static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle,
+				const char *name,
+				unsigned int namelen)
+{
+	struct dlm_lock_resource *res;
+
+	if (dlm != mle->dlm)
+		return 0;
+
+	if (mle->type == DLM_MLE_BLOCK ||
+	    mle->type == DLM_MLE_MIGRATION) {
+		if (namelen != mle->u.name.len ||
+		    memcmp(name, mle->u.name.name, namelen) != 0)
+			return 0;
+	} else {
+		res = mle->u.res;
+		if (namelen != res->lockname.len ||
+		    memcmp(res->lockname.name, name, namelen) != 0)
+			return 0;
+	}
+	return 1;
+}
+
+#if 0
+/* Code here is included but defined out as it aids debugging */
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+	int i = 0, refs;
+	char *type;
+	char attached;
+	u8 master;
+	unsigned int namelen;
+	const char *name;
+	struct kref *k;
+
+	k = &mle->mle_refs;
+	if (mle->type == DLM_MLE_BLOCK)
+		type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		type = "MAS";
+	else
+		type = "MIG";
+	refs = atomic_read(&k->refcount);
+	master = mle->master;
+	attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
+
+	if (mle->type != DLM_MLE_MASTER) {
+		namelen = mle->u.name.len;
+		name = mle->u.name.name;
+	} else {
+		namelen = mle->u.res->lockname.len;
+		name = mle->u.res->lockname.name;
+	}
+
+	mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u   %3u %c    (%d)%.*s\n",
+		  i, type, refs, master, mle->new_master, attached,
+		  namelen, namelen, name);
+}
+
+static void dlm_dump_mles(struct dlm_ctxt *dlm)
+{
+	struct dlm_master_list_entry *mle;
+	struct list_head *iter;
+	
+	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
+	mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
+	spin_lock(&dlm->master_lock);
+	list_for_each(iter, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+		dlm_print_one_mle(mle);
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+int dlm_dump_all_mles(const char __user *data, unsigned int len)
+{
+	struct list_head *iter;
+	struct dlm_ctxt *dlm;
+
+	spin_lock(&dlm_domain_lock);
+	list_for_each(iter, &dlm_domains) {
+		dlm = list_entry (iter, struct dlm_ctxt, list);
+		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
+		dlm_dump_mles(dlm);
+	}
+	spin_unlock(&dlm_domain_lock);
+	return len;
+}
+EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
+
+#endif  /*  0  */
+
+
+static kmem_cache_t *dlm_mle_cache = NULL;
+
+
+static void dlm_mle_release(struct kref *kref);
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen);
+static void dlm_put_mle(struct dlm_master_list_entry *mle);
+static void __dlm_put_mle(struct dlm_master_list_entry *mle);
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen);
+
+static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
+
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked);
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked);
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master);
+
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res);
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res);
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target);
+
+
+int dlm_is_host_down(int errno)
+{
+	switch (errno) {
+		case -EBADF:
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ETIMEDOUT:
+		case -ECONNABORTED:
+		case -ENETDOWN:
+		case -ENETUNREACH:
+		case -ENETRESET:
+		case -ESHUTDOWN:
+		case -ENOPROTOOPT:
+		case -EINVAL:   /* if returned from our tcp code,
+				   this means there is no socket */
+			return 1;
+	}
+	return 0;
+}
+
+
+/*
+ * MASTER LIST FUNCTIONS
+ */
+
+
+/*
+ * regarding master list entries and heartbeat callbacks:
+ *
+ * in order to avoid sleeping and allocation that occurs in
+ * heartbeat, master list entries are simply attached to the
+ * dlm's established heartbeat callbacks.  the mle is attached
+ * when it is created, and since the dlm->spinlock is held at
+ * that time, any heartbeat event will be properly discovered
+ * by the mle.  the mle needs to be detached from the
+ * dlm->mle_hb_events list as soon as heartbeat events are no
+ * longer useful to the mle, and before the mle is freed.
+ *
+ * as a general rule, heartbeat events are no longer needed by
+ * the mle once an "answer" regarding the lock master has been
+ * received.
+ */
+static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
+}
+
+
+static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	if (!list_empty(&mle->hb_events))
+		list_del_init(&mle->hb_events);
+}
+
+
+static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					    struct dlm_master_list_entry *mle)
+{
+	spin_lock(&dlm->spinlock);
+	__dlm_mle_detach_hb_events(dlm, mle);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* remove from list and free */
+static void __dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+	BUG_ON(!atomic_read(&mle->mle_refs.refcount));
+
+	kref_put(&mle->mle_refs, dlm_mle_release);
+}
+
+
+/* must not have any spinlocks coming in */
+static void dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
+{
+	kref_get(&mle->mle_refs);
+}
+
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	mle->dlm = dlm;
+	mle->type = type;
+	INIT_LIST_HEAD(&mle->list);
+	INIT_LIST_HEAD(&mle->hb_events);
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	atomic_set(&mle->woken, 0);
+	kref_init(&mle->mle_refs);
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	mle->master = O2NM_MAX_NODES;
+	mle->new_master = O2NM_MAX_NODES;
+
+	if (mle->type == DLM_MLE_MASTER) {
+		BUG_ON(!res);
+		mle->u.res = res;
+	} else if (mle->type == DLM_MLE_BLOCK) {
+		BUG_ON(!name);
+		memcpy(mle->u.name.name, name, namelen);
+		mle->u.name.len = namelen;
+	} else /* DLM_MLE_MIGRATION */ {
+		BUG_ON(!name);
+		memcpy(mle->u.name.name, name, namelen);
+		mle->u.name.len = namelen;
+	}
+
+	/* copy off the node_map and register hb callbacks on our copy */
+	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+	clear_bit(dlm->node_num, mle->vote_map);
+	clear_bit(dlm->node_num, mle->node_map);
+
+	/* attach the mle to the domain node up/down events */
+	__dlm_mle_attach_hb_events(dlm, mle);
+}
+
+
+/* returns 1 if found, 0 if not */
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen)
+{
+	struct dlm_master_list_entry *tmpmle;
+	struct list_head *iter;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	list_for_each(iter, &dlm->master_list) {
+		tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
+			continue;
+		dlm_get_mle(tmpmle);
+		*mle = tmpmle;
+		return 1;
+	}
+	return 0;
+}
+
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
+{
+	struct dlm_master_list_entry *mle;
+	struct list_head *iter;
+
+	assert_spin_locked(&dlm->spinlock);
+	
+	list_for_each(iter, &dlm->mle_hb_events) {
+		mle = list_entry(iter, struct dlm_master_list_entry, 
+				 hb_events);
+		if (node_up)
+			dlm_mle_node_up(dlm, mle, NULL, idx);
+		else
+			dlm_mle_node_down(dlm, mle, NULL, idx);
+	}
+}
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node, int idx)
+{
+	spin_lock(&mle->spinlock);
+
+	if (!test_bit(idx, mle->node_map))
+		mlog(0, "node %u already removed from nodemap!\n", idx);
+	else
+		clear_bit(idx, mle->node_map);
+
+	spin_unlock(&mle->spinlock);
+}
+
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node, int idx)
+{
+	spin_lock(&mle->spinlock);
+
+	if (test_bit(idx, mle->node_map))
+		mlog(0, "node %u already in node map!\n", idx);
+	else
+		set_bit(idx, mle->node_map);
+
+	spin_unlock(&mle->spinlock);
+}
+
+
+int dlm_init_mle_cache(void)
+{
+	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+					  sizeof(struct dlm_master_list_entry),
+					  0, SLAB_HWCACHE_ALIGN,
+					  NULL, NULL);
+	if (dlm_mle_cache == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void dlm_destroy_mle_cache(void)
+{
+	if (dlm_mle_cache)
+		kmem_cache_destroy(dlm_mle_cache);
+}
+
+static void dlm_mle_release(struct kref *kref)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
+	dlm = mle->dlm;
+
+	if (mle->type != DLM_MLE_MASTER) {
+		mlog(0, "calling mle_release for %.*s, type %d\n",
+		     mle->u.name.len, mle->u.name.name, mle->type);
+	} else {
+		mlog(0, "calling mle_release for %.*s, type %d\n",
+		     mle->u.res->lockname.len,
+		     mle->u.res->lockname.name, mle->type);
+	}
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* remove from list if not already */
+	if (!list_empty(&mle->list))
+		list_del_init(&mle->list);
+
+	/* detach the mle from the domain node up/down events */
+	__dlm_mle_detach_hb_events(dlm, mle);
+
+	/* NOTE: kfree under spinlock here.
+	 * if this is bad, we can move this to a freelist. */
+	kmem_cache_free(dlm_mle_cache, mle);
+}
+
+
+/*
+ * LOCK RESOURCE FUNCTIONS
+ */
+
+static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
+
+	if (owner == dlm->node_num)
+		atomic_inc(&dlm->local_resources);
+	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
+		atomic_inc(&dlm->unknown_resources);
+	else
+		atomic_inc(&dlm->remote_resources);
+
+	res->owner = owner;
+}
+
+void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res, u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner == res->owner)
+		return;
+
+	if (res->owner == dlm->node_num)
+		atomic_dec(&dlm->local_resources);
+	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
+		atomic_dec(&dlm->unknown_resources);
+	else
+		atomic_dec(&dlm->remote_resources);
+
+	dlm_set_lockres_owner(dlm, res, owner);
+}
+
+
+static void dlm_lockres_release(struct kref *kref)
+{
+	struct dlm_lock_resource *res;
+
+	res = container_of(kref, struct dlm_lock_resource, refs);
+
+	/* This should not happen -- all lockres' have a name
+	 * associated with them at init time. */
+	BUG_ON(!res->lockname.name);
+
+	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
+	     res->lockname.name);
+
+	/* By the time we're ready to blow this guy away, we shouldn't
+	 * be on any lists. */
+	BUG_ON(!list_empty(&res->list));
+	BUG_ON(!list_empty(&res->granted));
+	BUG_ON(!list_empty(&res->converting));
+	BUG_ON(!list_empty(&res->blocked));
+	BUG_ON(!list_empty(&res->dirty));
+	BUG_ON(!list_empty(&res->recovering));
+	BUG_ON(!list_empty(&res->purge));
+
+	kfree(res->lockname.name);
+
+	kfree(res);
+}
+
+void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+	kref_get(&res->refs);
+}
+
+void dlm_lockres_put(struct dlm_lock_resource *res)
+{
+	kref_put(&res->refs, dlm_lockres_release);
+}
+
+static void dlm_init_lockres(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res,
+			     const char *name, unsigned int namelen)
+{
+	char *qname;
+
+	/* If we memset here, we lose our reference to the kmalloc'd
+	 * res->lockname.name, so be sure to init every field
+	 * correctly! */
+
+	qname = (char *) res->lockname.name;
+	memcpy(qname, name, namelen);
+
+	res->lockname.len = namelen;
+	res->lockname.hash = full_name_hash(name, namelen);
+
+	init_waitqueue_head(&res->wq);
+	spin_lock_init(&res->spinlock);
+	INIT_LIST_HEAD(&res->list);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+	INIT_LIST_HEAD(&res->purge);
+	atomic_set(&res->asts_reserved, 0);
+	res->migration_pending = 0;
+
+	kref_init(&res->refs);
+
+	/* just for consistency */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+	spin_unlock(&res->spinlock);
+
+	res->state = DLM_LOCK_RES_IN_PROGRESS;
+
+	res->last_used = 0;
+
+	memset(res->lvb, 0, DLM_LVB_LEN);
+}
+
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+				   const char *name,
+				   unsigned int namelen)
+{
+	struct dlm_lock_resource *res;
+
+	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+	if (!res)
+		return NULL;
+
+	res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+	if (!res->lockname.name) {
+		kfree(res);
+		return NULL;
+	}
+
+	dlm_init_lockres(dlm, res, name, namelen);
+	return res;
+}
+
+/*
+ * lookup a lock resource by name.
+ * may already exist in the hashtable.
+ * lockid is null terminated
+ *
+ * if not, allocate enough for the lockres and for
+ * the temporary structure used in doing the mastering.
+ *
+ * also, do a lookup in the dlm->master_list to see
+ * if another node has begun mastering the same lock.
+ * if so, there should be a block entry in there
+ * for this name, and we should *not* attempt to master
+ * the lock here.   need to wait around for that node
+ * to assert_master (or die).
+ *
+ */
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+					  const char *lockid,
+					  int flags)
+{
+	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *alloc_mle = NULL;
+	int blocked = 0;
+	int ret, nodenum;
+	struct dlm_node_iter iter;
+	unsigned int namelen;
+	int tries = 0;
+
+	BUG_ON(!lockid);
+
+	namelen = strlen(lockid);
+
+	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
+
+lookup:
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "found in hash!\n");
+		if (res)
+			dlm_lockres_put(res);
+		res = tmpres;
+		goto leave;
+	}
+
+	if (!res) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "allocating a new resource\n");
+		/* nothing found and we need to allocate one. */
+		alloc_mle = (struct dlm_master_list_entry *)
+			kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+		if (!alloc_mle)
+			goto leave;
+		res = dlm_new_lockres(dlm, lockid, namelen);
+		if (!res)
+			goto leave;
+		goto lookup;
+	}
+
+	mlog(0, "no lockres found, allocated our own: %p\n", res);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE!  return right away */
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+		/* lockres still marked IN_PROGRESS */
+		goto wake_waiters;
+	}
+
+	/* check master list to see if another node has started mastering it */
+	spin_lock(&dlm->master_lock);
+
+	/* if we found a block, wait for lock to be mastered by another node */
+	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
+	if (blocked) {
+		if (mle->type == DLM_MLE_MASTER) {
+			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
+			BUG();
+		} else if (mle->type == DLM_MLE_MIGRATION) {
+			/* migration is in progress! */
+			/* the good news is that we now know the
+			 * "current" master (mle->master). */
+
+			spin_unlock(&dlm->master_lock);
+			assert_spin_locked(&dlm->spinlock);
+
+			/* set the lockres owner and hash it */
+			spin_lock(&res->spinlock);
+			dlm_set_lockres_owner(dlm, res, mle->master);
+			__dlm_insert_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+
+			/* master is known, detach */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			mle = NULL;
+			goto wake_waiters;
+		}
+	} else {
+		/* go ahead and try to master lock on this node */
+		mle = alloc_mle;
+		/* make sure this does not get freed below */
+		alloc_mle = NULL;
+		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
+		set_bit(dlm->node_num, mle->maybe_map);
+		list_add(&mle->list, &dlm->master_list);
+	}
+
+	/* at this point there is either a DLM_MLE_BLOCK or a
+	 * DLM_MLE_MASTER on the master list, so it's safe to add the
+	 * lockres to the hashtable.  anyone who finds the lock will
+	 * still have to wait on the IN_PROGRESS. */
+
+	/* finally add the lockres to its hash bucket */
+	__dlm_insert_lockres(dlm, res);
+	/* get an extra ref on the mle in case this is a BLOCK
+	 * if so, the creator of the BLOCK may try to put the last
+	 * ref at this time in the assert master handler, so we
+	 * need an extra one to keep from a bad ptr deref. */
+	dlm_get_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* must wait for lock to be mastered elsewhere */
+	if (blocked)
+		goto wait;
+
+redo_request:
+	ret = -EINVAL;
+	dlm_node_iter_init(mle->vote_map, &iter);
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = dlm_do_master_request(mle, nodenum);
+		if (ret < 0)
+			mlog_errno(ret);
+		if (mle->master != O2NM_MAX_NODES) {
+			/* found a master ! */
+			break;
+		}
+	}
+
+wait:
+	/* keep going until the response map includes all nodes */
+	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
+	if (ret < 0) {
+		mlog(0, "%s:%.*s: node map changed, redo the "
+		     "master request now, blocked=%d\n",
+		     dlm->name, res->lockname.len,
+		     res->lockname.name, blocked);
+		if (++tries > 20) {
+			mlog(ML_ERROR, "%s:%.*s: spinning on "
+			     "dlm_wait_for_lock_mastery, blocked=%d\n", 
+			     dlm->name, res->lockname.len, 
+			     res->lockname.name, blocked);
+			dlm_print_one_lock_resource(res);
+			/* dlm_print_one_mle(mle); */
+			tries = 0;
+		}
+		goto redo_request;
+	}
+
+	mlog(0, "lockres mastered by %u\n", res->owner);
+	/* make sure we never continue without this */
+	BUG_ON(res->owner == O2NM_MAX_NODES);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	/* put the extra ref */
+	dlm_put_mle(mle);
+
+wake_waiters:
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+leave:
+	/* need to free the unused mle */
+	if (alloc_mle)
+		kmem_cache_free(dlm_mle_cache, alloc_mle);
+
+	return res;
+}
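+
+/* Summary (sketch): the overall shape of dlm_get_lock_resource() above,
+ * in terms of its labels:
+ *
+ *	lookup:       find the lockres in the hash, else allocate + retry
+ *	redo_request: send a master request to every node in the vote map
+ *	wait:         sleep until a master asserts or the node map changes
+ *	wake_waiters: clear DLM_LOCK_RES_IN_PROGRESS and wake res->wq
+ *	leave:        free the preallocated mle if it went unused
+ */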
+
+
+#define DLM_MASTERY_TIMEOUT_MS   5000
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked)
+{
+	u8 m;
+	int ret, bit;
+	int map_changed, voting_done;
+	int assert, sleep;
+
+recheck:
+	ret = 0;
+	assert = 0;
+
+	/* check if another node has already become the owner */
+	spin_lock(&res->spinlock);
+	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	spin_unlock(&res->spinlock);
+
+	spin_lock(&mle->spinlock);
+	m = mle->master;
+	map_changed = (memcmp(mle->vote_map, mle->node_map,
+			      sizeof(mle->vote_map)) != 0);
+	voting_done = (memcmp(mle->vote_map, mle->response_map,
+			     sizeof(mle->vote_map)) == 0);
+
+	/* restart if we hit any errors */
+	if (map_changed) {
+		int b;
+		mlog(0, "%s: %.*s: node map changed, restarting\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
+		b = (mle->type == DLM_MLE_BLOCK);
+		if ((*blocked && !b) || (!*blocked && b)) {
+			mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     *blocked, b);
+			*blocked = b;
+		}
+		spin_unlock(&mle->spinlock);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto leave;
+		}
+		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
+		     "rechecking now\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		goto recheck;
+	}
+
+	if (m != O2NM_MAX_NODES) {
+		/* another node has done an assert!
+		 * all done! */
+		sleep = 0;
+	} else {
+		sleep = 1;
+		/* have all nodes responded? */
+		if (voting_done && !*blocked) {
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (dlm->node_num <= bit) {
+				/* my node number is lowest.
+				 * now tell other nodes that I am
+				 * mastering this. */
+				mle->master = dlm->node_num;
+				assert = 1;
+				sleep = 0;
+			}
+			/* if voting is done, but we have not received
+			 * an assert master yet, we must sleep */
+		}
+	}
+
+	spin_unlock(&mle->spinlock);
+
+	/* sleep if we haven't finished voting yet */
+	if (sleep) {
+		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
+
+		/*
+		if (atomic_read(&mle->mle_refs.refcount) < 2)
+			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
+			atomic_read(&mle->mle_refs.refcount),
+			res->lockname.len, res->lockname.name);
+		*/
+		atomic_set(&mle->woken, 0);
+		(void)wait_event_timeout(mle->wq,
+					 (atomic_read(&mle->woken) == 1),
+					 timeo);
+		if (res->owner == O2NM_MAX_NODES) {
+			mlog(0, "waiting again\n");
+			goto recheck;
+		}
+		mlog(0, "done waiting, master is %u\n", res->owner);
+		ret = 0;
+		goto leave;
+	}
+
+	ret = 0;   /* done */
+	if (assert) {
+		m = dlm->node_num;
+		mlog(0, "about to master %.*s here, this=%u\n",
+		     res->lockname.len, res->lockname.name, m);
+		ret = dlm_do_assert_master(dlm, res->lockname.name,
+					   res->lockname.len, mle->vote_map, 0);
+		if (ret) {
+			/* This is a failure in the network path,
+			 * not in the response to the assert_master
+			 * (any nonzero response is a BUG on this node).
+			 * Most likely a socket just got disconnected
+			 * due to node death. */
+			mlog_errno(ret);
+		}
+		/* no longer need to restart lock mastery.
+		 * all living nodes have been contacted. */
+		ret = 0;
+	}
+
+	/* set the lockres owner */
+	spin_lock(&res->spinlock);
+	dlm_change_lockres_owner(dlm, res, m);
+	spin_unlock(&res->spinlock);
+
+leave:
+	return ret;
+}
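+
+/* Sketch of the election rule used above: once every node in vote_map
+ * has responded and none has asserted, the lowest node number present
+ * in maybe_map may assert mastery.  A minimal model, as a hypothetical
+ * helper mirroring the check in dlm_wait_for_lock_mastery():
+ *
+ *	static int dlm_would_assert(struct dlm_master_list_entry *mle,
+ *				    u8 this_node)
+ *	{
+ *		int low = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ *		return this_node <= low;
+ *	}
+ *
+ * If maybe_map is empty, find_next_bit returns O2NM_MAX_NODES and the
+ * local node wins by default. */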
+
+struct dlm_bitmap_diff_iter
+{
+	int curnode;
+	unsigned long *orig_bm;
+	unsigned long *cur_bm;
+	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+enum dlm_node_state_change
+{
+	NODE_DOWN = -1,
+	NODE_NO_CHANGE = 0,
+	NODE_UP
+};
+
+static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
+				      unsigned long *orig_bm,
+				      unsigned long *cur_bm)
+{
+	unsigned long p1, p2;
+	int i;
+
+	iter->curnode = -1;
+	iter->orig_bm = orig_bm;
+	iter->cur_bm = cur_bm;
+
+	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
+		p1 = *(iter->orig_bm + i);
+		p2 = *(iter->cur_bm + i);
+		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
+	}
+}
+
+static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
+				     enum dlm_node_state_change *state)
+{
+	int bit;
+
+	if (iter->curnode >= O2NM_MAX_NODES)
+		return -ENOENT;
+
+	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
+			    iter->curnode+1);
+	if (bit >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+
+	/* if it was there in the original then this node died */
+	if (test_bit(bit, iter->orig_bm))
+		*state = NODE_DOWN;
+	else
+		*state = NODE_UP;
+
+	iter->curnode = bit;
+	return bit;
+}
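+
+/* Sketch of typical use of the diff iterator, as in
+ * dlm_restart_lock_mastery() below (old_map/new_map are placeholders
+ * for mle->vote_map and mle->node_map):
+ *
+ *	struct dlm_bitmap_diff_iter bdi;
+ *	enum dlm_node_state_change sc;
+ *	int node;
+ *
+ *	dlm_bitmap_diff_iter_init(&bdi, old_map, new_map);
+ *	while ((node = dlm_bitmap_diff_iter_next(&bdi, &sc)) >= 0) {
+ *		if (sc == NODE_DOWN)
+ *			handle_death(node);	(set in old_map only)
+ *		else
+ *			handle_join(node);	(set in new_map only)
+ *	}
+ *
+ * handle_death/handle_join are hypothetical stand-ins for the NODE_DOWN
+ * and NODE_UP branches. */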
+
+
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked)
+{
+	struct dlm_bitmap_diff_iter bdi;
+	enum dlm_node_state_change sc;
+	int node;
+	int ret = 0;
+
+	mlog(0, "something happened such that the "
+	     "master process may need to be restarted!\n");
+
+	assert_spin_locked(&mle->spinlock);
+
+	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
+	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	while (node >= 0) {
+		if (sc == NODE_UP) {
+			/* a node came up.  easy.  might not even need
+			 * to talk to it if its node number is higher
+			 * or if we are already blocked. */
+			mlog(0, "node up! %d\n", node);
+			if (blocked)
+				goto next;
+
+			if (node > dlm->node_num) {
+				mlog(0, "node > this node. skipping.\n");
+				goto next;
+			}
+
+			/* redo the master request, but only for the new node */
+			mlog(0, "sending request to new node\n");
+			clear_bit(node, mle->response_map);
+			set_bit(node, mle->vote_map);
+		} else {
+			mlog(ML_ERROR, "node down! %d\n", node);
+
+			/* if the node wasn't involved in mastery skip it,
+			 * but clear it out from the maps so that it will
+			 * not affect mastery of this lockres */
+			clear_bit(node, mle->response_map);
+			clear_bit(node, mle->vote_map);
+			if (!test_bit(node, mle->maybe_map))
+				goto next;
+
+			/* if we're already blocked on lock mastery, and the
+			 * dead node wasn't the expected master, or there is
+			 * another node in the maybe_map, keep waiting */
+			if (blocked) {
+				int lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES, 0);
+
+				/* act like it was never there */
+				clear_bit(node, mle->maybe_map);
+
+				if (node != lowest)
+					goto next;
+
+				mlog(ML_ERROR, "expected master %u died while "
+				     "this node was blocked waiting on it!\n",
+				     node);
+				lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES,
+						       lowest+1);
+				if (lowest < O2NM_MAX_NODES) {
+					mlog(0, "still blocked. waiting "
+					     "on %u now\n", lowest);
+					goto next;
+				}
+
+				/* mle is an MLE_BLOCK, but there is now
+				 * nothing left to block on.  we need to return
+				 * all the way back out and try again with
+				 * an MLE_MASTER. dlm_do_local_recovery_cleanup
+				 * has already run, so the mle refcount is ok */
+				mlog(0, "no longer blocking. we can "
+				     "try to master this here\n");
+				mle->type = DLM_MLE_MASTER;
+				memset(mle->maybe_map, 0,
+				       sizeof(mle->maybe_map));
+				memset(mle->response_map, 0,
+				       sizeof(mle->response_map));
+				memcpy(mle->vote_map, mle->node_map,
+				       sizeof(mle->node_map));
+				mle->u.res = res;
+				set_bit(dlm->node_num, mle->maybe_map);
+
+				ret = -EAGAIN;
+				goto next;
+			}
+
+			clear_bit(node, mle->maybe_map);
+			if (node > dlm->node_num)
+				goto next;
+
+			mlog(0, "dead node in map!\n");
+			/* yuck. go back and re-contact all nodes
+			 * in the vote_map, removing this node. */
+			memset(mle->response_map, 0,
+			       sizeof(mle->response_map));
+		}
+		ret = -EAGAIN;
+next:
+		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	}
+	return ret;
+}
+
+
+/*
+ * DLM_MASTER_REQUEST_MSG
+ *
+ * returns: 0 on success,
+ *          -errno on a network error
+ *
+ * on error, the caller should assume the target node is "dead"
+ *
+ */
+
+static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+	struct dlm_master_request request;
+	int ret, response = 0, resend;
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->node_num;
+
+	BUG_ON(mle->type == DLM_MLE_MIGRATION);
+
+	if (mle->type != DLM_MLE_MASTER) {
+		request.namelen = mle->u.name.len;
+		memcpy(request.name, mle->u.name.name, request.namelen);
+	} else {
+		request.namelen = mle->u.res->lockname.len;
+		memcpy(request.name, mle->u.res->lockname.name,
+			request.namelen);
+	}
+
+again:
+	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
+				 sizeof(request), to, &response);
+	if (ret < 0)  {
+		if (ret == -ESRCH) {
+			/* should never happen */
+			mlog(ML_ERROR, "TCP stack not ready!\n");
+			BUG();
+		} else if (ret == -EINVAL) {
+			mlog(ML_ERROR, "bad args passed to o2net!\n");
+			BUG();
+		} else if (ret == -ENOMEM) {
+			mlog(ML_ERROR, "out of memory while trying to send "
+			     "network message!  retrying\n");
+			/* this is totally crude */
+			msleep(50);
+			goto again;
+		} else if (!dlm_is_host_down(ret)) {
+			/* not a network error. bad. */
+			mlog_errno(ret);
+			mlog(ML_ERROR, "unhandled error!\n");
+			BUG();
+		}
+		/* all other errors should be network errors,
+		 * and likely indicate node death */
+		mlog(ML_ERROR, "link to %d went down!\n", to);
+		goto out;
+	}
+
+	ret = 0;
+	resend = 0;
+	spin_lock(&mle->spinlock);
+	switch (response) {
+	case DLM_MASTER_RESP_YES:
+		set_bit(to, mle->response_map);
+		mlog(0, "node %u is the master, response=YES\n", to);
+		mle->master = to;
+		break;
+	case DLM_MASTER_RESP_NO:
+		mlog(0, "node %u not master, response=NO\n", to);
+		set_bit(to, mle->response_map);
+		break;
+	case DLM_MASTER_RESP_MAYBE:
+		mlog(0, "node %u not master, response=MAYBE\n", to);
+		set_bit(to, mle->response_map);
+		set_bit(to, mle->maybe_map);
+		break;
+	case DLM_MASTER_RESP_ERROR:
+		mlog(0, "node %u hit an error, resending\n", to);
+		resend = 1;
+		response = 0;
+		break;
+	default:
+		mlog(ML_ERROR, "bad response! %u\n", response);
+		BUG();
+	}
+	spin_unlock(&mle->spinlock);
+	if (resend) {
+		/* this is also totally crude */
+		msleep(50);
+		goto again;
+	}
+
+out:
+	return ret;
+}
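+
+/* Sketch of the caller-side contract stated above: any negative return
+ * means the target should be treated as dead, so callers log it and
+ * move on, exactly as the redo_request loop in dlm_get_lock_resource()
+ * does:
+ *
+ *	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+ *		ret = dlm_do_master_request(mle, nodenum);
+ *		if (ret < 0)
+ *			mlog_errno(ret);	(node presumed dead; continue)
+ *		if (mle->master != O2NM_MAX_NODES)
+ *			break;			(someone already asserted)
+ *	}
+ */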
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm->master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res;
+	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	char *name;
+	unsigned int namelen;
+	int found, ret;
+	int set_maybe;
+
+	if (!dlm_grab(dlm))
+		return DLM_MASTER_RESP_NO;
+
+	if (!dlm_domain_fully_joined(dlm)) {
+		response = DLM_MASTER_RESP_NO;
+		goto send_response;
+	}
+
+	name = request->name;
+	namelen = request->namelen;
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		response = DLM_IVBUFLEN;
+		goto send_response;
+	}
+
+way_up_top:
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			spin_unlock(&res->spinlock);
+			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
+			     "being recovered\n");
+			response = DLM_MASTER_RESP_ERROR;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		if (res->owner == dlm->node_num) {
+			u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
+			spin_unlock(&res->spinlock);
+			// mlog(0, "this node is the master\n");
+			response = DLM_MASTER_RESP_YES;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+
+			/* this node is the owner.
+			 * there is some extra work that needs to
+			 * happen now.  the requesting node has
+			 * caused all nodes up to this one to
+			 * create mles.  this node now needs to
+			 * go back and clean those up. */
+			mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
+			     dlm->node_num, res->lockname.len, res->lockname.name);
+			ret = dlm_dispatch_assert_master(dlm, res, 1,
+							 request->node_idx,
+							 flags);
+			if (ret < 0) {
+				mlog(ML_ERROR, "failed to dispatch assert "
+				     "master work\n");
+				response = DLM_MASTER_RESP_ERROR;
+			}
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			// mlog(0, "node %u is the master\n", res->owner);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		/* ok, there is no owner.  either this node is
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			mlog(ML_ERROR, "lock with no owner should be "
+			     "in-progress!\n");
+			BUG();
+		}
+
+		// mlog(0, "lockres is in progress...\n");
+		spin_lock(&dlm->master_lock);
+		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+		if (!found) {
+			mlog(ML_ERROR, "no mle found for this lock!\n");
+			BUG();
+		}
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK) {
+			// mlog(0, "this node is waiting for "
+			// "lockres to be mastered\n");
+			response = DLM_MASTER_RESP_NO;
+		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "node %u is master, but trying to migrate to "
+			     "node %u.\n", tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				mlog(ML_ERROR, "no owner on lockres, but this "
+				     "node is trying to migrate it to %u?!\n",
+				     tmpmle->new_master);
+				BUG();
+			} else {
+				/* the real master can respond on its own */
+				response = DLM_MASTER_RESP_NO;
+			}
+		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			set_maybe = 0;
+			if (tmpmle->master == dlm->node_num)
+				response = DLM_MASTER_RESP_YES;
+			else
+				response = DLM_MASTER_RESP_NO;
+		} else {
+			// mlog(0, "this node is attempting to "
+			// "master lockres\n");
+			response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+
+		spin_unlock(&dlm->master_lock);
+		spin_unlock(&res->spinlock);
+
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+		if (mle)
+			kmem_cache_free(dlm_mle_cache, mle);
+		goto send_response;
+	}
+
+	/*
+	 * lockres doesn't exist on this node
+	 * if there is an MLE_BLOCK, return NO
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO
+	 */
+	spin_lock(&dlm->master_lock);
+	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// mlog(0, "no mle found\n");
+		if (!mle) {
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
+
+			mle = (struct dlm_master_list_entry *)
+				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+			if (!mle) {
+				// bad bad bad... this sucks.
+				response = DLM_MASTER_RESP_ERROR;
+				goto send_response;
+			}
+			spin_lock(&dlm->spinlock);
+			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
+					 name, namelen);
+			spin_unlock(&dlm->spinlock);
+			goto way_up_top;
+		}
+
+		// mlog(0, "this is second time thru, already allocated, "
+		// "add the block.\n");
+		set_bit(request->node_idx, mle->maybe_map);
+		list_add(&mle->list, &dlm->master_list);
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// mlog(0, "mle was found\n");
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "migration mle was found (%u->%u)\n",
+			     tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				mlog(ML_ERROR, "no lockres, but migration mle "
+				     "says that this node is master!\n");
+				BUG();
+			}
+			/* real master can respond on its own */
+			response = DLM_MASTER_RESP_NO;
+		} else {
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				set_maybe = 0;
+			} else
+				response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (found) {
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+	}
+send_response:
+	dlm_put(dlm);
+	return response;
+}
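+
+/* Sketch: the "lockres doesn't exist" half of the handler above reduces
+ * to the table in its comment.  As a hypothetical pure function (the
+ * real code also sets maybe_map bits and manages mle refcounts):
+ *
+ *	static u8 dlm_no_lockres_response(struct dlm_master_list_entry *mle,
+ *					  u8 this_node)
+ *	{
+ *		if (!mle)
+ *			return DLM_MASTER_RESP_NO;	(a BLOCK mle is added)
+ *		if (mle->type == DLM_MLE_BLOCK ||
+ *		    mle->type == DLM_MLE_MIGRATION)
+ *			return DLM_MASTER_RESP_NO;
+ *		return (mle->master == this_node) ? DLM_MASTER_RESP_YES
+ *						  : DLM_MASTER_RESP_MAYBE;
+ *	}
+ */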
+
+/*
+ * DLM_ASSERT_MASTER_MSG
+ */
+
+
+/*
+ * NOTE: this can be used for debugging
+ * can periodically run all locks owned by this node
+ * and re-assert across the cluster...
+ */
+static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+				unsigned int namelen, void *nodemap,
+				u32 flags)
+{
+	struct dlm_assert_master assert;
+	int to, tmpret;
+	struct dlm_node_iter iter;
+	int ret = 0;
+
+	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+	/* note that if this nodemap is empty, it returns 0 */
+	dlm_node_iter_init(nodemap, &iter);
+	while ((to = dlm_node_iter_next(&iter)) >= 0) {
+		int r = 0;
+		mlog(0, "sending assert master to %d (%.*s)\n", to,
+		     namelen, lockname);
+		memset(&assert, 0, sizeof(assert));
+		assert.node_idx = dlm->node_num;
+		assert.namelen = namelen;
+		memcpy(assert.name, lockname, namelen);
+		assert.flags = cpu_to_be32(flags);
+
+		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
+					    &assert, sizeof(assert), to, &r);
+		if (tmpret < 0) {
+			mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+			if (!dlm_is_host_down(tmpret)) {
+				mlog(ML_ERROR, "unhandled error!\n");
+				BUG();
+			}
+			/* a node died.  finish out the rest of the nodes. */
+			mlog(ML_ERROR, "link to %d went down!\n", to);
+			/* any nonzero status return will do */
+			ret = tmpret;
+		} else if (r < 0) {
+			/* ok, something is horribly messed up.  kill thyself. */
+			mlog(ML_ERROR, "during assert master of %.*s to %u, "
+			     "got %d.\n", namelen, lockname, to, r);
+			dlm_dump_lock_resources(dlm);
+			BUG();
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * locks that can be taken here:
+ * dlm->spinlock
+ * res->spinlock
+ * mle->spinlock
+ * dlm->master_list
+ *
+ * if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	char *name;
+	unsigned int namelen;
+	u32 flags;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	name = assert->name;
+	namelen = assert->namelen;
+	flags = be32_to_cpu(assert->flags);
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length!\n");
+		goto done;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+	if (flags)
+		mlog(0, "assert_master with flags: %u\n", flags);
+
+	/* find the MLE */
+	spin_lock(&dlm->master_lock);
+	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
+		/* not an error, could be master just re-asserting */
+		mlog(0, "just got an assert_master from %u, but no "
+		     "MLE for it! (%.*s)\n", assert->node_idx,
+		     namelen, name);
+	} else {
+		int bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES) {
+			/* not necessarily an error, though less likely.
+			 * could be master just re-asserting. */
+			mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
+			     "is asserting! (%.*s)\n", assert->node_idx,
+			     namelen, name);
+		} else if (bit != assert->node_idx) {
+			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+				mlog(0, "master %u was found, %u should "
+				     "back off\n", assert->node_idx, bit);
+			} else {
+				/* with the fix for bug 569, a higher node
+				 * number winning the mastery will respond
+				 * YES to mastery requests, but this node
+				 * had no way of knowing.  let it pass. */
+				mlog(ML_ERROR, "%u is the lowest node, "
+				     "%u is asserting. (%.*s)  %u must "
+				     "have begun after %u won.\n", bit,
+				     assert->node_idx, namelen, name, bit,
+				     assert->node_idx);
+			}
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING)  {
+			mlog(ML_ERROR, "%u asserting but %.*s is "
+			     "RECOVERING!\n", assert->node_idx, namelen, name);
+			goto kill;
+		}
+		if (!mle) {
+			if (res->owner != assert->node_idx) {
+				mlog(ML_ERROR, "assert_master from "
+				     "%u, but current owner is "
+				     "%u! (%.*s)\n",
+				     assert->node_idx, res->owner,
+				     namelen, name);
+				goto kill;
+			}
+		} else if (mle->type != DLM_MLE_MIGRATION) {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				/* owner is just re-asserting */
+				if (res->owner == assert->node_idx) {
+					mlog(0, "owner %u re-asserting on "
+					     "lock %.*s\n", assert->node_idx,
+					     namelen, name);
+					goto ok;
+				}
+				mlog(ML_ERROR, "got assert_master from "
+				     "node %u, but %u is the owner! "
+				     "(%.*s)\n", assert->node_idx,
+				     res->owner, namelen, name);
+				goto kill;
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				mlog(ML_ERROR, "got assert from %u, but lock "
+				     "with no owner should be "
+				     "in-progress! (%.*s)\n",
+				     assert->node_idx,
+				     namelen, name);
+				goto kill;
+			}
+		} else /* mle->type == DLM_MLE_MIGRATION */ {
+			/* should only be getting an assert from new master */
+			if (assert->node_idx != mle->new_master) {
+				mlog(ML_ERROR, "got assert from %u, but "
+				     "new master is %u, and old master "
+				     "was %u (%.*s)\n",
+				     assert->node_idx, mle->new_master,
+				     mle->master, namelen, name);
+				goto kill;
+			}
+
+		}
+ok:
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	// mlog(0, "woo!  got an assert_master from node %u!\n",
+	// 	     assert->node_idx);
+	if (mle) {
+		int extra_ref;
+
+		spin_lock(&mle->spinlock);
+		extra_ref = !!(mle->type == DLM_MLE_BLOCK
+			       || mle->type == DLM_MLE_MIGRATION);
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+
+		if (mle->type == DLM_MLE_MIGRATION && res) {
+			mlog(0, "finishing off migration of lockres %.*s, "
+			     "from %u to %u\n",
+			       res->lockname.len, res->lockname.name,
+			       dlm->node_num, mle->new_master);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_MIGRATING;
+			dlm_change_lockres_owner(dlm, res, mle->new_master);
+			BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+			spin_unlock(&res->spinlock);
+		}
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+
+		if (extra_ref) {
+			/* the assert master message now balances the extra
+			 * ref given by the master / migration request message.
+			 * if this is the last put, it will be removed
+			 * from the list. */
+			dlm_put_mle(mle);
+		}
+	}
+
+done:
+	if (res)
+		dlm_lockres_put(res);
+	dlm_put(dlm);
+	return 0;
+
+kill:
+	/* kill the caller! */
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+	dlm_lockres_put(res);
+	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+	     "and killing the other node now!  This node is OK and can continue.\n");
+	dlm_dump_lock_resources(dlm);
+	dlm_put(dlm);
+	return -EINVAL;
+}
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher, u8 request_from, u32 flags)
+{
+	struct dlm_work_item *item;
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	/* queue up work for dlm_assert_master_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
+	item->u.am.lockres = res; /* already have a ref */
+	/* can optionally ignore node numbers higher than this node */
+	item->u.am.ignore_higher = ignore_higher;
+	item->u.am.request_from = request_from;
+	item->u.am.flags = flags;
+
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+
+	schedule_work(&dlm->dispatched_work);
+	return 0;
+}
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int ignore_higher;
+	int bit;
+	u8 request_from;
+	u32 flags;
+
+	dlm = item->dlm;
+	res = item->u.am.lockres;
+	ignore_higher = item->u.am.ignore_higher;
+	request_from = item->u.am.request_from;
+	flags = item->u.am.flags;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dlm->node_num, nodemap);
+	if (ignore_higher) {
+		/* if this is just to clear up mles for nodes below
+		 * this node, do not send the message to the original
+		 * caller or any node number higher than this */
+		clear_bit(request_from, nodemap);
+		bit = dlm->node_num;
+		while (1) {
+			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
+					    bit+1);
+			if (bit >= O2NM_MAX_NODES)
+				break;
+			clear_bit(bit, nodemap);
+		}
+	}
+
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	mlog(0, "worker about to master %.*s here, this=%u\n",
+	     res->lockname.len, res->lockname.name, dlm->node_num);
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len,
+				   nodemap, flags);
+	if (ret < 0) {
+		/* no need to restart, we are done */
+		mlog_errno(ret);
+	}
+
+	dlm_lockres_put(res);
+
+	mlog(0, "finished with dlm_assert_master_worker\n");
+}
+
+
+/*
+ * DLM_MIGRATE_LOCKRES
+ */
+
+
+int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			u8 target)
+{
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *oldmle = NULL;
+	struct dlm_migratable_lockres *mres = NULL;
+	int ret = -EINVAL;
+	const char *name;
+	unsigned int namelen;
+	int mle_added = 0;
+	struct list_head *queue, *iter;
+	int i;
+	struct dlm_lock *lock;
+	int empty = 1;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = res->lockname.name;
+	namelen = res->lockname.len;
+
+	mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+
+	/*
+	 * ensure this lockres is a proper candidate for migration
+	 */
+	spin_lock(&res->spinlock);
+	if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "cannot migrate lockres with unknown owner!\n");
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	if (res->owner != dlm->node_num) {
+		mlog(0, "cannot migrate lockres this node doesn't own!\n");
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	mlog(0, "checking queues...\n");
+	queue = &res->granted;
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			empty = 0;
+			if (lock->ml.node == dlm->node_num) {
+				mlog(0, "found a lock owned by this node "
+				     "still on the %s queue!  will not "
+				     "migrate this lockres\n",
+				     i==0 ? "granted" :
+				     (i==1 ? "converting" : "blocked"));
+				spin_unlock(&res->spinlock);
+				ret = -ENOTEMPTY;
+				goto leave;
+			}
+		}
+		queue++;
+	}
+	mlog(0, "all locks on this lockres are nonlocal.  continuing\n");
+	spin_unlock(&res->spinlock);
+
+	/* no work to do */
+	if (empty) {
+		mlog(0, "no locks were found on this lockres! done!\n");
+		ret = 0;
+		goto leave;
+	}
+
+	/*
+	 * preallocate up front
+	 * if this fails, abort
+	 */
+
+	ret = -ENOMEM;
+	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+	if (!mres) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
+								GFP_KERNEL);
+	if (!mle) {
+		mlog_errno(ret);
+		goto leave;
+	}
+	ret = 0;
+
+	/*
+	 * find a node to migrate the lockres to
+	 */
+
+	mlog(0, "picking a migration node\n");
+	spin_lock(&dlm->spinlock);
+	/* pick a new node */
+	if (target >= O2NM_MAX_NODES ||
+	    !test_bit(target, dlm->domain_map)) {
+		target = dlm_pick_migration_target(dlm, res);
+	}
+	mlog(0, "node %u chosen for migration\n", target);
+
+	if (target >= O2NM_MAX_NODES ||
+	    !test_bit(target, dlm->domain_map)) {
+		/* target chosen is not alive */
+		ret = -EINVAL;
+	}
+
+	if (ret) {
+		spin_unlock(&dlm->spinlock);
+		goto fail;
+	}
+
+	mlog(0, "continuing with target = %u\n", target);
+
+	/*
+	 * clear any existing master requests and
+	 * add the migration mle to the list
+	 */
+	spin_lock(&dlm->master_lock);
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+				    namelen, target, dlm->node_num);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (ret == -EEXIST) {
+		mlog(0, "another process is already migrating it\n");
+		goto fail;
+	}
+	mle_added = 1;
+
+	/*
+	 * set the MIGRATING flag and flush asts
+	 * if we fail after this we need to re-dirty the lockres
+	 */
+	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
+		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
+		     "the target went down.\n", res->lockname.len,
+		     res->lockname.name, target);
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+		ret = -EINVAL;
+	}
+
+fail:
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (ret < 0) {
+		if (mle_added) {
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+		} else if (mle) {
+			kmem_cache_free(dlm_mle_cache, mle);
+		}
+		goto leave;
+	}
+
+	/*
+	 * at this point, we have a migration target, an mle
+	 * in the master list, and the MIGRATING flag set on
+	 * the lockres
+	 */
+
+
+	/* get an extra reference on the mle.
+	 * otherwise the assert_master from the new
+	 * master will destroy this.
+	 * also, make sure that all callers of dlm_get_mle
+	 * take both dlm->spinlock and dlm->master_lock */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	dlm_get_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* notify new node and send all lock state */
+	/* call send_one_lockres with migration flag.
+	 * this serves as notice to the target node that a
+	 * migration is starting. */
+	ret = dlm_send_one_lockres(dlm, res, mres, target,
+				   DLM_MRES_MIGRATION);
+
+	if (ret < 0) {
+		mlog(0, "migration to node %u failed with %d\n",
+		     target, ret);
+		/* migration failed, detach and clean up mle */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+		dlm_put_mle(mle);	/* second put drops the extra ref taken above */
+		goto leave;
+	}
+
+	/* at this point, the target sends a message to all nodes,
+	 * (using dlm_do_migrate_request).  this node is skipped since
+	 * we had to put an mle in the list to begin the process.  this
+	 * node now waits for target to do an assert master.  this node
+	 * will be the last one notified, ensuring that the migration
+	 * is complete everywhere.  if the target dies while this is
+	 * going on, some nodes could potentially see the target as the
+	 * master, so it is important that my recovery finds the migration
+	 * mle and sets the master to UNKNOWN. */
+
+
+	/* wait for new node to assert master */
+	while (1) {
+		ret = wait_event_interruptible_timeout(mle->wq,
+					(atomic_read(&mle->woken) == 1),
+					msecs_to_jiffies(5000));
+
+		if (ret >= 0) {
+			if (atomic_read(&mle->woken) == 1 ||
+			    res->owner == target)
+				break;
+
+			mlog(0, "timed out during migration\n");
+		}
+		if (ret == -ERESTARTSYS) {
+			/* migration failed, detach and clean up mle */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			dlm_put_mle(mle);	/* second put drops the extra ref taken above */
+			goto leave;
+		}
+		/* TODO: if node died: stop, clean up, return error */
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, target);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	dlm_remove_nonlocal_locks(dlm, res);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	ret = 0;
+
+	dlm_lockres_calc_usage(dlm, res);
+
+leave:
+	/* re-dirty the lockres if we failed */
+	if (ret < 0)
+		dlm_kick_thread(dlm, res);
+
+	/* TODO: cleanup */
+	if (mres)
+		free_page((unsigned long)mres);
+
+	dlm_put(dlm);
+
+	mlog(0, "returning %d\n", ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
+
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	int ret;
+	spin_lock(&dlm->ast_lock);
+	spin_lock(&lock->spinlock);
+	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&dlm->ast_lock);
+	return ret;
+}
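+
+/* Sketch: this predicate is shaped for wait_event() callers; a path
+ * that must drain basts before proceeding might do
+ *
+ *	wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock));
+ *
+ * mirroring how dlm_lockres_is_dirty() is waited on in
+ * dlm_mark_lockres_migrating() below. */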
+
+static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     u8 mig_target)
+{
+	int can_proceed;
+	spin_lock(&res->spinlock);
+	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
+	spin_unlock(&res->spinlock);
+
+	/* target has died, so make the caller break out of the 
+	 * wait_event, but caller must recheck the domain_map */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(mig_target, dlm->domain_map))
+		can_proceed = 1;
+	spin_unlock(&dlm->spinlock);
+	return can_proceed;
+}
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	int ret;
+	spin_lock(&res->spinlock);
+	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
+	spin_unlock(&res->spinlock);
+	return ret;
+}
+
+
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target)
+{
+	int ret = 0;
+
+	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
+	       res->lockname.len, res->lockname.name, dlm->node_num,
+	       target);
+	/* need to set MIGRATING flag on lockres.  this is done by
+	 * ensuring that all asts have been flushed for this lockres. */
+	spin_lock(&res->spinlock);
+	BUG_ON(res->migration_pending);
+	res->migration_pending = 1;
+	/* strategy is to reserve an extra ast then release
+	 * it below, letting the release do all of the work */
+	__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
+	/* now flush all the pending asts... hang out for a bit */
+	dlm_kick_thread(dlm, res);
+	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+	dlm_lockres_release_ast(dlm, res);
+
+	mlog(0, "about to wait on migration_wq, dirty=%s\n",
+	       res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+	/* if the extra ref we just put was the final one, this
+	 * will pass thru immediately.  otherwise, we need to wait
+	 * for the last ast to finish. */
+again:
+	ret = wait_event_interruptible_timeout(dlm->migration_wq,
+		   dlm_migration_can_proceed(dlm, res, target),
+		   msecs_to_jiffies(1000));
+	if (ret < 0) {
+		mlog(0, "woken again: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	} else {
+		mlog(0, "all is well: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	}
+	if (!dlm_migration_can_proceed(dlm, res, target)) {
+		mlog(0, "trying again...\n");
+		goto again;
+	}
+
+	/* did the target go down or die? */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(target, dlm->domain_map)) {
+		mlog(ML_ERROR, "aha. migration target %u just went down\n",
+		     target);
+		ret = -EHOSTDOWN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	/*
+	 * at this point:
+	 *
+	 *   o the DLM_LOCK_RES_MIGRATING flag is set
+	 *   o there are no pending asts on this lockres
+	 *   o all processes trying to reserve an ast on this
+	 *     lockres must wait for the MIGRATING flag to clear
+	 */
+	return ret;
+}
+
+/* last step in the migration process.
+ * original master calls this to free all of the dlm_lock
+ * structures that used to be for other nodes. */
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res)
+{
+	struct list_head *iter, *iter2;
+	struct list_head *queue = &res->granted;
+	int i;
+	struct dlm_lock *lock;
+
+	assert_spin_locked(&res->spinlock);
+
+	BUG_ON(res->owner == dlm->node_num);
+
+	for (i = 0; i < 3; i++) {
+		list_for_each_safe(iter, iter2, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node != dlm->node_num) {
+				mlog(0, "putting lock for node %u\n",
+				     lock->ml.node);
+				/* be extra careful */
+				BUG_ON(!list_empty(&lock->ast_list));
+				BUG_ON(!list_empty(&lock->bast_list));
+				BUG_ON(lock->ast_pending);
+				BUG_ON(lock->bast_pending);
+				list_del_init(&lock->list);
+				dlm_lock_put(lock);
+			}
+		}
+		queue++;
+	}
+}
+
+/* for now this is not too intelligent.  we will
+ * need stats to make this do the right thing.
+ * this just finds the first lock on one of the
+ * queues and uses that node as the target. */
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue = &res->granted;
+	struct list_head *iter;
+	struct dlm_lock *lock;
+	int nodenum;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	spin_lock(&res->spinlock);
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue) {
+			/* up to the caller to make sure this node
+			 * is alive */
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node != dlm->node_num) {
+				spin_unlock(&res->spinlock);
+				return lock->ml.node;
+			}
+		}
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	mlog(0, "have not found a suitable target yet! checking domain map\n");
+
+	/* ok now we're getting desperate.  pick anyone alive. */
+	nodenum = -1;
+	while (1) {
+		nodenum = find_next_bit(dlm->domain_map,
+					O2NM_MAX_NODES, nodenum+1);
+		mlog(0, "found %d in domain map\n", nodenum);
+		if (nodenum >= O2NM_MAX_NODES)
+			break;
+		if (nodenum != dlm->node_num) {
+			mlog(0, "picking %d\n", nodenum);
+			return nodenum;
+		}
+	}
+
+	mlog(0, "giving up.  no master to migrate to\n");
+	return DLM_LOCK_RES_OWNER_UNKNOWN;
+}
+
+
+
+/* this is called by the new master once all lockres
+ * data has been received */
+static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 master, u8 new_master,
+				  struct dlm_node_iter *iter)
+{
+	struct dlm_migrate_request migrate;
+	int ret, status = 0;
+	int nodenum;
+
+	memset(&migrate, 0, sizeof(migrate));
+	migrate.namelen = res->lockname.len;
+	memcpy(migrate.name, res->lockname.name, migrate.namelen);
+	migrate.new_master = new_master;
+	migrate.master = master;
+
+	ret = 0;
+
+	/* send message to all nodes, except the master and myself */
+	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
+		if (nodenum == master ||
+		    nodenum == new_master)
+			continue;
+
+		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					 &migrate, sizeof(migrate), nodenum,
+					 &status);
+		if (ret < 0)
+			mlog_errno(ret);
+		else if (status < 0) {
+			mlog(0, "migrate request (node %u) returned %d!\n",
+			     nodenum, status);
+			ret = status;
+		}
+	}
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog(0, "returning ret=%d\n", ret);
+	return ret;
+}
+
+
+/* if there is an existing mle for this lockres, we now know who the master is.
+ * (the one who sent us *this* message) we can clear it up right away.
+ * since the process that put the mle on the list still has a reference to it,
+ * we can unhash it now, set the master and wake the process.  as a result,
+ * we will have no mle in the list to start with.  now we can add an mle for
+ * the migration and this should be the only one found for those scanning the
+ * list.  */
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
+	const char *name;
+	unsigned int namelen;
+	int ret = 0;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = migrate->name;
+	namelen = migrate->namelen;
+
+	/* preallocate.. if this fails, abort */
+	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
+							 GFP_KERNEL);
+
+	if (!mle) {
+		ret = -ENOMEM;
+		goto leave;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	spin_lock(&dlm->master_lock);
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* if all is working ok, this can only mean that we got
+			 * a migrate request from a node that we now see as
+			 * dead.  what can we do here?  drop it to the floor? */
+			spin_unlock(&res->spinlock);
+			mlog(ML_ERROR, "Got a migrate request, but the "
+			     "lockres is marked as recovering!\n");
+			kmem_cache_free(dlm_mle_cache, mle);
+			ret = -EINVAL; /* need a better solution */
+			goto unlock;
+		}
+		res->state |= DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+	}
+
+	/* ignore status.  only nonzero status would BUG. */
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
+				    name, namelen,
+				    migrate->new_master,
+				    migrate->master);
+
+unlock:
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (res)
+		dlm_lockres_put(res);
+leave:
+	dlm_put(dlm);
+	return ret;
+}
+
+/* must be holding dlm->spinlock and dlm->master_lock
+ * when adding a migration mle, we can clear any other mles
+ * in the master list because we know with certainty that
+ * the master is "master".  so we remove any old mle from
+ * the list after setting its master field, and then add
+ * the new migration mle.  this way we can hold with the rule
+ * of having only one mle for a given lock name at all times. */
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master)
+{
+	int found;
+	int ret = 0;
+
+	*oldmle = NULL;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* caller is responsible for any ref taken here on oldmle */
+	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
+	if (found) {
+		struct dlm_master_list_entry *tmp = *oldmle;
+		spin_lock(&tmp->spinlock);
+		if (tmp->type == DLM_MLE_MIGRATION) {
+			if (master == dlm->node_num) {
+				/* ah another process raced me to it */
+				mlog(0, "tried to migrate %.*s, but some "
+				     "process beat me to it\n",
+				     namelen, name);
+				ret = -EEXIST;
+			} else {
+				/* bad.  2 NODES are trying to migrate! */
+				mlog(ML_ERROR, "migration error mle: "
+				     "master=%u new_master=%u // request: "
+				     "master=%u new_master=%u // "
+				     "lockres=%.*s\n",
+				     tmp->master, tmp->new_master,
+				     master, new_master,
+				     namelen, name);
+				BUG();
+			}
+		} else {
+			/* this is essentially what assert_master does */
+			tmp->master = master;
+			atomic_set(&tmp->woken, 1);
+			wake_up(&tmp->wq);
+			/* remove it from the list so that only one
+			 * mle will be found */
+			list_del_init(&tmp->list);
+		}
+		spin_unlock(&tmp->spinlock);
+	}
+
+	/* now add a migration mle to the tail of the list */
+	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
+	mle->new_master = new_master;
+	mle->master = master;
+	/* do this for consistency with other mle types */
+	set_bit(new_master, mle->maybe_map);
+	list_add(&mle->list, &dlm->master_list);
+
+	return ret;
+}
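+
+/* Sketch of the locking contract above: both call sites take the same
+ * two locks, in the same order, around the call:
+ *
+ *	spin_lock(&dlm->spinlock);
+ *	spin_lock(&dlm->master_lock);
+ *	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+ *				    namelen, target, dlm->node_num);
+ *	spin_unlock(&dlm->master_lock);
+ *	spin_unlock(&dlm->spinlock);
+ *
+ * -EEXIST here means another local process already has a migration in
+ * flight for this name. */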
+
+
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	list_for_each_safe(iter, iter2, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+
+		BUG_ON(mle->type != DLM_MLE_BLOCK &&
+		       mle->type != DLM_MLE_MASTER &&
+		       mle->type != DLM_MLE_MIGRATION);
+
+		/* MASTER mles are initiated locally.  the waiting
+		 * process will notice the node map change
+		 * shortly.  let that happen as normal. */
+		if (mle->type == DLM_MLE_MASTER)
+			continue;
+
+
+		/* BLOCK mles are initiated by other nodes.
+		 * need to clean up if the dead node would have
+		 * been the master. */
+		if (mle->type == DLM_MLE_BLOCK) {
+			int bit;
+
+			spin_lock(&mle->spinlock);
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (bit != dead_node) {
+				mlog(0, "mle found, but dead node %u would "
+				     "not have been master\n", dead_node);
+				spin_unlock(&mle->spinlock);
+			} else {
+				/* must drop the refcount by one since the
+				 * assert_master will never arrive.  this
+				 * may result in the mle being unlinked and
+				 * freed, but there may still be a process
+				 * waiting in the dlmlock path which is fine. */
+				mlog(ML_ERROR, "node %u was expected master\n",
+				     dead_node);
+				atomic_set(&mle->woken, 1);
+				spin_unlock(&mle->spinlock);
+				wake_up(&mle->wq);
+				/* final put will take care of list removal */
+				__dlm_put_mle(mle);
+			}
+			continue;
+		}
+
+		/* everything else is a MIGRATION mle */
+
+		/* the rule for MIGRATION mles is that the master
+		 * becomes UNKNOWN if *either* the original or
+		 * the new master dies.  all UNKNOWN lockreses
+		 * are sent to whichever node becomes the recovery
+		 * master.  the new master is responsible for
+		 * determining if there is still a master for
+		 * this lockres, or if he needs to take over
+		 * mastery.  either way, this node should expect
+		 * another message to resolve this. */
+		if (mle->master != dead_node &&
+		    mle->new_master != dead_node)
+			continue;
+
+		/* if we have reached this point, this mle needs to
+		 * be removed from the list and freed. */
+
+		/* remove from the list early.  NOTE: unlinking
+		 * list_head while in list_for_each_safe */
+		spin_lock(&mle->spinlock);
+		list_del_init(&mle->list);
+		atomic_set(&mle->woken, 1);
+		spin_unlock(&mle->spinlock);
+		wake_up(&mle->wq);
+
+		mlog(0, "node %u died during migration from "
+		     "%u to %u!\n", dead_node,
+		     mle->master, mle->new_master);
+		/* if there is a lockres associated with this
+		 * mle, find it and set its owner to UNKNOWN */
+		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
+					   mle->u.name.len);
+		if (res) {
+			/* unfortunately if we hit this rare case, our
+			 * lock ordering is messed up.  we need to drop
+			 * the master lock so that we can take the
+			 * lockres lock, meaning that we will have to
+			 * restart from the head of the list. */
+			spin_unlock(&dlm->master_lock);
+
+			/* move lockres onto recovery list */
+			spin_lock(&res->spinlock);
+			dlm_set_lockres_owner(dlm, res,
+					      DLM_LOCK_RES_OWNER_UNKNOWN);
+			dlm_move_lockres_to_recovery_list(dlm, res);
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+
+			/* dump the mle */
+			spin_lock(&dlm->master_lock);
+			__dlm_put_mle(mle);
+			spin_unlock(&dlm->master_lock);
+
+			/* restart */
+			goto top;
+		}
+
+		/* this may be the last reference */
+		__dlm_put_mle(mle);
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+
+int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 u8 old_master)
+{
+	struct dlm_node_iter iter;
+	int ret = 0;
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	clear_bit(old_master, iter.node_map);
+	clear_bit(dlm->node_num, iter.node_map);
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "now time to do a migrate request to other nodes\n");
+	ret = dlm_do_migrate_request(dlm, res, old_master,
+				     dlm->node_num, &iter);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mlog(0, "doing assert master of %.*s to all except the original node\n",
+	     res->lockname.len, res->lockname.name);
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len, iter.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (ret < 0) {
+		/* no longer need to retry.  all living nodes contacted. */
+		mlog_errno(ret);
+		ret = 0;
+	}
+
+	memset(iter.node_map, 0, sizeof(iter.node_map));
+	set_bit(old_master, iter.node_map);
+	mlog(0, "doing assert master of %.*s back to %u\n",
+	     res->lockname.len, res->lockname.name, old_master);
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len, iter.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (ret < 0) {
+		mlog(0, "assert master to original master failed "
+		     "with %d.\n", ret);
+		/* the only nonzero status here would be because of
+		 * a dead original node.  we're done. */
+		ret = 0;
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, dlm->node_num);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	/* re-dirty it on the new master */
+	dlm_kick_thread(dlm, res);
+	wake_up(&res->wq);
+leave:
+	return ret;
+}
+
+/*
+ * LOCKRES AST REFCOUNT
+ * this is integral to migration
+ */
+
+/* for future intent to call an ast, reserve one ahead of time.
+ * this should be called only after waiting on the lockres
+ * with dlm_wait_on_lockres, and while still holding the
+ * spinlock after the call. */
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		__dlm_print_one_lock_resource(res);
+	}
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+
+	atomic_inc(&res->asts_reserved);
+}
+
+/*
+ * used to drop the reserved ast, either because it went unused,
+ * or because the ast/bast was actually called.
+ *
+ * also, if there is a pending migration on this lockres,
+ * and this was the last pending ast on the lockres,
+ * atomically set the MIGRATING flag before we drop the lock.
+ * this is how we ensure that migration can proceed with no
+ * asts in progress.  note that it is ok if the state of the
+ * queues is such that a lock should be granted in the future
+ * or that a bast should be fired, because the new master will
+ * shuffle the lists on this lockres as soon as it is migrated.
+ */
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
+{
+	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
+		return;
+
+	if (!res->migration_pending) {
+		spin_unlock(&res->spinlock);
+		return;
+	}
+
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+	res->migration_pending = 0;
+	res->state |= DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	wake_up(&dlm->migration_wq);
+}
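+
+/* Sketch of the reserve/release pairing described above, as the
+ * migration path uses it:
+ *
+ *	spin_lock(&res->spinlock);
+ *	__dlm_lockres_reserve_ast(res);		(count the future ast)
+ *	spin_unlock(&res->spinlock);
+ *	...queue and fire the ast...
+ *	dlm_lockres_release_ast(dlm, res);	(final put may atomically
+ *						 set DLM_LOCK_RES_MIGRATING)
+ */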

+ 2132 - 0
fs/ocfs2/dlm/dlmrecovery.c

@@ -0,0 +1,2132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
+#include "cluster/masklog.h"
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
+
+static int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+static int dlm_do_recovery(struct dlm_ctxt *dlm);
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_request_all_locks(struct dlm_ctxt *dlm,
+				 u8 request_from, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master);
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lockres *mres,
+				    u8 send_to,
+				    struct dlm_lock_resource *res,
+				    int total_locks);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres);
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master);
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
+				 u8 dead_node, u8 send_to);
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list, u8 dead_node);
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master);
+static void dlm_reco_ast(void *astdata);
+static void dlm_reco_bast(void *astdata, int blocked_type);
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
+static void dlm_request_all_locks_worker(struct dlm_work_item *item,
+					 void *data);
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+
+static u64 dlm_get_next_mig_cookie(void);
+
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
+static u64 dlm_mig_cookie = 1;
+
+static u64 dlm_get_next_mig_cookie(void)
+{
+	u64 c;
+	spin_lock(&dlm_mig_cookie_lock);
+	c = dlm_mig_cookie;
+	if (dlm_mig_cookie == (~0ULL))
+		dlm_mig_cookie = 1;
+	else
+		dlm_mig_cookie++;
+	spin_unlock(&dlm_mig_cookie_lock);
+	return c;
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Worker function used during recovery: drains dlm->work_list and runs
+ * each queued dlm_work_item outside of the message handlers, where it
+ * is safe to sleep and do network i/o. */
+void dlm_dispatch_work(void *data)
+{
+	struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
+	LIST_HEAD(tmp_list);
+	struct list_head *iter, *iter2;
+	struct dlm_work_item *item;
+	dlm_workfunc_t *workfunc;
+
+	spin_lock(&dlm->work_lock);
+	list_splice_init(&dlm->work_list, &tmp_list);
+	spin_unlock(&dlm->work_lock);
+
+	list_for_each_safe(iter, iter2, &tmp_list) {
+		item = list_entry(iter, struct dlm_work_item, list);
+		workfunc = item->func;
+		list_del_init(&item->list);
+
+		/* already have ref on dlm to avoid having
+		 * it disappear.  just double-check. */
+		BUG_ON(item->dlm != dlm);
+
+		/* this is allowed to sleep and
+		 * call network stuff */
+		workfunc(item, item->data);
+
+		dlm_put(dlm);
+		kfree(item);
+	}
+}
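+
+/* A minimal usage sketch (mirroring the real handlers later in this
+ * file, not a new API): a message handler queues deferred work via
+ *
+ *	dlm_grab(dlm);   (extra ref, dropped above after the item runs)
+ *	dlm_init_work_item(dlm, item, some_worker, data);
+ *	spin_lock(&dlm->work_lock);
+ *	list_add_tail(&item->list, &dlm->work_list);
+ *	spin_unlock(&dlm->work_lock);
+ *	schedule_work(&dlm->dispatched_work);
+ *
+ * where some_worker is a placeholder name for any dlm_workfunc_t. */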
+
+/*
+ * RECOVERY THREAD
+ */
+
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+{
+	/* wake the recovery thread
+	 * this will wake the reco thread in one of three places
+	 * 1) sleeping with no recovery happening
+	 * 2) sleeping with recovery mastered elsewhere
+	 * 3) recovery mastered here, waiting on reco data */
+
+	wake_up(&dlm->dlm_reco_thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm recovery thread...\n");
+
+	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
+						"dlm_reco_thread");
+	if (IS_ERR(dlm->dlm_reco_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
+		dlm->dlm_reco_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_reco_thread_task) {
+		mlog(0, "waiting for dlm recovery thread to exit\n");
+		kthread_stop(dlm->dlm_reco_thread_task);
+		dlm->dlm_reco_thread_task = NULL;
+	}
+}
+
+
+
+/*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ *    ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    that's right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ *    and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of the secondary lock queue info
+ *    one lock at a time, forcing each node to communicate back
+ *    before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends an ALLDONE!
+ *    message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ *    and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, he unlocks
+ *    everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
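+/* A compact sketch of the per-node state walk that the new master
+ * drives (a simplification of dlm_remaster_locks() below, using the
+ * DLM_RECO_NODE_DATA_* states from dlmcommon.h):
+ *
+ *	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+ *		request_all_locks(ndata);       steps 5-6: INIT -> REQUESTED
+ *	}
+ *	wait until every node reports DATA DONE   (REQUESTED -> DONE)
+ *	send_finalize();                          steps 7-9
+ *
+ * request_all_locks/send_finalize are shorthand here for the real
+ * helpers dlm_request_all_locks() and
+ * dlm_send_finalize_reco_message(). */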
+
+#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
+
+static int dlm_recovery_thread(void *data)
+{
+	int status;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		if (dlm_joined(dlm)) {
+			status = dlm_do_recovery(dlm);
+			if (status == -EAGAIN) {
+				/* do not sleep, recheck immediately. */
+				continue;
+			}
+			if (status < 0)
+				mlog_errno(status);
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM recovery thread\n");
+	return 0;
+}
+
+/* callers of the top-level api calls (dlmlock/dlmunlock) should
+ * block on the dlm->reco.event when recovery is in progress.
+ * the dlm recovery thread will set this state when it begins
+ * recovering a dead node (as the new master or not) and clear
+ * the state and wake as soon as all affected lock resources have
+ * been marked with the RECOVERY flag */
+static int dlm_in_recovery(struct dlm_ctxt *dlm)
+{
+	int in_recovery;
+	spin_lock(&dlm->spinlock);
+	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	spin_unlock(&dlm->spinlock);
+	return in_recovery;
+}
+
+
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
+{
+	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
+}
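+
+/* Illustrative only (not code from this file): a dlmlock() caller is
+ * expected to serialize against recovery roughly like
+ *
+ *	dlm_wait_for_recovery(dlm);
+ *	res = dlm_get_lock_resource(dlm, name, flags);
+ *
+ * so that no new requests race with lock resources that are still
+ * being marked RECOVERING. */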
+
+static void dlm_begin_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_end_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
+	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+	wake_up(&dlm->reco.event);
+}
+
+static int dlm_do_recovery(struct dlm_ctxt *dlm)
+{
+	int status = 0;
+
+	spin_lock(&dlm->spinlock);
+
+	/* check to see if the new master has died */
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
+	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+		mlog(0, "new master %u died while recovering %u!\n",
+		     dlm->reco.new_master, dlm->reco.dead_node);
+		/* unset the new_master, leave dead_node */
+		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	}
+
+	/* select a target to recover */
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		int bit;
+
+		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES)
+			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		else
+			dlm->reco.dead_node = bit;
+	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+		/* BUG? */
+		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
+		     dlm->reco.dead_node);
+		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		// mlog(0, "nothing to recover!  sleeping now!\n");
+		spin_unlock(&dlm->spinlock);
+		/* return to main thread loop and sleep. */
+		return 0;
+	}
+	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	     dlm->reco.dead_node);
+	spin_unlock(&dlm->spinlock);
+
+	/* take write barrier */
+	/* (stops the list reshuffling thread, proxy ast handling) */
+	dlm_begin_recovery(dlm);
+
+	if (dlm->reco.new_master == dlm->node_num)
+		goto master_here;
+
+	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+		/* choose a new master */
+		if (!dlm_pick_recovery_master(dlm)) {
+			/* already notified everyone.  go. */
+			dlm->reco.new_master = dlm->node_num;
+			goto master_here;
+		}
+		mlog(0, "another node will master this recovery session.\n");
+	}
+	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->reco.new_master,
+	     dlm->node_num, dlm->reco.dead_node);
+
+	/* it is safe to start everything back up here
+	 * because all of the dead node's lock resources
+	 * have been marked as in-recovery */
+	dlm_end_recovery(dlm);
+
+	/* sleep out in main dlm_recovery_thread loop. */
+	return 0;
+
+master_here:
+	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->name, dlm->reco.dead_node, dlm->node_num);
+
+	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
+	if (status < 0) {
+		mlog(ML_ERROR, "error %d remastering locks for node %u, "
+		     "retrying.\n", status, dlm->reco.dead_node);
+	} else {
+		/* success!  see if any other nodes need recovery */
+		dlm_reset_recovery(dlm);
+	}
+	dlm_end_recovery(dlm);
+
+	/* continue and look for another dead node */
+	return -EAGAIN;
+}
+
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int status = 0;
+	struct dlm_reco_node_data *ndata;
+	struct list_head *iter;
+	int all_nodes_done;
+	int destroy = 0;
+	int pass = 0;
+
+	status = dlm_init_recovery_area(dlm, dead_node);
+	if (status < 0)
+		goto leave;
+
+	/* safe to access the node data list without a lock, since this
+	 * process is the only one to change the list */
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry(iter, struct dlm_reco_node_data, list);
+		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
+		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
+
+		mlog(0, "requesting lock info from node %u\n",
+		     ndata->node_num);
+
+		if (ndata->node_num == dlm->node_num) {
+			ndata->state = DLM_RECO_NODE_DATA_DONE;
+			continue;
+		}
+
+		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
+		if (status < 0) {
+			mlog_errno(status);
+			if (dlm_is_host_down(status))
+				ndata->state = DLM_RECO_NODE_DATA_DEAD;
+			else {
+				destroy = 1;
+				goto leave;
+			}
+		}
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				mlog(0, "node %u died after requesting "
+				     "recovery info for node %u\n",
+				     ndata->node_num, dead_node);
+				// start all over
+				destroy = 1;
+				status = -EAGAIN;
+				goto leave;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
+				mlog(0, "now receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				mlog(0, "already receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "already DONE receiving recovery data "
+				     "from node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+		}
+	}
+
+	mlog(0, "done requesting all lock info\n");
+
+	/* nodes should be sending reco data now
+	 * just need to wait */
+
+	while (1) {
+		/* check all the nodes now to see if we are
+		 * done, or if anyone died */
+		all_nodes_done = 1;
+		spin_lock(&dlm_reco_state_lock);
+		list_for_each(iter, &dlm->reco.node_data) {
+			ndata = list_entry(iter, struct dlm_reco_node_data, list);
+
+			mlog(0, "checking recovery state of node %u\n",
+			     ndata->node_num);
+			switch (ndata->state) {
+				case DLM_RECO_NODE_DATA_INIT:
+				case DLM_RECO_NODE_DATA_REQUESTING:
+					mlog(ML_ERROR, "bad ndata state for "
+					     "node %u: state=%d\n",
+					     ndata->node_num, ndata->state);
+					BUG();
+					break;
+				case DLM_RECO_NODE_DATA_DEAD:
+					mlog(0, "node %u died after "
+					     "requesting recovery info for "
+					     "node %u\n", ndata->node_num,
+					     dead_node);
+					spin_unlock(&dlm_reco_state_lock);
+					// start all over
+					destroy = 1;
+					status = -EAGAIN;
+					goto leave;
+				case DLM_RECO_NODE_DATA_RECEIVING:
+				case DLM_RECO_NODE_DATA_REQUESTED:
+					all_nodes_done = 0;
+					break;
+				case DLM_RECO_NODE_DATA_DONE:
+					break;
+				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+					break;
+			}
+		}
+		spin_unlock(&dlm_reco_state_lock);
+
+		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
+		     all_nodes_done ? "yes" : "no");
+		if (all_nodes_done) {
+			int ret;
+
+			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state;
+			 * just send a finalize message to everyone and
+			 * clean up */
+			mlog(0, "all nodes are done! send finalize\n");
+			ret = dlm_send_finalize_reco_message(dlm);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			spin_lock(&dlm->spinlock);
+			dlm_finish_local_lockres_recovery(dlm, dead_node,
+							  dlm->node_num);
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "should be done with recovery!\n");
+
+			mlog(0, "finishing recovery of %s at %lu, "
+			     "dead=%u, this=%u, new=%u\n", dlm->name,
+			     jiffies, dlm->reco.dead_node,
+			     dlm->node_num, dlm->reco.new_master);
+			destroy = 1;
+			status = ret;
+			/* rescan everything marked dirty along the way */
+			dlm_kick_thread(dlm, NULL);
+			break;
+		}
+		/* wait to be signalled, with periodic timeout
+		 * to check for node death */
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+					 kthread_should_stop(),
+					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
+
+	}
+
+leave:
+	if (destroy)
+		dlm_destroy_recovery_area(dlm, dead_node);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int num = 0;
+	struct dlm_reco_node_data *ndata;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+	/* nodes can only be removed (by dying) after dropping
+	 * this lock, and death will be trapped later, so this should do */
+	spin_unlock(&dlm->spinlock);
+
+	while (1) {
+		num = find_next_bit(dlm->reco.node_map, O2NM_MAX_NODES, num);
+		if (num >= O2NM_MAX_NODES)
+			break;
+		BUG_ON(num == dead_node);
+
+		ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+		if (!ndata) {
+			dlm_destroy_recovery_area(dlm, dead_node);
+			return -ENOMEM;
+		}
+		ndata->node_num = num;
+		ndata->state = DLM_RECO_NODE_DATA_INIT;
+		spin_lock(&dlm_reco_state_lock);
+		list_add_tail(&ndata->list, &dlm->reco.node_data);
+		spin_unlock(&dlm_reco_state_lock);
+		num++;
+	}
+
+	return 0;
+}
+
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_reco_node_data *ndata;
+	LIST_HEAD(tmplist);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_splice_init(&dlm->reco.node_data, &tmplist);
+	spin_unlock(&dlm_reco_state_lock);
+
+	list_for_each_safe(iter, iter2, &tmplist) {
+		ndata = list_entry(iter, struct dlm_reco_node_data, list);
+		list_del_init(&ndata->list);
+		kfree(ndata);
+	}
+}
+
+static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
+				 u8 dead_node)
+{
+	struct dlm_lock_request lr;
+	int ret;
+
+	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
+		  "to %u\n", dead_node, request_from);
+
+	memset(&lr, 0, sizeof(lr));
+	lr.node_idx = dlm->node_num;
+	lr.dead_node = dead_node;
+
+	/* send message */
+	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
+				 &lr, sizeof(lr), request_from, NULL);
+
+	/* negative status is handled by caller */
+	if (ret < 0)
+		mlog_errno(ret);
+
+	/* return from here, then sleep until all received or error */
+	return ret;
+}
+
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(lr->dead_node != dlm->reco.dead_node);
+
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!item) {
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* this will get freed by dlm_request_all_locks_worker */
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!buf) {
+		kfree(item);
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* queue up work for dlm_request_all_locks_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
+	item->u.ral.reco_master = lr->node_idx;
+	item->u.ral.dead_node = lr->dead_node;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+
+	dlm_put(dlm);
+	return 0;
+}
+
+static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_migratable_lockres *mres;
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
+	LIST_HEAD(resources);
+	struct list_head *iter;
+	int ret;
+	u8 dead_node, reco_master;
+
+	dlm = item->dlm;
+	dead_node = item->u.ral.dead_node;
+	reco_master = item->u.ral.reco_master;
+	BUG_ON(dead_node != dlm->reco.dead_node);
+	BUG_ON(reco_master != dlm->reco.new_master);
+
+	mres = (struct dlm_migratable_lockres *)data;
+
+	/* lock resources should have already been moved to the
+ 	 * dlm->reco.resources list.  now move items from that list
+ 	 * to a temp list if the dead owner matches.  note that the
+	 * whole cluster recovers only one node at a time, so we
+	 * can safely move UNKNOWN lock resources for each recovery
+	 * session. */
+	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
+
+	/* now we can begin blasting lockreses without the dlm lock */
+	list_for_each(iter, &resources) {
+		res = list_entry(iter, struct dlm_lock_resource, recovering);
+		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
+				   	DLM_MRES_RECOVERY);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/* move the resources back to the list */
+	spin_lock(&dlm->spinlock);
+	list_splice_init(&resources, &dlm->reco.resources);
+	spin_unlock(&dlm->spinlock);
+
+	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	free_page((unsigned long)data);
+}
+
+
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
+{
+	int ret, tmpret;
+	struct dlm_reco_data_done done_msg;
+
+	memset(&done_msg, 0, sizeof(done_msg));
+	done_msg.node_idx = dlm->node_num;
+	done_msg.dead_node = dead_node;
+	mlog(0, "sending DATA DONE message to %u, "
+	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
+	     done_msg.dead_node);
+
+	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
+				 sizeof(done_msg), send_to, &tmpret);
+	/* negative status is ignored by the caller */
+	if (ret >= 0)
+		ret = tmpret;
+	return ret;
+}
+
+
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
+	struct list_head *iter;
+	struct dlm_reco_node_data *ndata = NULL;
+	int ret = -EINVAL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+	     "node_idx=%u, this node=%u\n", done->dead_node,
+	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
+	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry(iter, struct dlm_reco_node_data, list);
+		if (ndata->node_num != done->node_idx)
+			continue;
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_DEAD:
+			case DLM_RECO_NODE_DATA_DONE:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(ML_ERROR, "bad ndata state for node %u:"
+				     " state=%d\n", ndata->node_num,
+				     ndata->state);
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				mlog(0, "node %u is DONE sending "
+					  "recovery data!\n",
+					  ndata->node_num);
+
+				ndata->state = DLM_RECO_NODE_DATA_DONE;
+				ret = 0;
+				break;
+		}
+	}
+	spin_unlock(&dlm_reco_state_lock);
+
+	/* wake the recovery thread, some node is done */
+	if (!ret)
+		dlm_kick_recovery_thread(dlm);
+
+	if (ret < 0)
+		mlog(ML_ERROR, "failed to find recovery node data for node "
+		     "%u\n", done->node_idx);
+	dlm_put(dlm);
+
+	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
+	return ret;
+}
+
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list,
+				       	u8 dead_node)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *iter, *iter2;
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
+		res = list_entry(iter, struct dlm_lock_resource, recovering);
+		if (dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len))
+			continue;
+		if (res->owner == dead_node) {
+			mlog(0, "found lockres owned by dead node while "
+				  "doing recovery for node %u. sending it.\n",
+				  dead_node);
+			list_del_init(&res->recovering);
+			list_add_tail(&res->recovering, list);
+		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "found UNKNOWN owner while doing recovery "
+				  "for node %u. sending it.\n", dead_node);
+			list_del_init(&res->recovering);
+			list_add_tail(&res->recovering, list);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
+{
+	int total_locks = 0;
+	struct list_head *iter, *queue = &res->granted;
+	int i;
+
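+	/* granted, converting and blocked are laid out back-to-back in
+	 * struct dlm_lock_resource, so advancing the queue pointer three
+	 * times walks all three lists (the same layout assumption as
+	 * dlm_list_num_to_pointer() below) */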
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue)
+			total_locks++;
+		queue++;
+	}
+	return total_locks;
+}
+
+
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				      struct dlm_migratable_lockres *mres,
+				      u8 send_to,
+				      struct dlm_lock_resource *res,
+				      int total_locks)
+{
+	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
+	int mres_total_locks = be32_to_cpu(mres->total_locks);
+	int sz, ret = 0, status = 0;
+	u8 orig_flags = mres->flags,
+	   orig_master = mres->master;
+
+	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
+	if (!mres->num_locks)
+		return 0;
+
+	sz = sizeof(struct dlm_migratable_lockres) +
+		(mres->num_locks * sizeof(struct dlm_migratable_lock));
+
+	/* add an all-done flag if we reached the last lock */
+	orig_flags = mres->flags;
+	BUG_ON(total_locks > mres_total_locks);
+	if (total_locks == mres_total_locks)
+		mres->flags |= DLM_MRES_ALL_DONE;
+
+	/* send it */
+	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
+				 sz, send_to, &status);
+	if (ret < 0) {
+		/* XXX: negative status is not handled.
+		 * this will end up killing this node. */
+		mlog_errno(ret);
+	} else {
+		/* might get an -ENOMEM back here */
+		ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
+
+			if (ret == -EFAULT) {
+				mlog(ML_ERROR, "node %u told me to kill "
+				     "myself!\n", send_to);
+				BUG();
+			}
+		}
+	}
+
+	/* zero and reinit the message buffer */
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, mres_total_locks,
+				    mig_cookie, orig_flags, orig_master);
+	return ret;
+}
+
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master)
+{
+	/* mres here is one full page */
+	memset(mres, 0, PAGE_SIZE);
+	mres->lockname_len = namelen;
+	memcpy(mres->lockname, lockname, namelen);
+	mres->num_locks = 0;
+	mres->total_locks = cpu_to_be32(total_locks);
+	mres->mig_cookie = cpu_to_be64(cookie);
+	mres->flags = flags;
+	mres->master = master;
+}
+
+
+/* returns 1 if this lock fills the network structure,
+ * 0 otherwise */
+static int dlm_add_lock_to_array(struct dlm_lock *lock,
+				 struct dlm_migratable_lockres *mres, int queue)
+{
+	struct dlm_migratable_lock *ml;
+	int lock_num = mres->num_locks;
+
+	ml = &(mres->ml[lock_num]);
+	ml->cookie = lock->ml.cookie;
+	ml->type = lock->ml.type;
+	ml->convert_type = lock->ml.convert_type;
+	ml->highest_blocked = lock->ml.highest_blocked;
+	ml->list = queue;
+	if (lock->lksb) {
+		ml->flags = lock->lksb->flags;
+		/* send our current lvb */
+		if (ml->type == LKM_EXMODE ||
+		    ml->type == LKM_PRMODE) {
+			/* if it is already set, this had better be a PR
+			 * and it has to match */
+			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
+			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+				mlog(ML_ERROR, "mismatched lvbs!\n");
+				__dlm_print_one_lock_resource(lock->lockres);
+				BUG();
+			}
+			memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+		}
+	}
+	ml->node = lock->ml.node;
+	mres->num_locks++;
+	/* we reached the max, send this network message */
+	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
+		return 1;
+	return 0;
+}
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to, u8 flags)
+{
+	struct list_head *queue, *iter;
+	int total_locks, i;
+	u64 mig_cookie = 0;
+	struct dlm_lock *lock;
+	int ret = 0;
+
+	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	mlog(0, "sending to %u\n", send_to);
+
+	total_locks = dlm_num_locks_in_lockres(res);
+	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
+		/* rare, but possible */
+		mlog(0, "argh.  lockres has %d locks.  this will "
+			  "require more than one network packet to "
+			  "migrate\n", total_locks);
+		mig_cookie = dlm_get_next_mig_cookie();
+	}
+
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, total_locks,
+				    mig_cookie, flags, res->owner);
+
+	total_locks = 0;
+	for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+
+			/* add another lock. */
+			total_locks++;
+			if (!dlm_add_lock_to_array(lock, mres, i))
+				continue;
+
+			/* this filled the lock message,
+			 * we must send it immediately. */
+			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
+						       res, total_locks);
+			if (ret < 0) {
+				// TODO
+				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
+				     "returned %d, TODO\n", ret);
+				BUG();
+			}
+		}
+	}
+	/* flush any remaining locks */
+	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
+	if (ret < 0) {
+		// TODO
+		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
+		     "TODO\n", ret);
+		BUG();
+	}
+	return ret;
+}
+
+
+
+/*
+ * this message will contain no more than one page worth of
+ * recovery data, and it will work on only one lockres.
+ * there may be many locks in this page, and we may need to wait
+ * for additional packets to complete all the locks (rare, but
+ * possible).
+ */
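+/* Worked sizing sketch (the authoritative constants live in
+ * dlmcommon.h): one page holds the dlm_migratable_lockres header plus
+ * an array of dlm_migratable_lock entries, so the per-message cap is
+ * roughly
+ *
+ *	(PAGE_SIZE - sizeof(struct dlm_migratable_lockres)) /
+ *		sizeof(struct dlm_migratable_lock)
+ *
+ * which is what DLM_MAX_MIGRATABLE_LOCKS reflects; anything beyond
+ * that spills into additional messages tied together by mig_cookie. */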
+/*
+ * NOTE: the allocation error cases here are scary.
+ * We really cannot afford to fail an alloc in recovery;
+ * do we spin?  Returning an error only delays the problem, really.
+ */
+
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_migratable_lockres *mres =
+		(struct dlm_migratable_lockres *)msg->buf;
+	int ret = 0;
+	u8 real_master;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+	struct dlm_lock_resource *res = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	real_master = mres->master;
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* cannot migrate a lockres with no master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+	}
+
+	mlog(0, "%s message received from node %u\n",
+		  (mres->flags & DLM_MRES_RECOVERY) ?
+		  "recovery" : "migration", mres->master);
+	if (mres->flags & DLM_MRES_ALL_DONE)
+		mlog(0, "all done flag.  all lockres data received!\n");
+
+	ret = -ENOMEM;
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!buf || !item)
+		goto leave;
+
+	/* lookup the lock to see if we have a secondary queue for this
+	 * already...  just add the locks in and this will have its owner
+	 * and RECOVERY flag changed when it completes. */
+	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	if (res) {
+		/* dlm_lookup_lockres has taken a ref on res for us */
+		/* mark it as recovering/migrating */
+		spin_lock(&res->spinlock);
+		if (mres->flags & DLM_MRES_RECOVERY) {
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		} else {
+			if (res->state & DLM_LOCK_RES_MIGRATING) {
+				/* this is at least the second
+				 * lockres message */
+				mlog(0, "lock %.*s is already migrating\n",
+					  mres->lockname_len,
+					  mres->lockname);
+			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
+				/* caller should BUG */
+				mlog(ML_ERROR, "node is attempting to migrate "
+				     "lock %.*s, but marked as recovering!\n",
+				     mres->lockname_len, mres->lockname);
+				ret = -EFAULT;
+				spin_unlock(&res->spinlock);
+				goto leave;
+			}
+			res->state |= DLM_LOCK_RES_MIGRATING;
+		}
+		spin_unlock(&res->spinlock);
+	} else {
+		/* need to allocate, just like if it was
+		 * mastered here normally  */
+		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
+		if (!res)
+			goto leave;
+
+		/* to match the ref that we would have gotten if
+		 * dlm_lookup_lockres had succeeded */
+		dlm_lockres_get(res);
+
+		/* mark it as recovering/migrating and hash it */
+		if (mres->flags & DLM_MRES_RECOVERY)
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		else
+			res->state |= DLM_LOCK_RES_MIGRATING;
+
+		spin_lock(&dlm->spinlock);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&dlm->spinlock);
+
+		/* now that the new lockres is inserted,
+		 * make it usable by other processes */
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		spin_unlock(&res->spinlock);
+
+		/* add an extra ref for just-allocated lockres 
+		 * otherwise the lockres will be purged immediately */
+		dlm_lockres_get(res);
+
+	}
+
+	/* at this point we have allocated everything we need,
+	 * and we have a hashed lockres with an extra ref and
+	 * the proper res->state flags. */
+	ret = 0;
+	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* migration cannot have an unknown master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+		mlog(0, "recovery has passed me a lockres with an "
+			  "unknown owner.. will need to requery: "
+			  "%.*s\n", mres->lockname_len, mres->lockname);
+	} else {
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		spin_unlock(&res->spinlock);
+	}
+
+	/* queue up work for dlm_mig_lockres_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */
+	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
+	item->u.ml.lockres = res; /* already have a ref */
+	item->u.ml.real_master = real_master;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+
+leave:
+	dlm_put(dlm);
+	if (ret < 0) {
+		kfree(buf);
+		kfree(item);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm;
+	struct dlm_migratable_lockres *mres;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	u8 real_master;
+
+	dlm = item->dlm;
+	mres = (struct dlm_migratable_lockres *)data;
+
+	res = item->u.ml.lockres;
+	real_master = item->u.ml.real_master;
+
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* this case is super-rare. only occurs if
+		 * node death happens during migration. */
+again:
+		ret = dlm_lockres_master_requery(dlm, res, &real_master);
+		if (ret < 0) {
+			mlog(0, "dlm_lockres_master_requery failure: %d\n",
+				  ret);
+			goto again;
+		}
+		if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lockres %.*s not claimed.  "
+				   "this node will take it.\n",
+				   res->lockname.len, res->lockname.name);
+		} else {
+			mlog(0, "master needs to respond to sender "
+				  "that node %u still owns %.*s\n",
+				  real_master, res->lockname.len,
+				  res->lockname.name);
+			/* cannot touch this lockres */
+			goto leave;
+		}
+	}
+
+	ret = dlm_process_recovery_data(dlm, res, mres);
+	if (ret < 0)
+		mlog(0, "dlm_process_recovery_data returned %d\n", ret);
+	else
+		mlog(0, "dlm_process_recovery_data succeeded\n");
+
+	if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
+	                   (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
+		ret = dlm_finish_migration(dlm, res, mres->master);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+leave:
+	kfree(data);
+	mlog_exit(ret);
+}
+
+
+
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master)
+{
+	struct dlm_node_iter iter;
+	int nodenum;
+	int ret = 0;
+
+	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	/* we only reach here if one of the two nodes in a
+	 * migration died while the migration was in progress.
+	 * at this point we need to requery the master.  we
+	 * know that the new_master got as far as creating
+	 * an mle on at least one node, but we do not know
+	 * if any nodes had actually cleared the mle and set
+	 * the master to the new_master.  the old master
+	 * is supposed to set the owner to UNKNOWN in the
+	 * event of a new_master death, so the only possible
+	 * responses that we can get from nodes here are
+	 * that the master is new_master, or that the master
+	 * is UNKNOWN.
+	 * if all nodes come back with UNKNOWN then we know
+	 * the lock needs remastering here.
+	 * if any node comes back with a valid master, check
+	 * to see if that master is the one that we are
+	 * recovering.  if so, then the new_master died and
+	 * we need to remaster this lock.  if not, then the
+	 * new_master survived and that node will respond to
+	 * other nodes about the owner.
+	 * if there is an owner, this node needs to dump this
+	 * lockres and alert the sender that this lockres
+	 * was rejected. */
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		/* do not send to self */
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
+		if (ret < 0) {
+			mlog_errno(ret);
+			BUG();
+			/* TODO: need to figure a way to restart this */
+		}
+		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lock master is %u\n", *real_master);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master)
+{
+	int ret = -EINVAL;
+	struct dlm_master_requery req;
+	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	memset(&req, 0, sizeof(req));
+	req.node_idx = dlm->node_num;
+	req.namelen = res->lockname.len;
+	memcpy(req.name, res->lockname.name, res->lockname.len);
+
+	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
+				 &req, sizeof(req), nodenum, &status);
+	/* XXX: negative status not handled properly here. */
+	if (ret < 0)
+		mlog_errno(ret);
+	else {
+		BUG_ON(status < 0);
+		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
+		*real_master = (u8) (status & 0xff);
+		mlog(0, "node %u responded to master requery with %u\n",
+			  nodenum, *real_master);
+		ret = 0;
+	}
+	return ret;
+}
+
+
+/* this function cannot error, so unless the sending
+ * or receiving of the message failed, the owner can
+ * be trusted */
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
+	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+
+	if (!dlm_grab(dlm)) {
+		/* since the domain has gone away on this
+		 * node, the proper response is UNKNOWN */
+		return master;
+	}
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		master = res->owner;
+		if (master == dlm->node_num) {
+			int ret = dlm_dispatch_assert_master(dlm, res,
+							     0, 0, flags);
+			if (ret < 0) {
+				mlog_errno(-ENOMEM);
+				/* retry!? */
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+	return master;
+}
+
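+/* maps a queue index from the wire (ml->list) back to the matching
+ * list_head; like dlm_num_locks_in_lockres() above, this relies on
+ * granted/converting/blocked being consecutive members of
+ * struct dlm_lock_resource */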
+static inline struct list_head *
+dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
+{
+	struct list_head *ret;
+	BUG_ON(list_num < 0);
+	BUG_ON(list_num > 2);
+	ret = &(res->granted);
+	ret += list_num;
+	return ret;
+}
+
+/* TODO: do ast flush business
+ * TODO: do MIGRATING and RECOVERING spinning
+ */
+
+/*
+ * NOTE about in-flight requests during migration:
+ *
+ * Before attempting the migrate, the master has marked the lockres as
+ * MIGRATING and then flushed all of its pending ASTS.  So any in-flight
+ * requests either got queued before the MIGRATING flag got set, in which
+ * case the lock data will reflect the change and a return message is on
+ * the way, or the request failed to get in before MIGRATING got set.  In
+ * this case, the caller will be told to spin and wait for the MIGRATING
+ * flag to be dropped, then recheck the master.
+ * This holds true for the convert, cancel and unlock cases, and since lvb
+ * updates are tied to these same messages, it applies to lvb updates as
+ * well.  For the lock case, there is no way a lock can be on the master
+ * queue and not be on the secondary queue since the lock is always added
+ * locally first.  This means that the new target node will never be sent
+ * a lock that he doesn't already have on the list.
+ * In total, this means that the local lock is correct and should not be
+ * updated to match the one sent by the master.  Any messages sent back
+ * from the master before the MIGRATING flag will bring the lock properly
+ * up-to-date, and the change will be ordered properly for the waiter.
+ * We will *not* attempt to modify the lock underneath the waiter.
+ */
+
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres)
+{
+	struct dlm_migratable_lock *ml;
+	struct list_head *queue;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	int ret = 0;
+	int i;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+
+	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
+	for (i = 0; i < mres->num_locks; i++) {
+		ml = &(mres->ml[i]);
+		BUG_ON(ml->highest_blocked != LKM_IVMODE);
+		newlock = NULL;
+		lksb = NULL;
+
+		queue = dlm_list_num_to_pointer(res, ml->list);
+
+		/* if the lock is for the local node it needs to
+		 * be moved to the proper location within the queue.
+		 * do not allocate a new lock structure. */
+		if (ml->node == dlm->node_num) {
+			/* MIGRATION ONLY! */
+			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+
+			spin_lock(&res->spinlock);
+			list_for_each(iter, queue) {
+				lock = list_entry(iter, struct dlm_lock, list);
+				if (lock->ml.cookie != ml->cookie)
+					lock = NULL;
+				else
+					break;
+			}
+
+			/* lock is always created locally first, and
+			 * destroyed locally last.  it must be on the list */
+			if (!lock) {
+				mlog(ML_ERROR, "could not find local lock "
+					       "with cookie %"MLFu64"!\n",
+				     ml->cookie);
+				BUG();
+			}
+			BUG_ON(lock->ml.node != ml->node);
+
+			/* see NOTE above about why we do not update
+			 * to match the master here */
+
+			/* move the lock to its proper place */
+			/* do not alter lock refcount.  switching lists. */
+			list_del_init(&lock->list);
+			list_add_tail(&lock->list, queue);
+			spin_unlock(&res->spinlock);
+
+			mlog(0, "just reordered a local lock!\n");
+			continue;
+		}
+
+		/* lock is for another node. */
+		newlock = dlm_new_lock(ml->type, ml->node,
+				       be64_to_cpu(ml->cookie), NULL);
+		if (!newlock) {
+			ret = -ENOMEM;
+			goto leave;
+		}
+		lksb = newlock->lksb;
+		dlm_lock_attach_lockres(newlock, res);
+
+		if (ml->convert_type != LKM_IVMODE) {
+			BUG_ON(queue != &res->converting);
+			newlock->ml.convert_type = ml->convert_type;
+		}
+		lksb->flags |= (ml->flags &
+				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+
+		if (mres->lvb[0]) {
+			if (lksb->flags & DLM_LKSB_PUT_LVB) {
+				/* other node was trying to update
+				 * lvb when node died.  recreate the
+				 * lksb with the updated lvb. */
+				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+			} else {
+				/* otherwise, the node is sending its 
+				 * most recent valid lvb info */
+				BUG_ON(ml->type != LKM_EXMODE &&
+				       ml->type != LKM_PRMODE);
+				if (res->lvb[0] && (ml->type == LKM_EXMODE ||
+				    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+					mlog(ML_ERROR, "received bad lvb!\n");
+					__dlm_print_one_lock_resource(res);
+					BUG();
+				}
+				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
+			}
+		}
+
+
+		/* NOTE:
+		 * wrt lock queue ordering and recovery:
+		 *    1. order of locks on granted queue is
+		 *       meaningless.
+		 *    2. order of locks on converting queue is
+		 *       LOST with the node death.  sorry charlie.
+		 *    3. order of locks on the blocked queue is
+		 *       also LOST.
+		 * order of locks does not affect integrity, it
+		 * just means that a lock request may get pushed
+		 * back in line as a result of the node death.
+		 * also note that for a given node the lock order
+		 * for its secondary queue locks is preserved
+		 * relative to each other, but clearly *not*
+		 * preserved relative to locks from other nodes.
+		 */
+		spin_lock(&res->spinlock);
+		dlm_lock_get(newlock);
+		list_add_tail(&newlock->list, queue);
+		spin_unlock(&res->spinlock);
+	}
+	mlog(0, "done running all the locks\n");
+
+leave:
+	if (ret < 0) {
+		mlog_errno(ret);
+		if (newlock)
+			dlm_lock_put(newlock);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue, *iter, *iter2;
+	struct dlm_lock *lock;
+
+	res->state |= DLM_LOCK_RES_RECOVERING;
+	if (!list_empty(&res->recovering))
+		list_del_init(&res->recovering);
+	list_add_tail(&res->recovering, &dlm->reco.resources);
+
+	/* find any pending locks and put them back on proper list */
+	for (i = DLM_BLOCKED_LIST; i >= DLM_GRANTED_LIST; i--) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each_safe(iter, iter2, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			dlm_lock_get(lock);
+			if (lock->convert_pending) {
+				/* move converting lock back to granted */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with convert pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_revert_pending_convert(res, lock);
+				lock->convert_pending = 0;
+			} else if (lock->lock_pending) {
+				/* remove pending lock requests completely */
+				BUG_ON(i != DLM_BLOCKED_LIST);
+				mlog(0, "node died with lock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				/* lock will be floating until ref in
+				 * dlmlock_remote is freed after the network
+				 * call returns.  ok for it to not be on any
+				 * list since no ast can be called
+				 * (the master is dead). */
+				dlm_revert_pending_lock(res, lock);
+				lock->lock_pending = 0;
+			} else if (lock->unlock_pending) {
+				/* if an unlock was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master.  note that the dlm_unlock
+				 * call is still responsible for calling
+				 * the unlockast.  that will happen after
+				 * the network call times out.  for now,
+				 * just move lists to prepare the new
+				 * recovery master.  */
+				BUG_ON(i != DLM_GRANTED_LIST);
+				mlog(0, "node died with unlock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_unlock(res, lock);
+				lock->unlock_pending = 0;
+			} else if (lock->cancel_pending) {
+				/* if a cancel was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with cancel pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_cancel(res, lock);
+				lock->cancel_pending = 0;
+			}
+			dlm_lock_put(lock);
+		}
+	}
+}
+
+
+
+/* removes all recovered locks from the recovery list.
+ * sets the res->owner to the new master.
+ * unsets the RECOVERY flag and wakes waiters. */
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master)
+{
+	int i;
+	struct list_head *iter, *iter2, *bucket;
+	struct dlm_lock_resource *res;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+
+	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
+		res = list_entry(iter, struct dlm_lock_resource, recovering);
+		if (res->owner == dead_node) {
+			list_del_init(&res->recovering);
+			spin_lock(&res->spinlock);
+			dlm_change_lockres_owner(dlm, res, new_master);
+			res->state &= ~DLM_LOCK_RES_RECOVERING;
+			__dlm_dirty_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			wake_up(&res->wq);
+		}
+	}
+
+	/* this will become unnecessary eventually, but
+	 * for now we need to run the whole hash, clear
+	 * the RECOVERING state and set the owner
+	 * if necessary */
+	for (i = 0; i < DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			if (res->state & DLM_LOCK_RES_RECOVERING) {
+				if (res->owner == dead_node) {
+					mlog(0, "(this=%u) res %.*s was not "
+					     "on recovering list, but "
+					     "clearing state anyway "
+					     "(new master %u)\n",
+					     dlm->node_num, res->lockname.len,
+					     res->lockname.name, new_master);
+				} else if (res->owner == dlm->node_num) {
+					mlog(0, "(this=%u) res %.*s was not "
+					     "on recovering list, owner is "
+					     "THIS node, clearing "
+					     "(new master %u)\n",
+					     dlm->node_num, res->lockname.len,
+					     res->lockname.name, new_master);
+				} else
+					continue;
+
+				spin_lock(&res->spinlock);
+				dlm_change_lockres_owner(dlm, res, new_master);
+				res->state &= ~DLM_LOCK_RES_RECOVERING;
+				__dlm_dirty_lockres(dlm, res);
+				spin_unlock(&res->spinlock);
+				wake_up(&res->wq);
+			}
+		}
+	}
+}
+
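+/* Invalidation rule, summarized: when this node masters the lockres,
+ * blank the lvb if the *dead* node held EX (its last write may be
+ * lost); on a secondary lockres, blank it unless *this* node holds EX
+ * or PR, since only those modes guarantee the cached lvb is current. */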
+static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
+{
+	if (local) {
+		if (lock->ml.type != LKM_EXMODE &&
+		    lock->ml.type != LKM_PRMODE)
+			return 1;
+	} else if (lock->ml.type == LKM_EXMODE)
+		return 1;
+	return 0;
+}
+
+static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *iter, *queue;
+	struct dlm_lock *lock;
+	int blank_lvb = 0, local = 0;
+	int i;
+	u8 search_node;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (res->owner == dlm->node_num) {
+		/* if this node owned the lockres, and if the dead node
+		 * had an EX when he died, blank out the lvb */
+		search_node = dead_node;
+	} else {
+		/* if this is a secondary lockres, and we had no EX or PR
+		 * locks granted, we can no longer trust the lvb */
+		search_node = dlm->node_num;
+		local = 1;  /* check local state for valid lvb */
+	}
+
+	for (i = DLM_GRANTED_LIST; i <= DLM_CONVERTING_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node == search_node) {
+				if (dlm_lvb_needs_invalidation(lock, local)) {
+					/* zero the lksb lvb and lockres lvb */
+					blank_lvb = 1;
+					memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
+				}
+			}
+		}
+	}
+
+	if (blank_lvb) {
+		mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
+		     res->lockname.len, res->lockname.name, dead_node);
+		memset(res->lvb, 0, DLM_LVB_LEN);
+	}
+}
+
+static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *iter, *tmpiter;
+	struct dlm_lock *lock;
+
+	/* this node is the lockres master:
+	 * 1) remove any stale locks for the dead node
+	 * 2) if the dead node had an EX when he died, blank out the lvb 
+	 */
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* TODO: check pending_asts, pending_basts here */
+	list_for_each_safe(iter, tmpiter, &res->granted) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+	list_for_each_safe(iter, tmpiter, &res->converting) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+	list_for_each_safe(iter, tmpiter, &res->blocked) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.node == dead_node) {
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+
+	/* do not kick thread yet */
+	__dlm_dirty_lockres(dlm, res);
+}
+
+/* if this node is the recovery master, and there are no
+ * locks for a given lockres owned by this node that are in
+ * either PR or EX mode, zero out the lvb before requesting.
+ *
+ */
+
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter;
+	struct dlm_lock_resource *res;
+	int i;
+	struct list_head *bucket;
+
+	/* purge any stale mles */
+	dlm_clean_master_list(dlm, dead_node);
+
+	/*
+	 * now clean up all lock resources.  there are two rules:
+	 *
+	 * 1) if the dead node was the master, move the lockres
+	 *    to the recovering list.  set the RECOVERING flag.
+	 *    this lockres needs to be cleaned up before it can
+	 *    be used further.
+	 *
+	 * 2) if this node was the master, remove all locks from
+	 *    each of the lockres queues that were owned by the
+	 *    dead node.  once recovery finishes, the dlm thread
+	 *    can be kicked again to see if any ASTs or BASTs
+	 *    need to be fired as a result.
+	 */
+	for (i = 0; i < DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			if (dlm_is_recovery_lock(res->lockname.name,
+						 res->lockname.len))
+				continue;
+
+			spin_lock(&res->spinlock);
+			/* zero the lvb if necessary */
+			dlm_revalidate_lvb(dlm, res, dead_node);
+			if (res->owner == dead_node)
+				dlm_move_lockres_to_recovery_list(dlm, res);
+			else if (res->owner == dlm->node_num) {
+				dlm_free_dead_locks(dlm, res, dead_node);
+				__dlm_lockres_calc_usage(dlm, res);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+}
+
+static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	/* check to see if the node is already considered dead */
+	if (!test_bit(idx, dlm->live_nodes_map)) {
+		mlog(0, "for domain %s, node %d is already dead. "
+		     "another node likely did recovery already.\n",
+		     dlm->name, idx);
+		return;
+	}
+
+	/* check to see if we do not care about this node */
+	if (!test_bit(idx, dlm->domain_map)) {
+		/* This also catches the case that we get a node down
+		 * but haven't joined the domain yet. */
+		mlog(0, "node %u already removed from domain!\n", idx);
+		return;
+	}
+
+	clear_bit(idx, dlm->live_nodes_map);
+
+	/* Clean up join state on node death. */
+	if (dlm->joining_node == idx) {
+		mlog(0, "Clearing join state for node %u\n", idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	}
+
+	/* make sure local cleanup occurs before the heartbeat events */
+	if (!test_bit(idx, dlm->recovery_map))
+		dlm_do_local_recovery_cleanup(dlm, idx);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 0);
+
+	mlog(0, "node %u being removed from domain map!\n", idx);
+	clear_bit(idx, dlm->domain_map);
+	/* wake up migration waiters if a node goes down.
+	 * perhaps later we can genericize this for other waiters. */
+	wake_up(&dlm->migration_wq);
+
+	if (test_bit(idx, dlm->recovery_map))
+		mlog(0, "domain %s, node %u already added "
+		     "to recovery map!\n", dlm->name, idx);
+	else
+		set_bit(idx, dlm->recovery_map);
+}
+
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+	__dlm_hb_node_down(dlm, idx);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+
+	set_bit(idx, dlm->live_nodes_map);
+
+	/* notify any mles attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 1);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+static void dlm_reco_ast(void *astdata)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "ast for recovery lock fired! this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_bast(void *astdata, int blocked_type)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "bast for recovery lock fired! this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
+{
+	mlog(0, "unlockast for recovery lock fired!\n");
+}
+
+
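+/* The recovery master is elected by racing for an EX lock on the
+ * special DLM_RECOVERY_LOCK_NAME resource with LKM_NOQUEUE: exactly
+ * one node gets DLM_NORMAL and masters recovery; the rest see
+ * DLM_NOTQUEUED and wait for the winner's begin-reco message. */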
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
+{
+	enum dlm_status ret;
+	struct dlm_lockstatus lksb;
+	int status = -EINVAL;
+
+	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
+	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
+retry:
+	memset(&lksb, 0, sizeof(lksb));
+
+	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
+		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+	if (ret == DLM_NORMAL) {
+		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* I am master, send message to all nodes saying
+		 * that I am beginning a recovery session */
+		status = dlm_send_begin_reco_message(dlm,
+					      dlm->reco.dead_node);
+
+		/* recovery lock is a special case.  ast will not get fired,
+		 * so just go ahead and unlock it. */
+		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
+		if (ret != DLM_NORMAL) {
+			/* this would really suck. this could only happen
+			 * if there was a network error during the unlock
+			 * because of node death.  this means the unlock
+			 * is actually "done" and the lock structure is
+			 * even freed.  we can continue, but only
+			 * because this specific lock name is special. */
+			mlog(0, "dlmunlock returned %d\n", ret);
+		}
+
+		if (status < 0) {
+			mlog(0, "failed to send recovery message. "
+				   "must retry with new node map.\n");
+			goto retry;
+		}
+	} else if (ret == DLM_NOTQUEUED) {
+		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* another node is master. wait on
+		 * reco.new_master != O2NM_INVALID_NODE_NUM */
+		status = -EEXIST;
+	}
+
+	return status;
+}
+
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_begin_reco br;
+	int ret = 0;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog_entry("%u\n", dead_node);
+
+	mlog(0, "dead node is %u\n", dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dead_node, iter.node_map);
+
+	memset(&br, 0, sizeof(br));
+	br.node_idx = dlm->node_num;
+	br.dead_node = dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = 0;
+		if (nodenum == dead_node) {
+			mlog(0, "not sending begin reco to dead node "
+				  "%u\n", dead_node);
+			continue;
+		}
+		if (nodenum == dlm->node_num) {
+			mlog(0, "not sending begin reco to self\n");
+			continue;
+		}
+
+		mlog(0, "attempting to send begin reco msg to %d\n",
+			  nodenum);
+		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
+					 &br, sizeof(br), nodenum, &status);
+		/* negative status is handled ok by caller here */
+		if (ret >= 0)
+			ret = status;
+		if (ret < 0) {
+			struct dlm_lock_resource *res;
+			mlog_errno(ret);
+			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
+			    "returned %d\n", dlm->name, nodenum, ret);
+			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+						 DLM_RECOVERY_LOCK_NAME_LEN);
+			if (res) {
+				dlm_print_one_lock_resource(res);
+				dlm_lockres_put(res);
+			} else {
+				mlog(ML_ERROR, "recovery lock not found\n");
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	mlog(0, "node %u wants to recover node %u\n",
+		  br->node_idx, br->dead_node);
+
+	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+		mlog(0, "new_master already set to %u!\n",
+			  dlm->reco.new_master);
+	}
+	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
+		mlog(0, "dead_node already set to %u!\n",
+			  dlm->reco.dead_node);
+	}
+	dlm->reco.new_master = br->node_idx;
+	dlm->reco.dead_node = br->dead_node;
+	if (!test_bit(br->dead_node, dlm->recovery_map)) {
+		mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
+		     "node has not yet.  marking %u as dead\n",
+		     br->node_idx, br->dead_node, br->dead_node);
+		__dlm_hb_node_down(dlm, br->dead_node);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_kick_recovery_thread(dlm);
+	dlm_put(dlm);
+	return 0;
+}
+
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+	struct dlm_finalize_reco fr;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog(0, "finishing recovery for node %s:%u\n",
+	     dlm->name, dlm->reco.dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	memset(&fr, 0, sizeof(fr));
+	fr.node_idx = dlm->node_num;
+	fr.dead_node = dlm->reco.dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
+					 &fr, sizeof(fr), nodenum, &status);
+		if (ret >= 0) {
+			ret = status;
+			if (dlm_is_host_down(ret)) {
+				/* this has no effect on this recovery 
+				 * session, so set the status to zero to 
+				 * finish out the last recovery */
+				mlog(ML_ERROR, "node %u went down after this "
+				     "node finished recovery.\n", nodenum);
+				ret = 0;
+			}
+		}
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	mlog(0, "node %u finalizing recovery of node %u\n",
+	     fr->node_idx, fr->dead_node);
+
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->reco.new_master != fr->node_idx) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
+		     "%u is supposed to be the new master, dead=%u\n",
+		     fr->node_idx, dlm->reco.new_master, fr->dead_node);
+		BUG();
+	}
+	if (dlm->reco.dead_node != fr->dead_node) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
+		     "node %u, but node %u is supposed to be dead\n",
+		     fr->node_idx, fr->dead_node, dlm->reco.dead_node);
+		BUG();
+	}
+
+	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_reset_recovery(dlm);
+
+	dlm_kick_recovery_thread(dlm);
+	dlm_put(dlm);
+	return 0;
+}

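[Editor's note -- not part of the patch.]  dlm_pick_recovery_master()
above implements a simple election: every live node races for an
LKM_NOQUEUE|LKM_RECOVERY exclusive lock on the well-known
DLM_RECOVERY_LOCK_NAME resource; the winner broadcasts
DLM_BEGIN_RECO_MSG, while the losers get DLM_NOTQUEUED and wait for
reco.new_master to be filled in by dlm_begin_reco_handler().  A
minimal sketch of a caller's side, with the wrapper name purely
hypothetical:

	/* hypothetical wrapper -- illustrates the return values only */
	static void example_reco_election(struct dlm_ctxt *dlm)
	{
		int status = dlm_pick_recovery_master(dlm);

		if (status == -EEXIST) {
			/* lost the race: another node holds the
			 * recovery lock; wait for reco.new_master */
			return;
		}
		if (status >= 0) {
			/* won the race: this node is now recovery
			 * master for reco.dead_node */
		}
	}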
+ 692 - 0
fs/ocfs2/dlm/dlmthread.c

@@ -0,0 +1,692 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
+#include "cluster/masklog.h"
+
+static int dlm_thread(void *data);
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm);
+
+#define dlm_lock_is_remote(dlm, lock)     ((lock)->ml.node != (dlm)->node_num)
+
+/* will exit holding res->spinlock, but may drop in function */
+/* waits until flags are cleared on res->state */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	assert_spin_locked(&res->spinlock);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & flags) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	current->state = TASK_RUNNING;
+}
+
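+/* (editor's note, not in the original patch)  The open-coded wait
+ * above is typically reached through the __dlm_wait_on_lockres()
+ * wrapper with res->spinlock already held -- see its use in
+ * dlmunlock.c below:
+ *
+ *	spin_lock(&res->spinlock);
+ *	__dlm_wait_on_lockres(res);
+ */
+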
+
+static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+	if (list_empty(&res->granted) &&
+	    list_empty(&res->converting) &&
+	    list_empty(&res->blocked) &&
+	    list_empty(&res->dirty))
+		return 1;
+	return 0;
+}
+
+
+/* Call whenever you may have added or deleted something from one of
+ * the lockres queues. This will figure out whether it belongs on the
+ * unused list or not and does the appropriate thing. */
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (__dlm_lockres_unused(res)) {
+		if (list_empty(&res->purge)) {
+			mlog(0, "putting lockres %.*s onto purge list\n",
+			     res->lockname.len, res->lockname.name);
+
+			res->last_used = jiffies;
+			list_add_tail(&res->purge, &dlm->purge_list);
+			dlm->purge_count++;
+		}
+	} else if (!list_empty(&res->purge)) {
+		mlog(0, "removing lockres %.*s from purge list\n",
+		     res->lockname.len, res->lockname.name);
+
+		list_del_init(&res->purge);
+		dlm->purge_count--;
+	}
+}
+
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	spin_lock(&dlm->spinlock);
+	spin_lock(&res->spinlock);
+
+	__dlm_lockres_calc_usage(dlm, res);
+
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
+ * to do migration, but will re-acquire before exit. */
+void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
+{
+	int master;
+	int ret;
+
+	spin_lock(&lockres->spinlock);
+	master = lockres->owner == dlm->node_num;
+	spin_unlock(&lockres->spinlock);
+
+	mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
+	     lockres->lockname.name, master);
+
+	/* Non master is the easy case -- no migration required, just
+	 * quit. */
+	if (!master)
+		goto finish;
+
+	/* Wheee! Migrate lockres here! */
+	spin_unlock(&dlm->spinlock);
+again:
+
+	ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
+	if (ret == -ENOTEMPTY) {
+		mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
+		     lockres->lockname.len, lockres->lockname.name);
+
+		BUG();
+	} else if (ret < 0) {
+		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
+		     lockres->lockname.len, lockres->lockname.name);
+		goto again;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+finish:
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
+static void dlm_run_purge_list(struct dlm_ctxt *dlm,
+			       int purge_now)
+{
+	unsigned int run_max, unused;
+	unsigned long purge_jiffies;
+	struct dlm_lock_resource *lockres;
+
+	spin_lock(&dlm->spinlock);
+	run_max = dlm->purge_count;
+
+	while (run_max && !list_empty(&dlm->purge_list)) {
+		run_max--;
+
+		lockres = list_entry(dlm->purge_list.next,
+				     struct dlm_lock_resource, purge);
+
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting any more
+		 * refs on it -- there's no need to keep the lockres
+		 * spinlock. */
+		spin_lock(&lockres->spinlock);
+		unused = __dlm_lockres_unused(lockres);
+		spin_unlock(&lockres->spinlock);
+
+		if (!unused)
+			continue;
+
+		purge_jiffies = lockres->last_used +
+			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
+
+		/* Make sure that we want to be processing this guy at
+		 * this time. */
+		if (!purge_now && time_after(purge_jiffies, jiffies)) {
+			/* Since resources are added to the purge list
+			 * in tail order, we can stop at the first
+			 * unpurgeable resource -- anything added after
+			 * it will have a greater last_used value */
+			break;
+		}
+
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+
+		/* This may drop and reacquire the dlm spinlock if it
+		 * has to do migration. */
+		mlog(0, "calling dlm_purge_lockres!\n");
+		dlm_purge_lockres(dlm, lockres);
+		mlog(0, "DONE calling dlm_purge_lockres!\n");
+
+		/* Avoid adding any scheduling latencies */
+		cond_resched_lock(&dlm->spinlock);
+	}
+
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	struct dlm_lock *lock, *target;
+	struct list_head *iter;
+	struct list_head *head;
+	int can_grant = 1;
+
+	//mlog(0, "res->lockname.len=%d\n", res->lockname.len);
+	//mlog(0, "res->lockname.name=%p\n", res->lockname.name);
+	//mlog(0, "shuffle res %.*s\n", res->lockname.len,
+	//	  res->lockname.name);
+
+	/* because this function is called with the lockres
+	 * spinlock, and because we know that it is not migrating/
+	 * recovering/in-progress, it is fine to reserve asts and
+	 * basts right before queueing them all throughout */
+	assert_spin_locked(&res->spinlock);
+	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
+			      DLM_LOCK_RES_RECOVERING|
+			      DLM_LOCK_RES_IN_PROGRESS)));
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
+	     res->lockname.name);
+
+	target = list_entry(res->converting.next, struct dlm_lock, list);
+	if (target->ml.convert_type == LKM_IVMODE) {
+		mlog(ML_ERROR, "%.*s: converting a lock with no "
+		     "convert_type!\n", res->lockname.len, res->lockname.name);
+		BUG();
+	}
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			/* queue the BAST if not already */
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			/* update the highest_blocked if needed */
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+
+	/* we can convert the lock */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "calling ast for converting lock: %.*s, have: %d, "
+		     "granting: %d, node: %u\n", res->lockname.len,
+		     res->lockname.name, target->ml.type,
+		     target->ml.convert_type, target->ml.node);
+
+		target->ml.type = target->ml.convert_type;
+		target->ml.convert_type = LKM_IVMODE;
+		list_del_init(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked))
+		goto leave;
+	target = list_entry(res->blocked.next, struct dlm_lock, list);
+
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock == target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	/* we can grant the blocked lock (only
+	 * possible if converting list empty) */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
+		     "node: %u\n", res->lockname.len, res->lockname.name,
+		     target->ml.type, target->ml.node);
+
+		/* target->ml.type is already correct */
+		list_del_init(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+leave:
+	return;
+}
+
+/* must have NO locks when calling this with res != NULL */
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	mlog_entry("dlm=%p, res=%p\n", dlm, res);
+	if (res) {
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+		__dlm_dirty_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+	}
+	wake_up(&dlm->dlm_thread_wq);
+}
+
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	mlog_entry("dlm=%p, res=%p\n", dlm, res);
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* don't shuffle secondary queues */
+	if ((res->owner == dlm->node_num) &&
+	    !(res->state & DLM_LOCK_RES_DIRTY)) {
+		list_add_tail(&res->dirty, &dlm->dirty_list);
+		res->state |= DLM_LOCK_RES_DIRTY;
+	}
+}
+
+
+/* Launch the dlm thread for this domain */
+int dlm_launch_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm thread...\n");
+
+	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+	if (IS_ERR(dlm->dlm_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_thread_task));
+		dlm->dlm_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_thread_task) {
+		mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+		kthread_stop(dlm->dlm_thread_task);
+		dlm->dlm_thread_task = NULL;
+	}
+}
+
+static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
+{
+	int empty;
+
+	spin_lock(&dlm->spinlock);
+	empty = list_empty(&dlm->dirty_list);
+	spin_unlock(&dlm->spinlock);
+
+	return empty;
+}
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm)
+{
+	int ret;
+	struct dlm_lock *lock;
+	struct dlm_lock_resource *res;
+	u8 hi;
+
+	spin_lock(&dlm->ast_lock);
+	while (!list_empty(&dlm->pending_asts)) {
+		lock = list_entry(dlm->pending_asts.next,
+				  struct dlm_lock, ast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+		mlog(0, "delivering an ast for this lockres\n");
+
+		BUG_ON(!lock->ast_pending);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->ast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_do_remote_ast(dlm, res, lock);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_ast(dlm, res, lock);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another ast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->ast_list)) {
+			mlog(0, "aha another ast got queued while "
+			     "we were finishing the last one.  will "
+			     "keep the ast_pending flag set.\n");
+		} else
+			lock->ast_pending = 0;
+
+		/* drop the extra ref.
+		 * this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+
+	while (!list_empty(&dlm->pending_basts)) {
+		lock = list_entry(dlm->pending_basts.next,
+				  struct dlm_lock, bast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+
+		BUG_ON(!lock->bast_pending);
+
+		/* get the highest blocked lock, and reset */
+		spin_lock(&lock->spinlock);
+		BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
+		hi = lock->ml.highest_blocked;
+		lock->ml.highest_blocked = LKM_IVMODE;
+		spin_unlock(&lock->spinlock);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->bast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		mlog(0, "delivering a bast for this lockres "
+		     "(blocked = %d\n", hi);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_send_proxy_bast(dlm, res, lock, hi);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_bast(dlm, res, lock, hi);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another bast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->bast_list)) {
+			mlog(0, "aha another bast got queued while "
+			     "we were finishing the last one.  will "
+			     "keep the bast_pending flag set.\n");
+		} else
+			lock->bast_pending = 0;
+
+		/* drop the extra ref.
+		 * this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+	wake_up(&dlm->ast_wq);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
+#define DLM_THREAD_MAX_DIRTY  100
+#define DLM_THREAD_MAX_ASTS   10
+
+static int dlm_thread(void *data)
+{
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		int n = DLM_THREAD_MAX_DIRTY;
+
+		/* dlm_shutting_down is a point-in-time check, but
+		 * that doesn't matter as we'll just loop back around
+		 * if we get false on the leading edge of a state
+		 * transition. */
+		dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
+
+		/* We really don't want to hold dlm->spinlock while
+		 * calling dlm_shuffle_lists on each lockres that
+		 * needs to have its queues adjusted and AST/BASTs
+		 * run.  So let's pull each entry off the dirty_list
+		 * and drop dlm->spinlock ASAP.  Once off the list,
+		 * res->spinlock needs to be taken again to protect
+		 * the queues while calling dlm_shuffle_lists.  */
+		spin_lock(&dlm->spinlock);
+		while (!list_empty(&dlm->dirty_list)) {
+			int delay = 0;
+			res = list_entry(dlm->dirty_list.next,
+					 struct dlm_lock_resource, dirty);
+
+			/* peel a lockres off, remove it from the list,
+			 * unset the dirty flag and drop the dlm lock */
+			BUG_ON(!res);
+			dlm_lockres_get(res);
+
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_DIRTY;
+			list_del_init(&res->dirty);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+
+			/* lockres can be re-dirtied/re-added to the
+			 * dirty_list in this gap, but that is ok */
+
+			spin_lock(&res->spinlock);
+			if (res->owner != dlm->node_num) {
+				__dlm_print_one_lock_resource(res);
+				mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
+				     res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+			}
+			BUG_ON(res->owner != dlm->node_num);
+
+			/* it is now ok to move lockreses in these states
+			 * to the dirty list, assuming that they will only be
+			 * dirty for a short while. */
+			if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
+					  DLM_LOCK_RES_MIGRATING |
+					  DLM_LOCK_RES_RECOVERING)) {
+				/* move it to the tail and keep going */
+				spin_unlock(&res->spinlock);
+				mlog(0, "delaying list shuffling for in-"
+				     "progress lockres %.*s, state=%d\n",
+				     res->lockname.len, res->lockname.name,
+				     res->state);
+				delay = 1;
+				goto in_progress;
+			}
+
+			/* at this point the lockres is not migrating/
+			 * recovering/in-progress.  we have the lockres
+			 * spinlock and do NOT have the dlm lock.
+			 * safe to reserve/queue asts and run the lists. */
+
+			mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
+			     "res=%p\n", dlm, res);
+
+			/* called while holding lockres lock */
+			dlm_shuffle_lists(dlm, res);
+			spin_unlock(&res->spinlock);
+
+			dlm_lockres_calc_usage(dlm, res);
+
+in_progress:
+
+			spin_lock(&dlm->spinlock);
+			/* if the lock was in-progress, stick
+			 * it on the back of the list */
+			if (delay) {
+				spin_lock(&res->spinlock);
+				list_add_tail(&res->dirty, &dlm->dirty_list);
+				res->state |= DLM_LOCK_RES_DIRTY;
+				spin_unlock(&res->spinlock);
+			}
+			dlm_lockres_put(res);
+
+			/* unlikely, but we may need to give time to
+			 * other tasks */
+			if (!--n) {
+				mlog(0, "throttling dlm_thread\n");
+				break;
+			}
+		}
+
+		spin_unlock(&dlm->spinlock);
+		dlm_flush_asts(dlm);
+
+		/* yield and continue right away if there is more work to do */
+		if (!n) {
+			yield();
+			continue;
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_thread_wq,
+						 !dlm_dirty_list_empty(dlm) ||
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM thread\n");
+	return 0;
+}

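[Editor's note -- not part of the patch.]  The thread above is driven
entirely by the dirty list and dlm_kick_thread().  The intended usage
pattern, mirrored by dlm_unlock_lock_handler() in dlmunlock.c below,
is roughly:

	/* after adding/removing locks on res's queues: recompute
	 * whether res belongs on the purge list, then wake the
	 * domain thread so it shuffles the queues and delivers
	 * any queued ASTs/BASTs */
	dlm_lockres_calc_usage(dlm, res);
	dlm_kick_thread(dlm, res);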
+ 672 - 0
fs/ocfs2/dlm/dlmunlock.c

@@ -0,0 +1,672 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmunlock.c
+ *
+ * underlying calls for unlocking locks
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+#define DLM_UNLOCK_FREE_LOCK           0x00000001
+#define DLM_UNLOCK_CALL_AST            0x00000002
+#define DLM_UNLOCK_REMOVE_LOCK         0x00000004
+#define DLM_UNLOCK_REGRANT_LOCK        0x00000008
+#define DLM_UNLOCK_CLEAR_CONVERT_TYPE  0x00000010
+
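+/* (editor's note, not in the original patch)  These bits form an
+ * action mask: dlm_get_cancel_actions() and dlm_get_unlock_actions()
+ * below choose a set of them based on which queue the lock currently
+ * sits on, and dlmunlock_common() then carries the actions out --
+ * remove the lock from its list, re-grant it, clear a pending
+ * convert_type, free it, and/or fire the unlock ast. */
+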
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner);
+
+
+/*
+ * according to the spec:
+ * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+ *
+ *  flags & LKM_CANCEL != 0: must be converting or blocked
+ *  flags & LKM_CANCEL == 0: must be granted
+ *
+ * So to unlock a converting lock, you must first cancel the
+ * convert (passing LKM_CANCEL in flags), then call the unlock
+ * again (with no LKM_CANCEL in flags).
+ */
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         res->spinlock and lock->spinlock taken and dropped
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
+ * all callers should have taken an extra ref on lock coming in
+ */
+static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
+					struct dlm_lock_resource *res,
+					struct dlm_lock *lock,
+					struct dlm_lockstatus *lksb,
+					int flags, int *call_ast,
+					int master_node)
+{
+	enum dlm_status status;
+	int actions = 0;
+	int in_use;
+	u8 owner;
+
+	mlog(0, "master_node = %d, valblk = %d\n", master_node,
+	     flags & LKM_VALBLK);
+
+	if (master_node)
+		BUG_ON(res->owner != dlm->node_num);
+	else
+		BUG_ON(res->owner == dlm->node_num);
+
+	spin_lock(&dlm->spinlock);
+	/* We want to be sure that we're not freeing a lock
+	 * that still has ASTs pending... */
+	in_use = !list_empty(&lock->ast_list);
+	spin_unlock(&dlm->spinlock);
+	if (in_use) {
+		mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
+		     "while waiting for an ast!\n", res->lockname.len,
+		     res->lockname.name);
+		return DLM_BADPARAM;
+	}
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		if (master_node) {
+			mlog(ML_ERROR, "lockres in progress!\n");
+			spin_unlock(&res->spinlock);
+			return DLM_FORWARD;
+		}
+		/* ok for this to sleep if not in a network handler */
+		__dlm_wait_on_lockres(res);
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_lock(&lock->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+
+	/* see above for what the spec says about
+	 * LKM_CANCEL and the lock queue state */
+	if (flags & LKM_CANCEL)
+		status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
+	else
+		status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
+
+	if (status != DLM_NORMAL)
+		goto leave;
+
+	/* By now this has been masked out of cancel requests. */
+	if (flags & LKM_VALBLK) {
+		/* make the final update to the lvb */
+		if (master_node)
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		else
+			flags |= LKM_PUT_LVB; /* let the send function
+					       * handle it. */
+	}
+
+	if (!master_node) {
+		owner = res->owner;
+		/* drop locks and send message */
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 1;
+		else
+			lock->unlock_pending = 1;
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
+							flags, owner);
+		spin_lock(&res->spinlock);
+		spin_lock(&lock->spinlock);
+		/* if the master told us the lock was already granted,
+		 * let the ast handle all of these actions */
+		if (status == DLM_NORMAL &&
+		    lksb->status == DLM_CANCELGRANT) {
+			actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
+				     DLM_UNLOCK_REGRANT_LOCK|
+				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+		}
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 0;
+		else
+			lock->unlock_pending = 0;
+
+	}
+
+	/* get an extra ref on lock.  if we are just switching
+	 * lists here, we don't want the lock to go away. */
+	dlm_lock_get(lock);
+
+	if (actions & DLM_UNLOCK_REMOVE_LOCK) {
+		list_del_init(&lock->list);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_REGRANT_LOCK) {
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+	}
+	if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
+		mlog(0, "clearing convert_type at %smaster node\n",
+		     master_node ? "" : "non-");
+		lock->ml.convert_type = LKM_IVMODE;
+	}
+
+	/* remove the extra ref on lock */
+	dlm_lock_put(lock);
+
+leave:
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (!dlm_lock_on_list(&res->converting, lock))
+		BUG_ON(lock->ml.convert_type != LKM_IVMODE);
+	else
+		BUG_ON(lock->ml.convert_type == LKM_IVMODE);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* let the caller's final dlm_lock_put handle the actual kfree */
+	if (actions & DLM_UNLOCK_FREE_LOCK) {
+		/* this should always be coupled with list removal */
+		BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
+		mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
+		     lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_CALL_AST)
+		*call_ast = 1;
+
+	/* if cancel or unlock succeeded, lvb work is done */
+	if (status == DLM_NORMAL)
+		lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+
+	return status;
+}
+
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	/* leave DLM_LKSB_PUT_LVB on the lksb so any final
+	 * update of the lvb will be sent to the new master */
+	list_del_init(&lock->list);
+}
+
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+}
+
+
+static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags,
+					  int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
+}
+
+static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags, int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
+}
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
+ */
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner)
+{
+	struct dlm_unlock_lock unlock;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->node_num;
+	unlock.flags = cpu_to_be32(flags);
+	unlock.cookie = lock->ml.cookie;
+	unlock.namelen = res->lockname.len;
+	memcpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_unlock_lock);
+	vec[0].iov_base = &unlock;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					vec, veclen, owner, &status);
+	if (tmpret >= 0) {
+		/* successfully sent and received */
+		if (status == DLM_CANCELGRANT)
+			ret = DLM_NORMAL;
+		else if (status == DLM_FORWARD) {
+			mlog(0, "master was in-progress.  retry\n");
+			ret = DLM_FORWARD;
+		} else
+			ret = status;
+		lksb->status = status;
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			/* NOTE: this seems strange, but it is what we want.
+			 * when the master goes down during a cancel or
+			 * unlock, the recovery code completes the operation
+			 * as if the master had not died, then passes the
+			 * updated state to the recovery master.  this thread
+			 * just needs to finish out the operation and call
+			 * the unlockast. */
+			ret = DLM_NORMAL;
+		} else {
+			/* something bad.  this will BUG in ocfs2 */
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+		lksb->status = ret;
+	}
+
+	return ret;
+}
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
+ *          return value from dlmunlock_master
+ */
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	int found = 0, i;
+	struct dlm_lockstatus *lksb = NULL;
+	int ignore;
+	u32 flags;
+	struct list_head *queue;
+
+	flags = be32_to_cpu(unlock->flags);
+
+	if (flags & LKM_GET_LVB) {
+		mlog(ML_ERROR, "bad args!  GET_LVB specified on unlock!\n");
+		return DLM_BADARGS;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
+		mlog(ML_ERROR, "bad args!  cannot modify lvb on a CANCEL "
+		     "request!\n");
+		return DLM_BADARGS;
+	}
+
+	if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
+		return DLM_IVBUFLEN;
+	}
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
+
+	res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
+	if (!res) {
+		/* We assume here that a missing lock resource simply
+		 * means it was migrated away and destroyed before the
+		 * other node could detect it. */
+		mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
+		status = DLM_FORWARD;
+		goto not_found;
+	}
+
+	queue = &res->granted;
+	found = 0;
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_RECOVERING\n");
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_MIGRATING\n");
+		status = DLM_MIGRATING;
+		goto leave;
+	}
+
+	if (res->owner != dlm->node_num) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_FORWARD -- not master\n");
+		status = DLM_FORWARD;
+		goto leave;
+	}
+
+	for (i = 0; i < 3; i++) {
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.cookie == unlock->cookie &&
+			    lock->ml.node == unlock->node_idx) {
+				dlm_lock_get(lock);
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			break;
+		/* scan granted -> converting -> blocked queues */
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	if (!found) {
+		status = DLM_IVLOCKID;
+		goto not_found;
+	}
+
+	/* lock was found on queue */
+	lksb = lock->lksb;
+	/* unlockast only called on originating node */
+	if (flags & LKM_PUT_LVB) {
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
+	}
+
+	/* if this is in-progress, propagate the DLM_FORWARD
+	 * all the way back out */
+	status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
+	if (status == DLM_FORWARD)
+		mlog(0, "lockres is in progress\n");
+
+	if (flags & LKM_PUT_LVB)
+		lksb->flags &= ~DLM_LKSB_PUT_LVB;
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_kick_thread(dlm, res);
+
+not_found:
+	if (!found)
+		mlog(ML_ERROR, "failed to find lock to unlock! "
+			       "cookie=%"MLFu64"\n",
+		     unlock->cookie);
+	else {
+		/* send the lksb->status back to the other node */
+		status = lksb->status;
+		dlm_lock_put(lock);
+	}
+
+leave:
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	if (dlm_lock_on_list(&res->blocked, lock)) {
+		/* cancel this outright */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	} else if (dlm_lock_on_list(&res->converting, lock)) {
+		/* cancel the request, put back on granted */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK |
+			    DLM_UNLOCK_REGRANT_LOCK |
+			    DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+	} else if (dlm_lock_on_list(&res->granted, lock)) {
+		/* too late, already granted.  DLM_CANCELGRANT */
+		lksb->status = DLM_CANCELGRANT;
+		status = DLM_NORMAL;
+		*actions = DLM_UNLOCK_CALL_AST;
+	} else {
+		mlog(ML_ERROR, "lock to cancel is not on any list!\n");
+		lksb->status = DLM_IVLOCKID;
+		status = DLM_IVLOCKID;
+		*actions = 0;
+	}
+	return status;
+}
+
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	/* unlock request */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		lksb->status = DLM_DENIED;
+		status = DLM_DENIED;
+		dlm_error(status);
+		*actions = 0;
+	} else {
+		/* unlock granted lock */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_FREE_LOCK |
+			    DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	}
+	return status;
+}
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
+			  int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res;
+	struct dlm_lock *lock = NULL;
+	int call_ast, is_master;
+
+	mlog_entry_void();
+
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	lock = lksb->lockid;
+	BUG_ON(!lock);
+	dlm_lock_get(lock);
+
+	res = lock->lockres;
+	BUG_ON(!res);
+	dlm_lockres_get(res);
+retry:
+	call_ast = 0;
+	/* need to retry up here because owner may have changed */
+	mlog(0, "lock=%p res=%p\n", lock, res);
+
+	spin_lock(&res->spinlock);
+	is_master = (res->owner == dlm->node_num);
+	spin_unlock(&res->spinlock);
+
+	if (is_master) {
+		status = dlmunlock_master(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_master: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	} else {
+		status = dlmunlock_remote(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_remote: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	}
+
+	if (status == DLM_RECOVERING ||
+	    status == DLM_MIGRATING ||
+	    status == DLM_FORWARD) {
+		/* We want to go away for a tiny bit to allow recovery
+		 * / migration to complete on this resource. I don't
+		 * know of any wait queue we could sleep on as this
+		 * may be happening on another node. Perhaps the
+		 * proper solution is to queue up requests on the
+		 * other end? */
+
+		/* do we want to yield(); ?? */
+		msleep(50);
+
+		mlog(0, "retrying unlock due to pending recovery/"
+		     "migration/in-progress\n");
+		goto retry;
+	}
+
+	if (call_ast) {
+		mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
+		if (is_master) {
+			/* it is possible that there is one last bast
+			 * pending.  make sure it is flushed, then
+			 * call the unlockast.
+			 * not an issue if this lock is mastered
+			 * remotely, since it has been removed from the
+			 * lockres queues and cannot be found. */
+			dlm_kick_thread(dlm, NULL);
+			wait_event(dlm->ast_wq, 
+				   dlm_lock_basts_flushed(dlm, lock));
+		}
+		(*unlockast)(data, lksb->status);
+	}
+
+	if (status == DLM_NORMAL) {
+		mlog(0, "kicking the thread\n");
+		dlm_kick_thread(dlm, res);
+	} else
+		dlm_error(status);
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_lockres_put(res);
+	dlm_lock_put(lock);
+
+	mlog(0, "returning status=%d!\n", status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmunlock);
+

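[Editor's note -- not part of the patch.]  Per the opendlm spec
comment near the top of dlmunlock.c, a lock in the middle of a
convert cannot be unlocked directly: the convert must be cancelled
first.  A minimal sketch, with the unlockast callback name
hypothetical and error handling elided:

	/* 1) cancel the pending convert; DLM_CANCELGRANT in
	 * lksb->status means the convert won before the cancel
	 * arrived -- either way the lock ends up granted */
	ret = dlmunlock(dlm, lksb, LKM_CANCEL, my_unlockast, data);

	/* 2) a plain unlock of the granted lock is now legal */
	ret = dlmunlock(dlm, lksb, 0, my_unlockast, data);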
+ 42 - 0
fs/ocfs2/dlm/dlmver.c

@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
+
+void dlm_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);

+ 31 - 0
fs/ocfs2/dlm/dlmver.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLM_VER_H
+#define DLM_VER_H
+
+void dlm_print_version(void);
+
+#endif /* DLM_VER_H */

+ 658 - 0
fs/ocfs2/dlm/userdlm.c

@@ -0,0 +1,658 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM.
+ *
+ * Many of the functions here are pared down versions of dlmglue.c
+ * functions.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <asm/signal.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static inline int user_check_wait_flag(struct user_lock_res *lockres,
+				       int flag)
+{
+	int ret;
+
+	spin_lock(&lockres->l_lock);
+	ret = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return ret;
+}
+
+static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
+}
+
+static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
+}
+
+/* I heart container_of... */
+static inline struct dlm_ctxt *
+dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return ip->ip_dlm;
+}
+
+static struct inode *
+user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return &ip->ip_vfs_inode;
+}
+
+static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+}
+
+#define user_log_dlm_error(_func, _stat, _lockres) do {		\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int user_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
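+
+/* (editor's note)  The mapping above reads: if another node is
+ * blocked wanting EX we must drop to NL; if it wants PR we may keep
+ * PR; nothing else forces a downconvert, so EX stays allowed. */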
+
+static void user_ast(void *opaque)
+{
+	struct user_lock_res *lockres = opaque;
+	struct dlm_lockstatus *lksb;
+
+	mlog(0, "AST fired for lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
+		     lksb->status, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	/* we're downconverting. */
+	if (lockres->l_requested < lockres->l_level) {
+		if (lockres->l_requested <=
+		    user_highest_compat_lock_level(lockres->l_blocking)) {
+			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_flags &= ~USER_LOCK_BLOCKED;
+		}
+	}
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_flags |= USER_LOCK_ATTACHED;
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	if (!igrab(inode))
+		BUG();
+}
+
+static void user_dlm_unblock_lock(void *opaque);
+
+static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
+{
+	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
+		user_dlm_grab_inode_ref(lockres);
+
+		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
+			  lockres);
+
+		queue_work(user_dlm_worker, &lockres->l_work);
+		lockres->l_flags |= USER_LOCK_QUEUED;
+	}
+}
+
+static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
+{
+	int queue = 0;
+
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
+		return;
+
+	switch (lockres->l_blocking) {
+	case LKM_EXMODE:
+		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+			queue = 1;
+		break;
+	case LKM_PRMODE:
+		if (!lockres->l_ex_holders)
+			queue = 1;
+		break;
+	default:
+		BUG();
+	}
+
+	if (queue)
+		__user_dlm_queue_lockres(lockres);
+}
+
+static void user_bast(void *opaque, int level)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
+		lockres->l_name, level);
+
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= USER_LOCK_BLOCKED;
+	if (level > lockres->l_blocking)
+		lockres->l_blocking = level;
+
+	__user_dlm_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static void user_unlock_ast(void *opaque, enum dlm_status status)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
+
+	if (status != DLM_NORMAL)
+		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
+		lockres->l_level = LKM_IVMODE;
+	else {
+		lockres->l_requested = LKM_IVMODE; /* cancel an
+						    * upconvert
+						    * request. */
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		/* we want the unblock thread to look at it again
+		 * now. */
+		__user_dlm_queue_lockres(lockres);
+	}
+
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	iput(inode);
+}
+
+static void user_dlm_unblock_lock(void *opaque)
+{
+	int new_level, status;
+	struct user_lock_res *lockres = (struct user_lock_res *) opaque;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "processing lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+	BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
+
+	/* notice that we don't clear USER_LOCK_BLOCKED here. That's
+	 * for user_ast to do. */
+	lockres->l_flags &= ~USER_LOCK_QUEUED;
+
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(0, "lock is in teardown so we do nothing\n");
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_BUSY) {
+		mlog(0, "BUSY flag detected...\n");
+		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			spin_unlock(&lockres->l_lock);
+			goto drop_ref;
+		}
+
+		lockres->l_flags |= USER_LOCK_IN_CANCEL;
+		spin_unlock(&lockres->l_lock);
+
+		status = dlmunlock(dlm,
+				   &lockres->l_lksb,
+				   LKM_CANCEL,
+				   user_unlock_ast,
+				   lockres);
+		if (status == DLM_CANCELGRANT) {
+			/* If we got this, then the ast was fired
+			 * before we could cancel. We cleanup our
+			 * state, and restart the function. */
+			spin_lock(&lockres->l_lock);
+			lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+			spin_unlock(&lockres->l_lock);
+		} else if (status != DLM_NORMAL)
+			user_log_dlm_error("dlmunlock", status, lockres);
+		goto drop_ref;
+	}
+
+	/* If there are still incompatible holders, we can exit safely
+	 * without worrying about re-queueing this lock as that will
+	 * happen on the last call to user_dlm_cluster_unlock. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
+			lockres->l_ro_holders, lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	if ((lockres->l_blocking == LKM_PRMODE)
+	    && lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for pr: ex = %u\n",
+			lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	/* yay, we can downconvert now. */
+	new_level = user_highest_compat_lock_level(lockres->l_blocking);
+	lockres->l_requested = new_level;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	mlog(0, "Downconvert lock from %d to %d\n",
+		lockres->l_level, new_level);
+	spin_unlock(&lockres->l_lock);
+
+	/* need lock downconvert request now... */
+	status = dlmlock(dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 LKM_CONVERT|LKM_VALBLK,
+			 lockres->l_name,
+			 user_ast,
+			 lockres,
+			 user_bast);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmlock", status, lockres);
+		user_recover_from_dlm_error(lockres);
+	}
+
+drop_ref:
+	user_dlm_drop_inode_ref(lockres);
+}
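+
+/* (editor's note)  Summarizing the worker above: if a convert is in
+ * flight, try to LKM_CANCEL it; if incompatible holders remain, bail
+ * and let the last user_dlm_cluster_unlock() requeue the work;
+ * otherwise downconvert to the highest level compatible with the
+ * blocking request. */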
+
+static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch (level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int
+user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
+				  int wanted)
+{
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+
+	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
+}
+
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags)
+{
+	int status, local_flags;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n",
+		     lockres->l_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
+		lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
+		lkm_flags);
+
+again:
+	if (signal_pending(current)) {
+		status = -ERESTARTSYS;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if ((lockres->l_flags & USER_LOCK_BUSY) &&
+	    (level > lockres->l_level)) {
+		/* is someone sitting in dlmlock()? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
+	    (!user_may_continue_on_blocked_lock(lockres, level))) {
+		/* the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		local_flags = lkm_flags | LKM_VALBLK;
+		if (lockres->l_level != LKM_IVMODE)
+			local_flags |= LKM_CONVERT;
+
+		lockres->l_requested = level;
+		lockres->l_flags |= USER_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, get lock from %d to level = %d\n",
+			lockres->l_name, lockres->l_level, level);
+
+		/* call dlmlock to upgrade the lock now */
+		status = dlmlock(dlm,
+				 level,
+				 &lockres->l_lksb,
+				 local_flags,
+				 lockres->l_name,
+				 user_ast,
+				 lockres,
+				 user_bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				status = -EAGAIN;
+			else {
+				user_log_dlm_error("dlmlock", status, lockres);
+				status = -EINVAL;
+			}
+			user_recover_from_dlm_error(lockres);
+			goto bail;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+			lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	user_dlm_inc_holders(lockres, level);
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	status = 0;
+bail:
+	return status;
+}
+
+static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level)
+{
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
+		return;
+	}
+
+	mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	spin_lock(&lockres->l_lock);
+	user_dlm_dec_holders(lockres, level);
+	__user_dlm_cond_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_EXMODE);
+	memcpy(lvb, val, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_PRMODE);
+	memcpy(val, lvb, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry)
+{
+	memset(lockres, 0, sizeof(*lockres));
+
+	spin_lock_init(&lockres->l_lock);
+	init_waitqueue_head(&lockres->l_event);
+	lockres->l_level = LKM_IVMODE;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_blocking = LKM_IVMODE;
+
+	/* should have been checked before getting here. */
+	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
+
+	memcpy(lockres->l_name,
+	       dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
+int user_dlm_destroy_lock(struct user_lock_res *lockres)
+{
+	int status = -EBUSY;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "asked to destroy %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+	while (lockres->l_flags & USER_LOCK_BUSY) {
+		spin_unlock(&lockres->l_lock);
+
+		mlog(0, "lock %s is busy\n", lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+
+		spin_lock(&lockres->l_lock);
+	}
+
+	if (lockres->l_ro_holders || lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s has holders\n", lockres->l_name);
+		goto bail;
+	}
+
+	status = 0;
+	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s is not attached\n", lockres->l_name);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~USER_LOCK_ATTACHED;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "unlocking lockres %s\n", lockres->l_name);
+	status = dlmunlock(dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   user_unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmunlock", status, lockres);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	user_wait_on_busy_lock(lockres);
+
+	status = 0;
+bail:
+	return status;
+}
+
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
+{
+	struct dlm_ctxt *dlm;
+	u32 dlm_key;
+	char *domain;
+
+	domain = kmalloc(name->len + 1, GFP_KERNEL);
+	if (!domain) {
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dlm_key = crc32_le(0, name->name, name->len);
+
+	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+
+	dlm = dlm_register_domain(domain, dlm_key);
+	if (IS_ERR(dlm))
+		mlog_errno(PTR_ERR(dlm));
+
+	kfree(domain);
+	return dlm;
+}
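
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * how a dlmfs directory name becomes a dlm domain key.  This is a
 * plain-C approximation of the kernel's crc32_le(0, buf, len) --
 * LSB-first CRC-32, polynomial 0xEDB88320, seed used as-is, no final
 * inversion.  The domain name below is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_le_sketch(uint32_t crc, const unsigned char *p,
				size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u
					: (crc >> 1);
	}
	return crc;
}

int main(void)
{
	const char *domain = "mydomain";	/* hypothetical domain name */

	printf("dlm_key for \"%s\" = 0x%08x\n", domain,
	       crc32_le_sketch(0, (const unsigned char *)domain,
			       strlen(domain)));
	return 0;
}
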
+
+void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain(dlm);
+}

+ 111 - 0
fs/ocfs2/dlm/userdlm.h

@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.h
+ *
+ * Userspace dlm defines
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef USERDLM_H
+#define USERDLM_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* user_lock_res->l_flags flags. */
+#define USER_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					      * downconvert */
+#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
+					      * destroying this
+					      * lock. */
+#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
+					      * workqueue */
+#define USER_LOCK_IN_CANCEL     (0x00000020)
+
+struct user_lock_res {
+	spinlock_t               l_lock;
+
+	int                      l_flags;
+
+#define USER_DLM_LOCK_ID_MAX_LEN  32
+	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct work_struct       l_work;
+};
+
+extern struct workqueue_struct *user_dlm_worker;
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry);
+int user_dlm_destroy_lock(struct user_lock_res *lockres);
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags);
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level);
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len);
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len);
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
+void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+
+struct dlmfs_inode_private {
+	struct dlm_ctxt             *ip_dlm;
+
+	struct user_lock_res ip_lockres; /* unused for directories. */
+	struct inode         *ip_parent;
+
+	struct inode         ip_vfs_inode;
+};
+
+static inline struct dlmfs_inode_private *
+DLMFS_I(struct inode *inode)
+{
+	return container_of(inode,
+			    struct dlmfs_inode_private,
+			    ip_vfs_inode);
+}
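
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the container_of() pattern DLMFS_I() relies on -- recover a pointer
 * to the enclosing private structure from a pointer to one of its
 * embedded members.  Standalone version with stand-in types.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of_sketch(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_inode { unsigned long i_ino; };

struct fake_private {
	int               ip_data;
	struct fake_inode ip_vfs_inode;	/* embedded, like ip_vfs_inode */
};

int main(void)
{
	struct fake_private priv = { .ip_data = 42 };
	struct fake_inode *inode = &priv.ip_vfs_inode;

	/* Walk back from the embedded member to its container. */
	struct fake_private *p =
		container_of_sketch(inode, struct fake_private, ip_vfs_inode);

	printf("ip_data = %d\n", p->ip_data);	/* prints 42 */
	return 0;
}
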
+
+struct dlmfs_filp_private {
+	int                  fp_lock_level;
+};
+
+#define DLMFS_MAGIC	0x76a9f425
+
+#endif /* USERDLM_H */
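
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * how userspace is expected to drive these locks through dlmfs, per
 * the conventions in this merge's Documentation/filesystems/dlmfs.txt:
 * mkdir creates a domain, O_RDONLY takes a PR lock, O_RDWR an EX lock
 * (add O_NONBLOCK for a trylock), read()/write() move the lock value
 * block, and close() drops the lock.  The mount point, names and the
 * 64-byte LVB size are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char lvb[64];

	mkdir("/dlm/mydomain", 0755);		/* register a domain */

	/* O_RDWR maps to LKM_EXMODE in userdlm terms. */
	int fd = open("/dlm/mydomain/mylock", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* EX holders may update the LVB; later readers will see it. */
	snprintf(lvb, sizeof(lvb), "held by pid %d", (int)getpid());
	if (write(fd, lvb, strlen(lvb) + 1) < 0)
		perror("write");

	close(fd);				/* drops the cluster lock */
	return 0;
}
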

+ 2904 - 0
fs/ocfs2/dlmglue.c

@@ -0,0 +1,2904 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/crc32.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_DLM_GLUE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "slot_map.h"
+#include "super.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_mask_waiter {
+	struct list_head	mw_item;
+	int			mw_status;
+	struct completion	mw_complete;
+	unsigned long		mw_mask;
+	unsigned long		mw_goal;
+};
+
+static void ocfs2_inode_ast_func(void *opaque);
+static void ocfs2_inode_bast_func(void *opaque,
+				  int level);
+static void ocfs2_super_ast_func(void *opaque);
+static void ocfs2_super_bast_func(void *opaque,
+				  int level);
+static void ocfs2_rename_ast_func(void *opaque);
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level);
+
+/* so far, all locks have gotten along with the same unlock ast */
+static void ocfs2_unlock_ast_func(void *opaque,
+				  enum dlm_status status);
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue);
+static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue);
+typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker);
+
+struct ocfs2_lock_res_ops {
+	void (*ast)(void *);
+	void (*bast)(void *, int);
+	void (*unlock_ast)(void *, enum dlm_status);
+	int  (*unblock)(struct ocfs2_lock_res *, int *);
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_inode_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_meta,
+};
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking);
+
+static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_data,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+	.ast		= ocfs2_super_ast_func,
+	.bast		= ocfs2_super_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+	.ast		= ocfs2_rename_ast_func,
+	.bast		= ocfs2_rename_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
+		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
+		lockres->l_type == OCFS2_LOCK_TYPE_RW;
+}
+
+static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
+}
+
+static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
+}
+
+static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_super_lock(lockres)
+	       && !ocfs2_is_rename_lock(lockres));
+
+	return (struct ocfs2_super *) lockres->l_priv;
+}
+
+static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	return (struct inode *) lockres->l_priv;
+}
+
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags);
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted);
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level);
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert);
+#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres);
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh);
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level);
+
+static char *ocfs2_lock_type_strings[] = {
+	[OCFS2_LOCK_TYPE_META] = "Meta",
+	[OCFS2_LOCK_TYPE_DATA] = "Data",
+	[OCFS2_LOCK_TYPE_SUPER] = "Super",
+	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
+	/* Need to differentiate from [R]ename... serializing writes is the
+	 * important job it does, anyway. */
+	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
+};
+
+static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
+{
+	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+	return ocfs2_lock_type_strings[type];
+}
+
+static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				  u64 blkno,
+				  u32 generation,
+				  char *name)
+{
+	int len;
+
+	mlog_entry_void();
+
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
+
+	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
+		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
+		       generation);
+
+	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+	mlog(0, "built lock resource with name: %s\n", name);
+
+	mlog_exit_void();
+}
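
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the fixed-width lock name assembled above -- one lock-type character,
 * a pad, a 16-digit hex block number and an 8-digit hex generation.
 * The 32-byte length, "000000" pad and 'M' meta type character mirror
 * what the ocfs2 headers define elsewhere in this merge; treat them as
 * assumptions here.
 */
#include <inttypes.h>
#include <stdio.h>

#define LOCK_ID_MAX_LEN 32		/* assumed OCFS2_LOCK_ID_MAX_LEN */
#define LOCK_ID_PAD     "000000"	/* assumed OCFS2_LOCK_ID_PAD */

int main(void)
{
	char name[LOCK_ID_MAX_LEN];
	uint64_t blkno = 0x1234;
	uint32_t generation = 7;

	int len = snprintf(name, LOCK_ID_MAX_LEN, "%c%s%016" PRIx64 "%08x",
			   'M', LOCK_ID_PAD, blkno, (unsigned)generation);

	/* Mirrors the BUG_ON above: every name is exactly 31 characters. */
	printf("%s (len %d)\n", name, len);
	return 0;
}
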
+
+static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+
+static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
+				       struct ocfs2_dlm_debug *dlm_debug)
+{
+	mlog(0, "Add tracking for lockres %s\n", res->l_name);
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
+{
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	if (!list_empty(&res->l_debug_list))
+		list_del_init(&res->l_debug_list);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       u64 blkno,
+				       u32 generation,
+				       struct ocfs2_lock_res_ops *ops,
+				       void *priv)
+{
+	ocfs2_build_lock_name(type, blkno, generation, res->l_name);
+
+	res->l_type          = type;
+	res->l_ops           = ops;
+	res->l_priv          = priv;
+
+	res->l_level         = LKM_IVMODE;
+	res->l_requested     = LKM_IVMODE;
+	res->l_blocking      = LKM_IVMODE;
+	res->l_action        = OCFS2_AST_INVALID;
+	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	res->l_flags         = OCFS2_LOCK_INITIALIZED;
+
+	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+}
+
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
+{
+	/* This also clears out the lock status block */
+	memset(res, 0, sizeof(struct ocfs2_lock_res));
+	spin_lock_init(&res->l_lock);
+	init_waitqueue_head(&res->l_event);
+	INIT_LIST_HEAD(&res->l_blocked_list);
+	INIT_LIST_HEAD(&res->l_mask_waiters);
+}
+
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode)
+{
+	struct ocfs2_lock_res_ops *ops;
+
+	switch(type) {
+		case OCFS2_LOCK_TYPE_RW:
+			ops = &ocfs2_inode_rw_lops;
+			break;
+		case OCFS2_LOCK_TYPE_META:
+			ops = &ocfs2_inode_meta_lops;
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			ops = &ocfs2_inode_data_lops;
+			break;
+		default:
+			mlog_bug_on_msg(1, "type: %d\n", type);
+			ops = NULL; /* thanks, gcc */
+			break;
+	}
+
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
+				   OCFS2_I(inode)->ip_blkno,
+				   inode->i_generation, ops, inode);
+}
+
+static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
+				      struct ocfs2_super *osb)
+{
+	/* Superblock lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
+				   OCFS2_SUPER_BLOCK_BLKNO, 0,
+				   &ocfs2_super_lops, osb);
+}
+
+static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
+				       struct ocfs2_super *osb)
+{
+	/* Rename lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
+				   &ocfs2_rename_lops, osb);
+}
+
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
+	mlog_entry_void();
+
+	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
+		return;
+
+	ocfs2_remove_lockres_tracking(res);
+
+	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
+			"Lockres %s is on the blocked list\n",
+			res->l_name);
+	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
+			"Lockres %s has mask waiters pending\n",
+			res->l_name);
+	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
+			"Lockres %s is locked\n",
+			res->l_name);
+	mlog_bug_on_msg(res->l_ro_holders,
+			"Lockres %s has %u ro holders\n",
+			res->l_name, res->l_ro_holders);
+	mlog_bug_on_msg(res->l_ex_holders,
+			"Lockres %s has %u ex holders\n",
+			res->l_name, res->l_ex_holders);
+
+	/* Need to clear out the lock status block for the dlm */
+	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
+
+	res->l_flags = 0UL;
+	mlog_exit_void();
+}
+
+static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+	mlog_exit_void();
+}
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
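
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the downconvert lattice ocfs2_highest_compat_lock_level() encodes.
 * A holder may keep going on a blocked lock iff its wanted level is
 * compatible with the level we will drop to for the blocked node --
 * the "wanted <= highest_compat(blocking)" test used by both the
 * userdlm and dlmglue paths.  Mode values are the conventional LKM_*
 * numbers, assumed here.
 */
#include <stdio.h>

enum { NLMODE = 0, PRMODE = 3, EXMODE = 5 };	/* assumed LKM_* values */

static int highest_compat(int level)
{
	if (level == EXMODE)
		return NLMODE;	/* another node wants EX: we must drop to NL */
	if (level == PRMODE)
		return PRMODE;	/* another node wants PR: we may keep PR */
	return EXMODE;
}

int main(void)
{
	/* want PR while blocked on a PR request: compatible */
	printf("%d\n", PRMODE <= highest_compat(PRMODE));	/* 1 */
	/* want EX while blocked on a PR request: must wait */
	printf("%d\n", EXMODE <= highest_compat(PRMODE));	/* 0 */
	/* anything above NL while blocked on an EX request: must wait */
	printf("%d\n", PRMODE <= highest_compat(EXMODE));	/* 0 */
	return 0;
}
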
+
+static void lockres_set_flags(struct ocfs2_lock_res *lockres,
+			      unsigned long newflags)
+{
+	struct list_head *pos, *tmp;
+	struct ocfs2_mask_waiter *mw;
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres->l_flags = newflags;
+
+	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
+		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			continue;
+
+		list_del_init(&mw->mw_item);
+		mw->mw_status = 0;
+		complete(&mw->mw_complete);
+	}
+}
+static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
+{
+	lockres_set_flags(lockres, lockres->l_flags | or);
+}
+static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
+				unsigned long clear)
+{
+	lockres_set_flags(lockres, lockres->l_flags & ~clear);
+}
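
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the mask/goal test lockres_set_flags() applies to each queued
 * ocfs2_mask_waiter.  A waiter completes once (flags & mask) == goal;
 * "wake me when BUSY clears" is mask = BUSY, goal = 0.  Flag values
 * are stand-ins.
 */
#include <stdio.h>

#define F_BUSY    0x02UL	/* stand-in for OCFS2_LOCK_BUSY */
#define F_BLOCKED 0x04UL	/* stand-in for OCFS2_LOCK_BLOCKED */

static int mw_satisfied(unsigned long flags, unsigned long mask,
			unsigned long goal)
{
	return (flags & mask) == goal;
}

int main(void)
{
	unsigned long flags = F_BUSY | F_BLOCKED;

	printf("%d\n", mw_satisfied(flags, F_BUSY, 0));	/* 0: still busy */
	flags &= ~F_BUSY;				/* the AST completed */
	printf("%d\n", mw_satisfied(flags, F_BUSY, 0));	/* 1: waiter fires */
	return 0;
}
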
+
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	lockres->l_level = lockres->l_requested;
+	if (lockres->l_level <=
+	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
+		lockres->l_blocking = LKM_NLMODE;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+	}
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+	/* A convert from RO to EX doesn't really need anything as our
+	 * information is already up to date. A convert from NL to
+	 * *anything*, however, should mark us as needing an
+	 * update. */
+	if (lockres->l_level == LKM_NLMODE)
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+	if (lockres->l_requested > LKM_NLMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_inode_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct dlm_lockstatus *lksb;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
+	     OCFS2_I(inode)->ip_blkno, lockres->l_action,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
+		     "on inode %"MLFu64"\n", lksb->status,
+		     OCFS2_I(inode)->ip_blkno);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		mlog_exit_void();
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
+		     "lockres flags = 0x%lx, unlock action: %u\n",
+		     lockres->l_name, lockres->l_action, lockres->l_flags,
+		     lockres->l_unlock_action);
+
+		BUG();
+	}
+
+	/* data and rw locking ignores refresh flag for now. */
+	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int needs_downconvert = 0;
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+
+	if (level > lockres->l_blocking) {
+		/* only schedule a downconvert if we haven't already scheduled
+		 * one that goes low enough to satisfy the level we're
+		 * blocking.  this also catches the case where we get
+		 * duplicate BASTs */
+		if (ocfs2_highest_compat_lock_level(level) <
+		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
+			needs_downconvert = 1;
+
+		lockres->l_blocking = level;
+	}
+
+	mlog_exit(needs_downconvert);
+	return needs_downconvert;
+}
+
+static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
+				    struct ocfs2_lock_res *lockres,
+				    int level)
+{
+	int needs_downconvert;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	BUG_ON(level <= LKM_NLMODE);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
+	if (needs_downconvert)
+		ocfs2_schedule_blocked_lock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ocfs2_kick_vote_thread(osb);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+static void ocfs2_inode_bast_func(void *opaque, int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
+	     "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
+	     lockres->l_level,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
+				   int ignore_refresh)
+{
+	struct dlm_lockstatus *lksb = &lockres->l_lksb;
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
+		     lockres->l_name, lksb->status);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		BUG();
+	}
+
+	if (ignore_refresh)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+}
+
+static void ocfs2_super_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Superblock AST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+	ocfs2_generic_ast_func(lockres, 0);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_super_bast_func(void *opaque,
+				  int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+	mlog(0, "Superblock BAST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename AST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	ocfs2_generic_ast_func(lockres, 1);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename BAST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert)
+{
+	unsigned long flags;
+
+	mlog_entry_void();
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	if (convert)
+		lockres->l_action = OCFS2_AST_INVALID;
+	else
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+/* Note: If we detect another process working on the lock (i.e.,
+ * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
+ * to do the right thing in that case.
+ */
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags)
+{
+	int ret = 0;
+	enum dlm_status status;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+	     dlm_flags);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
+	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	lockres->l_action = OCFS2_AST_ATTACH;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = dlmlock(osb->dlm,
+			 level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+	}
+
+	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
+					int flag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ret = lockres->l_flags & flag;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+}
+
+static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
+}
+
+static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted)
+{
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
+}
+
+static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
+{
+	INIT_LIST_HEAD(&mw->mw_item);
+	init_completion(&mw->mw_complete);
+}
+
+static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
+{
+	wait_for_completion(&mw->mw_complete);
+	/* Re-arm the completion in case we want to wait on it again */
+	INIT_COMPLETION(mw->mw_complete);
+	return mw->mw_status;
+}
+
+static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
+				    struct ocfs2_mask_waiter *mw,
+				    unsigned long mask,
+				    unsigned long goal)
+{
+	BUG_ON(!list_empty(&mw->mw_item));
+
+	assert_spin_locked(&lockres->l_lock);
+
+	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
+	mw->mw_mask = mask;
+	mw->mw_goal = goal;
+}
+
+/* returns 0 if the mw that was removed was already satisfied, -EBUSY
+ * if the mask still hadn't reached its goal */
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!list_empty(&mw->mw_item)) {
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			ret = -EBUSY;
+
+		list_del_init(&mw->mw_item);
+		init_completion(&mw->mw_complete);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+}
+
+static int ocfs2_cluster_lock(struct ocfs2_super *osb,
+			      struct ocfs2_lock_res *lockres,
+			      int level,
+			      int lkm_flags,
+			      int arg_flags)
+{
+	struct ocfs2_mask_waiter mw;
+	enum dlm_status status;
+	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
+	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	ocfs2_init_mask_waiter(&mw);
+
+again:
+	wait = 0;
+
+	if (catch_signals && signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
+			"Cluster lock called on freeing lockres %s! flags "
+			"0x%lx\n", lockres->l_name, lockres->l_flags);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+	    level > lockres->l_level) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		/* lock has not been created yet. */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		goto again;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+		/* the lock is currently blocked on behalf of
+		 * another node */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (level > lockres->l_level) {
+		if (lockres->l_action != OCFS2_AST_INVALID)
+			mlog(ML_ERROR, "lockres %s has action %u pending\n",
+			     lockres->l_name, lockres->l_action);
+
+		lockres->l_action = OCFS2_AST_CONVERT;
+		lockres->l_requested = level;
+		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, convert from %d to level = %d\n",
+		     lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(osb->dlm,
+				 level,
+				 &lockres->l_lksb,
+				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
+				 lockres->l_name,
+				 lockres->l_ops->ast,
+				 lockres,
+				 lockres->l_ops->bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				ret = -EAGAIN;
+			else {
+				ocfs2_log_dlm_error("dlmlock", status,
+						    lockres);
+				ret = -EINVAL;
+			}
+			ocfs2_recover_from_dlm_error(lockres, 1);
+			goto out;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+		     lockres->l_name);
+
+		/* At this point we've gone inside the dlm and need to
+		 * complete our work regardless. */
+		catch_signals = 0;
+
+		/* wait for busy to clear and carry on */
+		goto again;
+	}
+
+	/* Ok, if we get here then we're good to go. */
+	ocfs2_inc_holders(lockres, level);
+
+	ret = 0;
+unlock:
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+	/*
+	 * This helps work around a lock inversion between the page lock
+	 * and dlm locks.  One path holds the page lock while calling aops
+	 * which block acquiring dlm locks.  The voting thread holds dlm
+	 * locks while acquiring page locks while downconverting data locks.
+	 * This block helps an aop path notice the inversion and back
+	 * off to unlock its page lock before trying the dlm lock again.
+	 */
+	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
+	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
+		wait = 0;
+		if (lockres_remove_mask_waiter(lockres, &mw))
+			ret = -EAGAIN;
+		else
+			goto again;
+	}
+	if (wait) {
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret == 0)
+			goto again;
+		mlog_errno(ret);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level)
+{
+	unsigned long flags;
+
+	mlog_entry_void();
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ocfs2_dec_holders(lockres, level);
+	ocfs2_vote_on_unlock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	mlog_exit_void();
+}
+
+static int ocfs2_create_new_inode_lock(struct inode *inode,
+				       struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+}
+
+/* Grants us an EX lock on the data and metadata resources, skipping
+ * the normal cluster directory lookup. Use this ONLY on newly created
+ * inodes which other nodes can't possibly see, and which haven't been
+ * hashed in the inode hash yet. This can give us a good performance
+ * increase as it'll skip the network broadcast normally associated
+ * with creating a new lock resource. */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+	int ret;
+
+	BUG_ON(!inode);
+	BUG_ON(!ocfs2_inode_is_new(inode));
+
+	mlog_entry_void();
+
+	mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	/* NOTE: That we don't increment any of the holder counts, nor
+	 * do we add anything to a journal handle. Since this is
+	 * supposed to be a new inode which the cluster doesn't know
+	 * about yet, there is no need to.  As far as the LVB handling
+	 * is concerned, this is basically like acquiring an EX lock
+	 * on a resource which has an invalid one -- we'll set it
+	 * valid when we release the EX. */
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_rw_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_meta_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_data_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_rw_lock(struct inode *inode, int write)
+{
+	int status, level;
+	struct ocfs2_lock_res *lockres;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
+				    0);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_rw_unlock(struct inode *inode, int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags)
+{
+	int status = 0, level;
+	struct ocfs2_lock_res *lockres;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	/* We'll allow faking a readonly data lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
+		if (write) {
+			status = -EROFS;
+			mlog_errno(status);
+		}
+		goto out;
+	}
+
+	lockres = &OCFS2_I(inode)->ip_data_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
+				    0, arg_flags);
+	if (status < 0 && status != -EAGAIN)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* see ocfs2_meta_lock_with_page() */
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_data_lock(inode, write) == 0)
+			ocfs2_data_unlock(inode, write);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres)
+{
+	int kick = 0;
+
+	mlog_entry_void();
+
+	/* If we know that another node is waiting on our lock, kick
+	 * the vote thread pre-emptively when we reach a release
+	 * condition. */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+		switch(lockres->l_blocking) {
+		case LKM_EXMODE:
+			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+				kick = 1;
+			break;
+		case LKM_PRMODE:
+			if (!lockres->l_ex_holders)
+				kick = 1;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	if (kick)
+		ocfs2_kick_vote_thread(osb);
+
+	mlog_exit_void();
+}
+
+void ocfs2_data_unlock(struct inode *inode,
+		       int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+#define OCFS2_SEC_BITS   34
+#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
+#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
+
+/* LVB only has room for 64 bits of time here so we pack it for
+ * now. */
+static u64 ocfs2_pack_timespec(struct timespec *spec)
+{
+	u64 res;
+	u64 sec = spec->tv_sec;
+	u32 nsec = spec->tv_nsec;
+
+	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
+
+	return res;
+}
+
+/* Call this with the lockres locked. I am reasonably sure we don't
+ * need ip_lock in this function as anyone who would be changing those
+ * values is supposed to be blocked in ocfs2_meta_lock right now. */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
+	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
+	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
+	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
+	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
+	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
+	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
+	lvb->lvb_iatime_packed  =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
+	lvb->lvb_ictime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
+	lvb->lvb_imtime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+
+	mlog_meta_lvb(0, lockres);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unpack_timespec(struct timespec *spec,
+				  u64 packed_time)
+{
+	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
+	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
+}
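
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * round-tripping the LVB timestamp packing above -- 34 bits of seconds
 * shifted over 30 bits of nanoseconds in one u64.  Standalone.
 */
#include <stdint.h>
#include <stdio.h>

#define SEC_SHIFT  (64 - 34)			/* 30 */
#define NSEC_MASK  ((1ULL << SEC_SHIFT) - 1)

int main(void)
{
	uint64_t sec = 1134000000, nsec = 999999999;	/* sample mtime */

	uint64_t packed = (sec << SEC_SHIFT) | (nsec & NSEC_MASK);

	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)(packed >> SEC_SHIFT),
	       (unsigned long long)(packed & NSEC_MASK));
	return 0;
}
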
+
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	mlog_meta_lvb(0, lockres);
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	/* We're safe here without the lockres lock... */
+	spin_lock(&oi->ip_lock);
+	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
+	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
+
+	/* fast-symlinks are a special case */
+	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+
+	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
+	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
+	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
+	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
+	ocfs2_unpack_timespec(&inode->i_atime,
+			      be64_to_cpu(lvb->lvb_iatime_packed));
+	ocfs2_unpack_timespec(&inode->i_mtime,
+			      be64_to_cpu(lvb->lvb_imtime_packed));
+	ocfs2_unpack_timespec(&inode->i_ctime,
+			      be64_to_cpu(lvb->lvb_ictime_packed));
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit_void();
+}
+
+static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
+		return 1;
+	return 0;
+}
+
+/* Determine whether a lock resource needs to be refreshed, and
+ * arbitrate who gets to refresh it.
+ *
+ *   0 means no refresh needed.
+ *
+ *   > 0 means you need to refresh this and you MUST call
+ *   ocfs2_complete_lock_res_refresh afterwards. */
+static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
+{
+	unsigned long flags;
+	int status = 0;
+
+	mlog_entry_void();
+
+refresh_check:
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ocfs2_wait_on_refreshing_lock(lockres);
+		goto refresh_check;
+	}
+
+	/* Ok, I'll be the one to refresh this lock. */
+	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = 1;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* If status is non-zero, I'll mark it as not being in refresh
+ * anymore, but I won't clear the needs refresh flag. */
+static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
+						   int status)
+{
+	unsigned long flags;
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
+	if (!status)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
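
/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * the refresh arbitration implemented by the two functions above --
 * the first caller to see NEEDS_REFRESH claims REFRESHING and does the
 * work; everyone else waits for completion.  Standalone pthread
 * version with stand-in flags; build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

#define NEEDS_REFRESH 0x1U
#define REFRESHING    0x2U

static unsigned int flags = NEEDS_REFRESH;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;

/* Returns 1 if the caller must refresh and then call complete_refresh(). */
static int should_refresh(void)
{
	int mine = 0;

	pthread_mutex_lock(&lock);
	while (flags & REFRESHING)	/* someone else is refreshing */
		pthread_cond_wait(&done, &lock);
	if (flags & NEEDS_REFRESH) {
		flags |= REFRESHING;	/* we claim the refresh */
		mine = 1;
	}
	pthread_mutex_unlock(&lock);
	return mine;
}

static void complete_refresh(int status)
{
	pthread_mutex_lock(&lock);
	flags &= ~REFRESHING;
	if (!status)			/* success clears the need */
		flags &= ~NEEDS_REFRESH;
	pthread_cond_broadcast(&done);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	if (should_refresh()) {
		puts("refreshing");	/* e.g. re-read the inode from disk */
		complete_refresh(0);
	}
	printf("still needs refresh? %d\n", !!(flags & NEEDS_REFRESH)); /* 0 */
	return 0;
}
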
+
+/* may or may not return a bh if it went to disk. */
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
+		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
+		     "were waiting on a lock. ip_flags = 0x%x\n",
+		     oi->ip_blkno, oi->ip_flags);
+		spin_unlock(&oi->ip_lock);
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	lockres = &oi->ip_meta_lockres;
+
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+
+	/* This will discard any caching information we might have had
+	 * for the inode metadata. */
+	ocfs2_metadata_cache_purge(inode);
+
+	/* will do nothing for inode types that don't use the extent
+	 * map (directories, bitmap files, etc) */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	if (ocfs2_meta_lvb_is_trustable(lockres)) {
+		mlog(0, "Trusting LVB on inode %"MLFu64"\n",
+		     oi->ip_blkno);
+		ocfs2_refresh_inode_from_lvb(inode);
+	} else {
+		/* Boo, we have to go to disk. */
+		/* read bh, cast, ocfs2_refresh_inode */
+		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
+					  bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_refresh;
+		}
+		fe = (struct ocfs2_dinode *) (*bh)->b_data;
+
+		/* This is a good chance to make sure we're not
+		 * locking an invalid object.
+		 *
+		 * We bug on a stale inode here because we checked
+		 * above whether it was wiped from disk. The wiping
+		 * node provides a guarantee that we receive that
+		 * message and can mark the inode before dropping any
+		 * locks associated with it. */
+		if (!OCFS2_IS_VALID_DINODE(fe)) {
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+			status = -EIO;
+			goto bail_refresh;
+		}
+		mlog_bug_on_msg(inode->i_generation !=
+				le32_to_cpu(fe->i_generation),
+				"Invalid dinode %"MLFu64" disk generation: %u "
+				"inode->i_generation: %u\n",
+				oi->ip_blkno, le32_to_cpu(fe->i_generation),
+				inode->i_generation);
+		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
+				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
+				"Stale dinode %"MLFu64" dtime: %"MLFu64" "
+				"flags: 0x%x\n", oi->ip_blkno,
+				le64_to_cpu(fe->i_dtime),
+				le32_to_cpu(fe->i_flags));
+
+		ocfs2_refresh_inode(inode, fe);
+	}
+
+	status = 0;
+bail_refresh:
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_assign_bh(struct inode *inode,
+			   struct buffer_head **ret_bh,
+			   struct buffer_head *passed_bh)
+{
+	int status;
+
+	if (passed_bh) {
+		/* Ok, the update went to disk for us, use the
+		 * returned bh. */
+		*ret_bh = passed_bh;
+		get_bh(*ret_bh);
+
+		return 0;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  ret_bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * returns < 0 error if the callback will never be called, otherwise
+ * the result of the lock will be communicated via the callback.
+ */
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags)
+{
+	int status, level, dlm_flags, acquired;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *local_bh = NULL;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64", take %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	status = 0;
+	acquired = 0;
+	/* We'll allow faking a readonly metadata lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	dlm_flags = 0;
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= LKM_NOQUEUE;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+	if (status < 0) {
+		if (status != -EAGAIN && status != -EIOCBRETRY)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* Notify the error cleanup path to drop the cluster lock. */
+	acquired = 1;
+
+	/* We wait twice because a node may have died while we were in
+	 * the lower dlm layers. The second time though, we've
+	 * committed to owning this lock so we don't allow signals to
+	 * abort the operation. */
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	/* This is fun. The caller may want a bh back, or it may
+	 * not. ocfs2_meta_lock_update definitely wants one in, but
+	 * may or may not read one, depending on what's in the
+	 * LVB. The result of all of this is that we've *only* gone to
+	 * disk if we have to, so the complexity is worthwhile. */
+	status = ocfs2_meta_lock_update(inode, &local_bh);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	if (ret_bh) {
+		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (handle) {
+		status = ocfs2_handle_add_lock(handle, inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+bail:
+	if (status < 0) {
+		if (ret_bh && (*ret_bh)) {
+			brelse(*ret_bh);
+			*ret_bh = NULL;
+		}
+		if (acquired)
+			ocfs2_meta_unlock(inode, ex);
+	}
+
+	if (local_bh)
+		brelse(local_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * This is working around a lock inversion between tasks acquiring DLM locks
+ * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * while acquiring page locks.
+ *
+ * ** These _with_page variants are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * waiting for the vote thread.  In that case we unlock our page so the vote
+ * thread can make progress.  Once we've done this we have to return
+ * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * into the VFS who will then immediately retry the aop call.
+ *
+ * We do a blocking lock and immediate unlock before returning, though, so that
+ * the lock has a great chance of being cached on this node by the time the VFS
+ * calls back to retry the aop.  This has the potential to livelock as nodes
+ * ping locks back and forth, but that's a risk we're willing to take for a
+ * simple way around the lock inversion.
+ */
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
+				   OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
+			ocfs2_meta_unlock(inode, ex);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex)
+{
+	int status;
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+	struct buffer_head *bh;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (status) {
+		bh = si->si_bh;
+		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
+					  si->si_inode);
+		if (status == 0)
+			ocfs2_update_slot_info(si);
+
+		ocfs2_complete_lock_res_refresh(lockres, status);
+
+		if (status < 0)
+			mlog_errno(status);
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+int ocfs2_rename_lock(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rename_unlock(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+}
+
+/* Reference counting of the dlm debug structure. We want this because
+ * open references on the debug inodes can live on after a mount, so
+ * we can't rely on the ocfs2_super to always exist. */
+static void ocfs2_dlm_debug_free(struct kref *kref)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
+
+	kfree(dlm_debug);
+}
+
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
+{
+	if (dlm_debug)
+		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
+}
+
+static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
+{
+	kref_get(&debug->d_refcnt);
+}
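+
+/*
+ * Reference pairing, as used in this file: ocfs2_new_dlm_debug() starts
+ * the count at one via kref_init(); ocfs2_dlm_init_debug() /
+ * ocfs2_dlm_shutdown_debug() and ocfs2_dlm_debug_open() /
+ * ocfs2_dlm_debug_release() each take and drop one more.  The structure
+ * is only kfree()'d once the last holder -- possibly a debugfs file still
+ * open after unmount -- lets go.
+ */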
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
+	if (!dlm_debug) {
+		mlog_errno(-ENOMEM);
+		goto out;
+	}
+
+	kref_init(&dlm_debug->d_refcnt);
+	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
+	dlm_debug->d_locking_state = NULL;
+out:
+	return dlm_debug;
+}
+
+/* Access to this is arbitrated for us via seq_file->sem. */
+struct ocfs2_dlm_seq_priv {
+	struct ocfs2_dlm_debug *p_dlm_debug;
+	struct ocfs2_lock_res p_iter_res;
+	struct ocfs2_lock_res p_tmp_res;
+};
+
+static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
+						 struct ocfs2_dlm_seq_priv *priv)
+{
+	struct ocfs2_lock_res *iter, *ret = NULL;
+	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
+
+	assert_spin_locked(&ocfs2_dlm_tracking_lock);
+
+	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
+		/* discover the head of the list */
+		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
+			mlog(0, "End of list found, %p\n", ret);
+			break;
+		}
+
+		/* We track our "dummy" iteration lockres by a NULL
+		 * l_ops field. */
+		if (iter->l_ops != NULL) {
+			ret = iter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
+	if (iter) {
+		/* Since lockres structures have the lifetime of their
+		 * container (which can be inodes, ocfs2_supers, etc) we
+		 * want to copy this out to a temporary lockres while
+		 * still under the spinlock. Obviously after this we
+		 * can't trust any pointers on the copy returned, but
+		 * that's ok as the information we want isn't typically
+		 * held in them. */
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter = v;
+	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(iter, priv);
+	list_del_init(&dummy->l_debug_list);
+	if (iter) {
+		list_add(&dummy->l_debug_list, &iter->l_debug_list);
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define OCFS2_DLM_DEBUG_STR_VERSION 1
+static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+	char *lvb;
+	struct ocfs2_lock_res *lockres = v;
+
+	if (!lockres)
+		return -EINVAL;
+
+	seq_printf(m, "0x%x\t"
+		   "%.*s\t"
+		   "%d\t"
+		   "0x%lx\t"
+		   "0x%x\t"
+		   "0x%x\t"
+		   "%u\t"
+		   "%u\t"
+		   "%d\t"
+		   "%d\t",
+		   OCFS2_DLM_DEBUG_STR_VERSION,
+		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
+		   lockres->l_level,
+		   lockres->l_flags,
+		   lockres->l_action,
+		   lockres->l_unlock_action,
+		   lockres->l_ro_holders,
+		   lockres->l_ex_holders,
+		   lockres->l_requested,
+		   lockres->l_blocking);
+
+	/* Dump the raw LVB */
+	lvb = lockres->l_lksb.lvb;
+	for (i = 0; i < DLM_LVB_LEN; i++)
+		seq_printf(m, "0x%x\t", lvb[i]);
+
+	/* End the line */
+	seq_printf(m, "\n");
+	return 0;
+}
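+
+/*
+ * For reference, each line emitted above is a tab-separated run of fields
+ * in this order (format version 1):
+ *
+ *	version, name, level, flags, action, unlock_action, ro_holders,
+ *	ex_holders, requested, blocking, then the raw LVB bytes.
+ */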
+
+static struct seq_operations ocfs2_dlm_seq_ops = {
+	.start =	ocfs2_dlm_seq_start,
+	.stop =		ocfs2_dlm_seq_stop,
+	.next =		ocfs2_dlm_seq_next,
+	.show =		ocfs2_dlm_seq_show,
+};
+
+static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *) file->private_data;
+	struct ocfs2_dlm_seq_priv *priv = seq->private;
+	struct ocfs2_lock_res *res = &priv->p_iter_res;
+
+	ocfs2_remove_lockres_tracking(res);
+	ocfs2_put_dlm_debug(priv->p_dlm_debug);
+	return seq_release_private(inode, file);
+}
+
+static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct ocfs2_dlm_seq_priv *priv;
+	struct seq_file *seq;
+	struct ocfs2_super *osb;
+
+	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	osb = (struct ocfs2_super *) inode->u.generic_ip;
+	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
+	priv->p_dlm_debug = osb->osb_dlm_debug;
+	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
+
+	ret = seq_open(file, &ocfs2_dlm_seq_ops);
+	if (ret) {
+		kfree(priv);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	seq = (struct seq_file *) file->private_data;
+	seq->private = priv;
+
+	ocfs2_add_lockres_tracking(&priv->p_iter_res,
+				   priv->p_dlm_debug);
+
+out:
+	return ret;
+}
+
+static struct file_operations ocfs2_dlm_debug_fops = {
+	.open =		ocfs2_dlm_debug_open,
+	.release =	ocfs2_dlm_debug_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+
+static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
+							 S_IFREG|S_IRUSR,
+							 osb->osb_debug_root,
+							 osb,
+							 &ocfs2_dlm_debug_fops);
+	if (!dlm_debug->d_locking_state) {
+		ret = -EINVAL;
+		mlog(ML_ERROR,
+		     "Unable to create locking state debugfs file.\n");
+		goto out;
+	}
+
+	ocfs2_get_dlm_debug(dlm_debug);
+out:
+	return ret;
+}
+
+static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
+{
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	if (dlm_debug) {
+		debugfs_remove(dlm_debug->d_locking_state);
+		ocfs2_put_dlm_debug(dlm_debug);
+	}
+}
+
+int ocfs2_dlm_init(struct ocfs2_super *osb)
+{
+	int status;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	status = ocfs2_dlm_init_debug(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* launch vote thread */
+	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
+				     osb->osb_id);
+	if (IS_ERR(osb->vote_task)) {
+		status = PTR_ERR(osb->vote_task);
+		osb->vote_task = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
+
+	/* for now, uuid == domain */
+	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+
+	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+
+	osb->dlm = dlm;
+
+	status = 0;
+bail:
+	if (status < 0) {
+		ocfs2_dlm_shutdown_debug(osb);
+		if (osb->vote_task)
+			kthread_stop(osb->vote_task);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
+
+	ocfs2_drop_osb_locks(osb);
+
+	if (osb->vote_task) {
+		kthread_stop(osb->vote_task);
+		osb->vote_task = NULL;
+	}
+
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+
+	dlm_unregister_domain(osb->dlm);
+	osb->dlm = NULL;
+
+	ocfs2_dlm_shutdown_debug(osb);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
+	     lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/* We tried to cancel a convert request, but it was already
+	 * granted. All we want to do here is clear our unlock
+	 * state. The wake_up call done at the bottom is redundant
+	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
+	 * hurt anything anyway */
+	if (status == DLM_CANCELGRANT &&
+	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
+
+		/* We don't clear the busy flag in this case as it
+		 * should have been cleared by the ast which the dlm
+		 * has called. */
+		goto complete_unlock;
+	}
+
+	if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
+		     "unlock_action %d\n", status, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch (lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+complete_unlock:
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
+
+struct drop_lock_cb {
+	ocfs2_pre_drop_cb_t	*drop_func;
+	void			*drop_data;
+};
+
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres,
+			   struct drop_lock_cb *dcb)
+{
+	enum dlm_status status;
+	unsigned long flags;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (dcb)
+		dcb->drop_func(lockres, dcb->drop_data);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
+}
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the vote thread before we can consider
+ * it safe to drop. 
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore. */
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
+
+		status = ocfs2_wait_for_mask(&mw);
+		if (status)
+			mlog_errno(status);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
+{
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
+{
+	struct inode *inode = data;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level == LKM_EXMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+		__ocfs2_stuff_meta_lvb(inode);
+}
+
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
+
+	mlog_entry_void();
+
+	/* No need to call ocfs2_mark_lockres_freeing here -
+	 * ocfs2_clear_inode has done it for us. */
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+
+	status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_meta_lockres,
+			      &meta_dcb);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	if (lockres->l_level <= new_level) {
+		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+		     lockres->l_level, new_level);
+		BUG();
+	}
+
+	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
+	     lockres->l_name, new_level, lockres->l_blocking);
+
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb)
+{
+	int ret, dlm_flags = LKM_CONVERT;
+	enum dlm_status status;
+
+	mlog_entry_void();
+
+	if (lvb)
+		dlm_flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* returns 1 when the caller should unlock and call dlmunlock */
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+
+		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
+	/* were we in a convert when the bast fired? */
+	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
+	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
+			"lock %s, invalid flags: 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	return 1;
+}
+
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	enum dlm_status status;
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	ret = 0;
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int ret;
+
+	mlog_entry_void();
+
+	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		ret = 0;
+		mlog(0, "lockres %s currently being refreshed -- backing "
+		     "off!\n", lockres->l_name);
+	} else if (new_level == LKM_PRMODE)
+		ret = !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+	else /* Must be NLMODE we're converting to. */
+		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+
+	mlog_exit(ret);
+	return ret;
+}
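+
+/*
+ * Summarizing the checks above: a downconvert to LKM_PRMODE needs no EX
+ * holders; a downconvert to LKM_NLMODE needs no holders at all.  Both
+ * also require the inode to be fully checkpointed, and neither may
+ * proceed while the lockres is being refreshed.
+ */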
+
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	int ret = 0;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	unsigned long flags;
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
+	     lockres->l_blocking);
+
+	BUG_ON(lockres->l_level != LKM_EXMODE &&
+	       lockres->l_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
+	     lockres->l_level, lockres->l_blocking, new_level);
+
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		if (lockres->l_level == LKM_EXMODE)
+			set_lvb = 1;
+
+		/* If the lock hasn't been refreshed yet (rare), then
+		 * our in-memory inode values are old and we skip
+		 * stuffing the lvb. There's no need to actually clear
+		 * out the lvb here as its value is still valid. */
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			if (set_lvb)
+				__ocfs2_stuff_meta_lvb(inode);
+		} else
+			mlog(0, "lockres %s: downconverting stale lock!\n",
+			     lockres->l_name);
+
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+		goto leave;
+	}
+	if (!ocfs2_inode_fully_checkpointed(inode))
+		ocfs2_start_checkpoint(osb);
+
+	*requeue = 1;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = 0;
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	worker(lockres, blocking);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	mapping = inode->i_mapping;
+
+	if (filemap_fdatawrite(mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!",
+		     OCFS2_I(inode)->ip_blkno);
+	}
+	sync_mapping_buffers(mapping);
+	if (blocking == LKM_EXMODE) {
+		truncate_inode_pages(mapping, 0);
+		unmap_mapping_range(mapping, 0, 0, 0);
+	} else {
+		/* We only need to wait on the I/O if we're not also
+		 * truncating pages because truncate_inode_pages waits
+		 * for us above. We don't truncate pages if we're
+		 * blocking anything < EXMODE because we want to keep
+		 * them around in that case. */
+		filemap_fdatawait(mapping);
+	}
+
+	mlog_exit_void();
+}
+
+int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    ocfs2_data_convert_worker);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+				    int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	inode  = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_do_unblock_meta(inode, requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Generic unblock function for any lockres whose private data is an
+ * ocfs2_super pointer. */
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue)
+{
+	int status;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	osb = ocfs2_lock_res_super(lockres);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+	unsigned long flags;
+
+	/* Our reference to the lockres in this function can be
+	 * considered valid until we remove the OCFS2_LOCK_QUEUED
+	 * flag. */
+
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+	BUG_ON(!lockres->l_ops);
+	BUG_ON(!lockres->l_ops->unblock);
+
+	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+
+	/* Detect whether a lock has been marked as going away while
+	 * the vote thread was processing other things. A lock can
+	 * still be marked with OCFS2_LOCK_FREEING after this check,
+	 * but short circuiting here will still save us some
+	 * performance. */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_FREEING)
+		goto unqueue;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+unqueue:
+	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+	} else
+		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	     requeue ? "yes" : "no");
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
+		/* Do not schedule a lock for downconvert when it's on
+		 * the way to destruction - any nodes wanting access
+		 * to the resource will get it soon. */
+		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		     lockres->l_name, lockres->l_flags);
+		return;
+	}
+
+	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	mlog_exit_void();
+}
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u\n",
+	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
+	mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
+	     be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
+	     be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
+	     "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
+	     be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
+	     be64_to_cpu(lvb->lvb_ictime_packed),
+	     be64_to_cpu(lvb->lvb_imtime_packed));
+}

+ 111 - 0
fs/ocfs2/dlmglue.h

@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * Cluster lock glue: lock API prototypes and meta LVB definition
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+#define OCFS2_LVB_VERSION 2
+
+struct ocfs2_meta_lvb {
+	__be32       lvb_version;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
+};
+
+/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the vote thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
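+
+/*
+ * A hypothetical caller wanting a trylock that must not queue behind
+ * other nodes would pass a flag explicitly (illustrative sketch only):
+ *
+ *	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1,
+ *				      OCFS2_META_LOCK_NOQUEUE);
+ */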
+
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+
+/* for the vote thread */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+
+/* aids in debugging and tracking lvbs */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres);
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+#endif	/* DLMGLUE_H */

+ 45 - 0
fs/ocfs2/endian.h

@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ENDIAN_H
+#define OCFS2_ENDIAN_H
+
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+	*var = cpu_to_le16(le16_to_cpu(*var) + val);
+}
+
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) + val);
+}
+
+static inline void le32_and_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) & val);
+}
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
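+
+/*
+ * Typical use (see extent_map.c in this patch): grow an on-disk
+ * little-endian counter in place without open-coding the byte swaps at
+ * the call site:
+ *
+ *	le32_add_cpu(&old->e_clusters, new_clusters);
+ */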
+
+#endif /* OCFS2_ENDIAN_H */

+ 248 - 0
fs/ocfs2/export.c

@@ -0,0 +1,248 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result;
+
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+
+	if (handle->ih_blkno == 0) {
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		return (void *)inode;
+	}
+
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	result = d_alloc_anon(inode);
+
+	if (!result) {
+		iput(inode);
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mlog_exit_ptr(result);
+	return result;
+}
+
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+bail:
+	mlog_exit_ptr(parent);
+
+	return parent;
+}
+
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+
+		spin_unlock(&dentry->d_lock);
+
+		len = 6;
+		type = 2;
+
+		mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
+		     blkno, generation);
+	}
+
+	*max_len = len;
+
+bail:
+	mlog_exit(type);
+	return type;
+}
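+
+/*
+ * Resulting handle layout: fh[0] and fh[1] carry the high and low 32 bits
+ * of the inode's blkno and fh[2] its generation (type 1).  A connectable
+ * handle for a non-directory appends the same triple for the parent in
+ * fh[3..5] (type 2).
+ */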
+
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+				      int fh_len, int fileid_type,
+				      int (*acceptable)(void *context,
+						        struct dentry *de),
+				      void *context)
+{
+	struct ocfs2_inode_handle handle, parent;
+	struct dentry *ret = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
+		   sb, fh, fh_len, fileid_type, acceptable, context);
+
+	if (fh_len < 3 || fileid_type > 2)
+		goto bail;
+
+	if (fileid_type == 2) {
+		if (fh_len < 6)
+			goto bail;
+
+		parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
+		parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
+		parent.ih_generation = le32_to_cpu(fh[5]);
+
+		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
+		     parent.ih_blkno, parent.ih_generation);
+	}
+
+	handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
+	handle.ih_generation = le32_to_cpu(fh[2]);
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     handle.ih_blkno, handle.ih_generation);
+
+	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
+						    acceptable, context);
+
+bail:
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+struct export_operations ocfs2_export_ops = {
+	.decode_fh	= ocfs2_decode_fh,
+	.encode_fh	= ocfs2_encode_fh,
+
+	.get_parent	= ocfs2_get_parent,
+	.get_dentry	= ocfs2_get_dentry,
+};

+ 31 - 0
fs/ocfs2/export.h

@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_EXPORT_H
+#define OCFS2_EXPORT_H
+
+extern struct export_operations ocfs2_export_ops;
+
+#endif /* OCFS2_EXPORT_H */

+ 994 - 0
fs/ocfs2/extent_map.c

@@ -0,0 +1,994 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.c
+ *
+ * In-memory extent map for OCFS2.  Man, this code was prettier in
+ * the library.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_EXTENT_MAP
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "extent_map.h"
+#include "inode.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+
+/*
+ * SUCK SUCK SUCK
+ * Our headers are so bad that struct ocfs2_extent_map is in ocfs2.h
+ */
+
+struct ocfs2_extent_map_entry {
+	struct rb_node e_node;
+	int e_tree_depth;
+	struct ocfs2_extent_rec e_rec;
+};
+
+struct ocfs2_em_insert_context {
+	int need_left;
+	int need_right;
+	struct ocfs2_extent_map_entry *new_ent;
+	struct ocfs2_extent_map_entry *old_ent;
+	struct ocfs2_extent_map_entry *left_ent;
+	struct ocfs2_extent_map_entry *right_ent;
+};
+
+static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
+
+
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent);
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth);
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent);
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el);
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos, u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent);
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt);
+
+/* Returns 1 only if the rec contains all the given clusters -- that is,
+ * rec's cpos is <= the given cpos and the rec's endpoint (cpos +
+ * clusters) is >= the argument's endpoint. */
+static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
+					      u32 cpos, u32 clusters)
+{
+	if (le32_to_cpu(rec->e_cpos) > cpos)
+		return 0;
+	if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
+			      le32_to_cpu(rec->e_clusters))
+		return 0;
+	return 1;
+}
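+
+/*
+ * For example, a rec with e_cpos = 10 and e_clusters = 20 covers the
+ * half-open range [10, 30): a query of cpos = 12, clusters = 8 (range
+ * [12, 20)) returns 1, while cpos = 25, clusters = 10 (range [25, 35))
+ * returns 0 because it runs past the rec's endpoint.
+ */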
+
+
+/*
+ * Find an entry in the tree that intersects the region passed in.
+ * Note that this will find straddled intervals, it is up to the
+ * callers to enforce any boundary conditions.
+ *
+ * Callers must hold ip_lock.  This lookup is not guaranteed to return
+ * a tree_depth 0 match, and as such can race inserts if the lock
+ * were not held.
+ *
+ * The rb_node garbage lets insertion share the search.  Trivial
+ * callers pass NULL.
+ */
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent)
+{
+	struct rb_node **p = &em->em_extents.rb_node;
+	struct rb_node *parent = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	while (*p) {
+		parent = *p;
+		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
+			       e_node);
+		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
+			p = &(*p)->rb_left;
+			ent = NULL;
+		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
+				    le32_to_cpu(ent->e_rec.e_clusters))) {
+			p = &(*p)->rb_right;
+			ent = NULL;
+		} else
+			break;
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+	return ent;
+}
+
+/*
+ * Find the leaf containing the interval we want.  While we're on our
+ * way down the tree, fill in every record we see at any depth, because
+ * we might want it later.
+ *
+ * Note that this code is run without ip_lock.  That's because it
+ * sleeps while reading.  If someone is also filling the extent list at
+ * the same time we are, we might have to restart.
+ */
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el)
+{
+	int i, ret;
+	struct buffer_head *eb_bh = NULL;
+	u64 blkno;
+	u32 rec_end;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * The bh data containing the el cannot change here, because
+	 * we hold alloc_sem.  So we can do this without other
+	 * locks.
+	 */
+	while (el->l_tree_depth)
+	{
+		blkno = 0;
+		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+			rec = &el->l_recs[i];
+			rec_end = (le32_to_cpu(rec->e_cpos) +
+				   le32_to_cpu(rec->e_clusters));
+
+			ret = -EBADR;
+			if (rec_end > OCFS2_I(inode)->ip_clusters) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			if (rec_end <= cpos) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+
+			/*
+			 * We've found a record that matches our
+			 * interval.  We don't insert it because we're
+			 * about to traverse it.
+			 */
+
+			/* Check to see if we're straddling */
+			ret = -ESRCH;
+			if (!ocfs2_extent_rec_contains_clusters(rec,
+							        cpos,
+								clusters)) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			/*
+			 * If we've already found a record, the el has
+			 * two records covering the same interval.
+			 * EEEK!
+			 */
+			ret = -EBADR;
+			if (blkno) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			blkno = le64_to_cpu(rec->e_blkno);
+		}
+
+		/*
+		 * We don't support holes, and we're still up
+		 * in the branches, so we'd better have found a child
+		 * block to descend into.
+		 */
+		ret = -EBADR;
+		if (!blkno) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+
+		if (eb_bh) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+		}
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       blkno, &eb_bh, OCFS2_BH_CACHED,
+				       inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			ret = -EIO;
+			goto out_free;
+		}
+		el = &eb->h_list;
+	}
+
+	if (el->l_tree_depth)
+		BUG();
+
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_extent_map_insert(inode, rec,
+					      le16_to_cpu(el->l_tree_depth));
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+	}
+
+	ret = 0;
+
+out_free:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	return ret;
+}
+
+/*
+ * This lookup actually will read from disk.  It has one invariant:
+ * It will never re-traverse blocks.  This means that all inserts should
+ * be new regions or more granular regions (both allowed by insert).
+ */
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos,
+					u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent)
+{
+	int ret;
+	u64 blkno;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_list *el;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (ent) {
+		if (!ent->e_tree_depth) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			*ret_ent = ent;
+			return 0;
+		}
+		blkno = le64_to_cpu(ent->e_rec.e_blkno);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			if (bh)
+				brelse(bh);
+			return ret;
+		}
+		eb = (struct ocfs2_extent_block *)bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			brelse(bh);
+			return -EIO;
+		}
+		el = &eb->h_list;
+	} else {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       OCFS2_I(inode)->ip_blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			if (bh)
+				brelse(bh);
+			return ret;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		if (!OCFS2_IS_VALID_DINODE(di)) {
+			brelse(bh);
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
+			return -EIO;
+		}
+		el = &di->id2.i_list;
+	}
+
+	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
+	brelse(bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (!ent) {
+		ret = -ESRCH;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ent->e_tree_depth)
+		BUG();  /* FIXME: Make sure this isn't a corruption */
+
+	*ret_ent = ent;
+
+	return 0;
+}
+
+/*
+ * Callers must hold ip_lock.  This can insert pieces of the tree,
+ * thus racing lookup if the lock weren't held.
+ */
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent)
+{
+	struct rb_node **p, *parent;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
+					  le32_to_cpu(ent->e_rec.e_clusters),
+					  &p, &parent);
+	if (old_ent)
+		return -EEXIST;
+
+	rb_link_node(&ent->e_node, parent, p);
+	rb_insert_color(&ent->e_node, &em->em_extents);
+
+	return 0;
+}
+
+
+/*
+ * Simple rule: on any return code other than -EAGAIN, anything left
+ * in the insert_context will be freed.
+ */
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	ctxt->need_left = 0;
+	ctxt->need_right = 0;
+	ctxt->old_ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+	if (!ret) {
+		ctxt->new_ent = NULL;
+		goto out_unlock;
+	}
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
+					  le32_to_cpu(rec->e_clusters), NULL,
+					  NULL);
+
+	BUG_ON(!old_ent);
+
+	ret = -EEXIST;
+	if (old_ent->e_tree_depth < tree_depth)
+		goto out_unlock;
+
+	if (old_ent->e_tree_depth == tree_depth) {
+		if (!memcmp(rec, &old_ent->e_rec,
+			    sizeof(struct ocfs2_extent_rec)))
+			ret = 0;
+
+		/* FIXME: Should this be ESRCH/EBADR??? */
+		goto out_unlock;
+	}
+
+	/*
+	 * We do it in this order specifically so that no actual tree
+	 * changes occur until we have all the pieces we need.  We
+	 * don't want malloc failures to leave an inconsistent tree.
+	 * Whenever we drop the lock, another process could be
+	 * inserting.  Also note that, if another process just beat us
+	 * to an insert, we might not need the same pieces we needed
+	 * the first go round.  In the end, the pieces we need will
+	 * be used, and the pieces we don't will be freed.
+	 */
+	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
+			     le32_to_cpu(old_ent->e_rec.e_cpos));
+	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
+			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
+			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
+	ret = -EAGAIN;
+	if (ctxt->need_left) {
+		if (!ctxt->left_ent)
+			goto out_unlock;
+		*(ctxt->left_ent) = *old_ent;
+		ctxt->left_ent->e_rec.e_clusters =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
+				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
+	}
+	if (ctxt->need_right) {
+		if (!ctxt->right_ent)
+			goto out_unlock;
+		*(ctxt->right_ent) = *old_ent;
+		ctxt->right_ent->e_rec.e_cpos =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
+				    le32_to_cpu(rec->e_clusters));
+		ctxt->right_ent->e_rec.e_clusters =
+			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
+				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
+				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
+	}
+
+	rb_erase(&old_ent->e_node, &em->em_extents);
+	/* Now that it's erased, set it up for deletion */
+	ctxt->old_ent = old_ent;
+
+	if (ctxt->need_left) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->left_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->left_ent = NULL;
+	}
+
+	if (ctxt->need_right) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->right_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->right_ent = NULL;
+	}
+
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+
+	if (!ret)
+		ctxt->new_ent = NULL;
+
+out_unlock:
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	return ret;
+}
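+
+/*
+ * ocfs2_extent_map_insert() below drives this: it loops on -EAGAIN,
+ * allocating left_ent / right_ent as need_left / need_right demand, and
+ * frees whatever is still hanging off the context once the loop exits.
+ */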
+
+
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth)
+{
+	int ret;
+	struct ocfs2_em_insert_context ctxt = {0, };
+
+	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+	    OCFS2_I(inode)->ip_map.em_clusters) {
+		ret = -EBADR;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Zero e_clusters means a truncated tail record.  It better be EOF */
+	if (!rec->e_clusters) {
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
+		    OCFS2_I(inode)->ip_map.em_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		/* Ignore the truncated tail */
+		return 0;
+	}
+
+	ret = -ENOMEM;
+	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+					GFP_KERNEL);
+	if (!ctxt.new_ent) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ctxt.new_ent->e_rec = *rec;
+	ctxt.new_ent->e_tree_depth = tree_depth;
+
+	do {
+		ret = -ENOMEM;
+		if (ctxt.need_left && !ctxt.left_ent) {
+			ctxt.left_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.left_ent)
+				break;
+		}
+		if (ctxt.need_right && !ctxt.right_ent) {
+			ctxt.right_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.right_ent)
+				break;
+		}
+
+		ret = ocfs2_extent_map_try_insert(inode, rec,
+						  tree_depth, &ctxt);
+	} while (ret == -EAGAIN);
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	if (ctxt.left_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
+	if (ctxt.right_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
+	if (ctxt.old_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
+	if (ctxt.new_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+
+	return ret;
+}
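
The loop above is the allocate-outside-the-lock pattern: GFP_KERNEL
allocations may sleep, so they cannot be made under ip_lock, and -EAGAIN
tells the caller to allocate whatever the locked attempt discovered it was
missing and try again.  A hypothetical userspace sketch of the shape, with a
pthread mutex standing in for the spinlock:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

struct ctx {
	pthread_mutex_t lock;
	int need_piece;		/* discovered under the lock */
	void *piece;		/* preallocated outside the lock */
};

static int try_insert_locked(struct ctx *c)
{
	if (!c->need_piece) {
		c->need_piece = 1;	/* first pass: a piece is needed */
		return -EAGAIN;
	}
	if (!c->piece)
		return -EAGAIN;		/* caller allocates, then retries */
	/* ... splice c->piece into the tree here ... */
	c->piece = NULL;		/* consumed */
	return 0;
}

int insert_with_prealloc(struct ctx *c)
{
	int ret;

	do {
		if (c->need_piece && !c->piece) {
			c->piece = malloc(64);	/* may sleep; no lock held */
			if (!c->piece)
				return -ENOMEM;
		}
		pthread_mutex_lock(&c->lock);
		ret = try_insert_locked(c);
		pthread_mutex_unlock(&c->lock);
	} while (ret == -EAGAIN);

	free(c->piece);		/* unused leftovers are harmless */
	return ret;
}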
+
+/*
+ * Append this record to the tail of the extent map.  It must be
+ * tree_depth 0.  The record might be an extension of an existing
+ * record, which needs to be handled.  e.g.:
+ *
+ * Existing record in the extent map:
+ *
+ *	cpos = 10, len = 10
+ * 	|---------|
+ *
+ * New Record:
+ *
+ *	cpos = 10, len = 20
+ * 	|------------------|
+ *
+ * The passed record is the new on-disk record.  The new_clusters value
+ * is how many clusters were added to the file.  If the append is a
+ * contiguous append, the new_clusters has been added to
+ * rec->e_clusters.  If the append is an entirely new extent, then
+ * rec->e_clusters == new_clusters.
+ */
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct ocfs2_extent_rec *old;
+
+	BUG_ON(!new_clusters);
+	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+
+	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
+			 le32_to_cpu(rec->e_clusters)) !=
+			(em->em_clusters + new_clusters),
+			"Inode %"MLFu64":\n"
+			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
+			"em->em_clusters = %u + new_clusters = %u = %u\n",
+			OCFS2_I(inode)->ip_blkno,
+			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
+			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
+			em->em_clusters, new_clusters,
+			em->em_clusters + new_clusters);
+
+	em->em_clusters += new_clusters;
+
+	ret = -ENOENT;
+	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
+		/* This is a contiguous append */
+		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
+					      NULL, NULL);
+		if (ent) {
+			old = &ent->e_rec;
+			BUG_ON((le32_to_cpu(rec->e_cpos) +
+				le32_to_cpu(rec->e_clusters)) !=
+				 (le32_to_cpu(old->e_cpos) +
+				  le32_to_cpu(old->e_clusters) +
+				  new_clusters));
+			if (ent->e_tree_depth == 0) {
+				BUG_ON(le32_to_cpu(old->e_cpos) !=
+				       le32_to_cpu(rec->e_cpos));
+				BUG_ON(le64_to_cpu(old->e_blkno) !=
+				       le64_to_cpu(rec->e_blkno));
+				ret = 0;
+			}
+			/*
+			 * Let non-leafs fall through as -ENOENT to
+			 * force insertion of the new leaf.
+			 */
+			le32_add_cpu(&old->e_clusters, new_clusters);
+		}
+	}
+
+	if (ret == -ENOENT)
+		ret = ocfs2_extent_map_insert(inode, rec, 0);
+	if (ret < 0)
+		mlog_errno(ret);
+	return ret;
+}
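
For reference, the le32_add_cpu() used above updates the little-endian
on-disk field in place without an explicit temporary; it is effectively:

static inline void le32_add_cpu(__le32 *var, u32 val)
{
	*var = cpu_to_le32(le32_to_cpu(*var) + val);
}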
+
+#if 0
+/* Code here is included but defined out as it completes the extent
+ * map api and may be used in the future. */
+
+/*
+ * Look up the record containing this cluster offset.  This record is
+ * part of the extent map.  Do not free it.  Any changes you make to
+ * it will be reflected in the extent map.  So, if your last extent
+ * is (cpos = 10, clusters = 10) and you truncate the file by 5
+ * clusters, you can do:
+ *
+ * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
+ * rec->e_clusters -= 5;
+ *
+ * The lookup does not read from disk.  If the map isn't filled in for
+ * an entry, you won't find it.
+ *
+ * Also note that the returned record is valid until alloc_sem is
+ * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ */
+int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+			     struct ocfs2_extent_rec **rec,
+			     int *tree_depth)
+{
+	int ret = -ENOENT;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*rec = NULL;
+
+	if (cpos >= OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if (cpos >= em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
+				      NULL, NULL);
+
+	if (ent) {
+		*rec = &ent->e_rec;
+		if (tree_depth)
+			*tree_depth = ent->e_tree_depth;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+int ocfs2_extent_map_get_clusters(struct inode *inode,
+				  u32 v_cpos, int count,
+				  u32 *p_cpos, int *ret_count)
+{
+	int ret;
+	u32 coff, ccount;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	*p_cpos = ccount = 0;
+
+	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if ((v_cpos + count) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
+	if (ret)
+		return ret;
+
+	if (ent) {
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
+							v_cpos,
+							count))
+			return -ESRCH;
+
+		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
+		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+				le64_to_cpu(ent->e_rec.e_blkno)) +
+			  coff;
+
+		if (ret_count)
+			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+#endif  /*  0  */
+
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count)
+{
+	int ret;
+	u64 boff;
+	u32 cpos, clusters;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	struct ocfs2_extent_map_entry *ent = NULL;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_rec *rec;
+
+	*p_blkno = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
+	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
+					    (u64)count + bpc - 1);
+	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if ((cpos + clusters) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ent) {
+		rec = &ent->e_rec;
+
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
+			ret = -ESRCH;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
+						le32_to_cpu(rec->e_cpos));
+		boff += (v_blkno & (u64)(bpc - 1));
+		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
+
+		if (ret_count) {
+			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
+					le32_to_cpu(rec->e_clusters)) - boff;
+		}
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
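
The virtual-to-physical translation above is easiest to follow with
numbers.  A worked instance, assuming (hypothetically) 4K blocks and 32K
clusters, so bpc = 8:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const int bpc_bits = 3;		/* 8 blocks per cluster */
	uint32_t e_cpos = 2;		/* extent starts at virtual cluster 2 */
	uint64_t e_blkno = 800;		/* ... which sits at physical block 800 */
	uint64_t v_blkno = 21;		/* virtual block we want to map */

	uint32_t cpos = v_blkno >> bpc_bits;			/* = 2 */
	uint64_t boff = (uint64_t)(cpos - e_cpos) << bpc_bits;	/* = 0 */

	boff += v_blkno & ((1 << bpc_bits) - 1);		/* + 5 */
	assert(e_blkno + boff == 805);	/* virtual block 21 -> physical 805 */
	return 0;
}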
+
+int ocfs2_extent_map_init(struct inode *inode)
+{
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+
+	em->em_extents = RB_ROOT;
+	em->em_clusters = 0;
+
+	return 0;
+}
+
+/* Needs the lock */
+static void __ocfs2_extent_map_drop(struct inode *inode,
+				    u32 new_clusters,
+				    struct rb_node **free_head,
+				    struct ocfs2_extent_map_entry **tail_ent)
+{
+	struct rb_node *node, *next;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*free_head = NULL;
+
+	ent = NULL;
+	node = rb_last(&em->em_extents);
+	while (node) {
+		next = rb_prev(node);
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
+			break;
+
+		rb_erase(&ent->e_node, &em->em_extents);
+
+		node->rb_right = *free_head;
+		*free_head = node;
+
+		ent = NULL;
+		node = next;
+	}
+
+	/* Do we have an entry straddling new_clusters? */
+	if (tail_ent) {
+		if (ent &&
+		    ((le32_to_cpu(ent->e_rec.e_cpos) +
+		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
+			*tail_ent = ent;
+		else
+			*tail_ent = NULL;
+	}
+}
+
+static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
+{
+	struct rb_node *node;
+	struct ocfs2_extent_map_entry *ent;
+
+	while (free_head) {
+		node = free_head;
+		free_head = node->rb_right;
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		kmem_cache_free(ocfs2_em_ent_cachep, ent);
+	}
+}
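
The helper pair above implements deferred freeing: entries are erased from
the tree under ip_lock and chained through the now-unused rb_right pointer,
so the kmem_cache_free() calls can happen once the lock is released.  The
idiom in plain C, with a hypothetical entry type:

#include <stdlib.h>

struct ent {
	struct ent *rb_left, *rb_right;	/* tree links */
};

/* under the lock: after unlinking from the tree, push onto the free list */
void defer_free(struct ent **free_head, struct ent *e)
{
	e->rb_right = *free_head;	/* rb_right doubles as 'next free' */
	*free_head = e;
}

/* after the lock is dropped: actually free everything */
void drain_free_list(struct ent *head)
{
	while (head) {
		struct ent *next = head->rb_right;

		free(head);
		head = next;
	}
}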
+
+/*
+ * Remove all entries past new_clusters, inclusive of an entry that
+ * contains new_clusters.  This is effectively a cache forget.
+ *
+ * If you want to also clip the last extent by some number of clusters,
+ * you need to call ocfs2_extent_map_trunc().
+ * This code does not check or modify ip_clusters.
+ */
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+{
+	struct rb_node *free_head = NULL;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+
+	if (ent) {
+		rb_erase(&ent->e_node, &em->em_extents);
+		ent->e_node.rb_right = free_head;
+		free_head = &ent->e_node;
+	}
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (free_head)
+		__ocfs2_extent_map_drop_cleanup(free_head);
+
+	return 0;
+}
+
+/*
+ * Remove all entries past new_clusters and also clip any extent
+ * straddling new_clusters, if there is one.  This does not check
+ * or modify ip_clusters
+ */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
+{
+	struct rb_node *free_head = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+
+	if (ent)
+		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
+					       le32_to_cpu(ent->e_rec.e_cpos));
+
+	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (free_head)
+		__ocfs2_extent_map_drop_cleanup(free_head);
+
+	return 0;
+}
+
+int __init init_ocfs2_extent_maps(void)
+{
+	ocfs2_em_ent_cachep =
+		kmem_cache_create("ocfs2_em_ent",
+				  sizeof(struct ocfs2_extent_map_entry),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ocfs2_em_ent_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void __exit exit_ocfs2_extent_maps(void)
+{
+	kmem_cache_destroy(ocfs2_em_ent_cachep);
+}

+ 46 - 0
fs/ocfs2/extent_map.h

@@ -0,0 +1,46 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+
+int init_ocfs2_extent_maps(void);
+void exit_ocfs2_extent_maps(void);
+
+/*
+ * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+ * to be held.  The allocation cannot change at all while the map is
+ * in the process of being updated.
+ */
+int ocfs2_extent_map_init(struct inode *inode);
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters);
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count);
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+
+#endif  /* _EXTENT_MAP_H */
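
A sketch (fragment only, using the names above) of the calling convention
the header comment describes: readers of the map hold ip_alloc_sem shared so
the allocation cannot change mid-lookup.

	down_read(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, 1, &p_blkno, NULL);
	up_read(&OCFS2_I(inode)->ip_alloc_sem);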

+ 1237 - 0
fs/ocfs2/file.c

@@ -0,0 +1,1237 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c
+ *
+ * File open, close, extend, truncate
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "mmap.h"
+#include "suballoc.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_sync_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+	return sync_mapping_buffers(inode->i_mapping);
+}
+
+static int ocfs2_file_open(struct inode *inode, struct file *file)
+{
+	int status;
+	int mode = file->f_flags;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		   file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+
+	/* Check that the inode hasn't been wiped from disk by another
+	 * node. If it hasn't then we're safe as long as we hold the
+	 * spin lock until our increment of open count. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&oi->ip_lock);
+
+		status = -ENOENT;
+		goto leave;
+	}
+
+	if (mode & O_DIRECT)
+		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
+
+	oi->ip_open_count++;
+	spin_unlock(&oi->ip_lock);
+	status = 0;
+leave:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_file_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		       file->f_dentry->d_name.len,
+		       file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+	if (!--oi->ip_open_count)
+		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit(0);
+
+	return 0;
+}
+
+static int ocfs2_sync_file(struct file *file,
+			   struct dentry *dentry,
+			   int datasync)
+{
+	int err = 0;
+	journal_t *journal;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	err = ocfs2_sync_inode(dentry->d_inode);
+	if (err)
+		goto bail;
+
+	journal = osb->journal->j_journal;
+	err = journal_force_commit(journal);
+
+bail:
+	mlog_exit(err);
+
+	return (err < 0) ? -EIO : 0;
+}
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size)
+{
+	int status;
+
+	mlog_entry_void();
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_simple_size_update(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    u64 new_i_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, NULL,
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_set_inode_size(handle, inode, di_bh,
+				   new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(handle);
+out:
+	return ret;
+}
+
+static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     u64 new_i_size)
+{
+	int status;
+	struct ocfs2_journal_handle *handle;
+
+	mlog_entry_void();
+
+	/* TODO: This needs to actually orphan the inode in this
+	 * transaction. */
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_truncate_context *tc = NULL;
+
+	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
+		   OCFS2_I(inode)->ip_blkno, new_i_size);
+
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
+	fe = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
+			"Inode %"MLFu64", inode i_size = %lld != di "
+			"i_size = %"MLFu64", i_flags = 0x%x\n",
+			OCFS2_I(inode)->ip_blkno,
+			i_size_read(inode),
+			le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
+
+	if (new_i_size > le64_to_cpu(fe->i_size)) {
+		mlog(0, "asked to truncate file with size (%"MLFu64") "
+		     "to size (%"MLFu64")!\n",
+		     le64_to_cpu(fe->i_size), new_i_size);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
+	     le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
+
+	/* let's handle the simple truncate cases before doing any more
+	 * cluster locking. */
+	if (new_i_size == le64_to_cpu(fe->i_size))
+		goto bail;
+
+	if (le32_to_cpu(fe->i_clusters) ==
+	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
+		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
+		     fe->i_clusters);
+		/* No allocation change is required, so let's fast path
+		 * this truncate. */
+		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (status < 0)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* This forces other nodes to sync and drop their pages */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_data_unlock(inode, 1);
+
+	/* alright, we're going to need to do a full blown alloc size
+	 * change. Orphan the inode so that recovery can complete the
+	 * truncate if necessary. This does the task of marking
+	 * i_size. */
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* TODO: orphan dir cleanup here. */
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * extend allocation only here.
+ * we'll update all the disk stuff, and oip->alloc_size
+ *
+ * expect stuff to be locked, a transaction started and enough data /
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
+ */
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0;
+	int free_extents;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+
+	BUG_ON(!clusters_to_add);
+
+	free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(fe))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				      &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
+	     num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
+				     num_bits, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_clusters, num_bits);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+
+	if (clusters_to_add) {
+		mlog(0, "need to alloc once more, clusters = %u, wanted = "
+		     "%u\n", fe->i_clusters, clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
+	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
+	return status;
+}
+
+static int ocfs2_extend_allocation(struct inode *inode,
+				   u32 clusters_to_add)
+{
+	int status = 0;
+	int restart_func = 0;
+	int drop_alloc_sem = 0;
+	int credits, num_free_extents;
+	u32 prev_clusters;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto leave;
+	}
+
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
+
+	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     fe->i_clusters, clusters_to_add);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	num_free_extents = ocfs2_num_free_extents(osb,
+						  inode,
+						  fe);
+	if (num_free_extents < 0) {
+		status = num_free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!num_free_extents) {
+		status = ocfs2_reserve_new_metadata(osb,
+						    handle,
+						    fe,
+						    &meta_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					clusters_to_add,
+					&data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* blocks people in read/write from reading our allocation
+	 * until we're done changing it. We depend on i_sem to block
+	 * other extend/truncate calls while we're here. Ordering wrt
+	 * start_trans is important here -- always do it before! */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+
+	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	/* reserve a write to the file entry early on - so that if we
+	 * run out of credits in the allocation path, we can still
+	 * update i_size. */
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
+
+	status = ocfs2_do_extend_allocation(osb,
+					    inode,
+					    clusters_to_add,
+					    bh,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    fe,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
+	     fe->i_clusters, fe->i_size);
+	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
+	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+
+leave:
+	if (drop_alloc_sem) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+		drop_alloc_sem = 0;
+	}
+	if (handle) {
+		ocfs2_commit_trans(handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->commit_write(). */
+static int ocfs2_write_zero_page(struct inode *inode,
+				 u64 size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index;
+	unsigned int offset;
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret;
+
+	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+	/*
+	 * ugh.  in prepare/commit_write, if from==to==start of block, we
+	 * skip the prepare.  make sure we never send an offset for the
+	 * start of a block.
+	 */
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		handle = ocfs2_start_walk_page_trans(inode, page, offset,
+						     offset);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock;
+		}
+	}
+
+	/* must not update i_size! */
+	ret = block_commit_write(page, offset, offset);
+	if (ret < 0)
+		mlog_errno(ret);
+	else
+		ret = 0;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
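
A worked instance of the offset fixup above, assuming (hypothetically) 4K
pages and a 1K filesystem block size:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t size = 0x3400;			 /* i_size, 1K-block aligned */
	unsigned int offset = size & (4096 - 1); /* 0x400 within the page */

	/* never hand prepare_write from == to == the start of a block */
	if ((offset & (1024 - 1)) == 0)
		offset++;

	assert(offset == 0x401);
	return 0;
}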
+
+static int ocfs2_zero_extend(struct inode *inode,
+			     u64 zero_to_size)
+{
+	int ret = 0;
+	u64 start_off;
+	struct super_block *sb = inode->i_sb;
+
+	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	while (start_off < zero_to_size) {
+		ret = ocfs2_write_zero_page(inode, start_off);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start_off += sb->s_blocksize;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	u32 clusters_to_add;
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
+		OCFS2_I(inode)->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_zero_extend(inode, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* Update i_size; when no allocation was required above, this is
+	 * the only change we need to make. */
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int status = 0, size_change;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+	           dentry->d_name.len, dentry->d_name.name);
+
+	if (attr->ia_valid & ATTR_MODE)
+		mlog(0, "mode change: %d\n", attr->ia_mode);
+	if (attr->ia_valid & ATTR_UID)
+		mlog(0, "uid change: %d\n", attr->ia_uid);
+	if (attr->ia_valid & ATTR_GID)
+		mlog(0, "gid change: %d\n", attr->ia_gid);
+	if (attr->ia_valid & ATTR_SIZE)
+		mlog(0, "size change...\n");
+	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
+		mlog(0, "time change...\n");
+
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+			   | ATTR_GID | ATTR_UID | ATTR_MODE)
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
+		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+		return 0;
+	}
+
+	status = inode_change_ok(inode, attr);
+	if (status)
+		return status;
+
+	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
+	if (size_change) {
+		status = ocfs2_rw_lock(inode, 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail_unlock_rw;
+	}
+
+	if (size_change && attr->ia_size != i_size_read(inode)) {
+		if (i_size_read(inode) > attr->ia_size)
+			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
+		else
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			status = -ENOSPC;
+			goto bail_unlock;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = inode_setattr(inode, attr);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode, 1);
+bail_unlock_rw:
+	if (size_change)
+		ocfs2_rw_unlock(inode, 1);
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct ocfs2_super *osb = sb->s_fs_info;
+	int err;
+
+	mlog_entry_void();
+
+	err = ocfs2_inode_revalidate(dentry);
+	if (err) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	generic_fillattr(inode, stat);
+
+	/* We set the blksize from the cluster size for performance */
+	stat->blksize = osb->s_clustersize;
+
+bail:
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+
+	mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
+		   inode->i_mode);
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_trans;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_bh;
+	}
+
+	inode->i_mode &= ~S_ISUID;
+	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
+		inode->i_mode &= ~S_ISGID;
+
+	di = (struct ocfs2_dinode *) bh->b_data;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_bh:
+	brelse(bh);
+out_trans:
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_write_should_remove_suid(struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+
+	if (!capable(CAP_FSETID)) {
+		if (unlikely(mode & S_ISUID))
+			return 1;
+
+		if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+			return 1;
+	}
+	return 0;
+}
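
The mode logic of the two helpers above, restated as a self-contained sketch
with a few worked mode values:

#include <assert.h>
#include <sys/stat.h>

static mode_t strip_suid_sgid(mode_t mode)
{
	mode &= ~S_ISUID;
	if ((mode & S_ISGID) && (mode & S_IXGRP))
		mode &= ~S_ISGID;
	return mode;
}

int main(void)
{
	assert(strip_suid_sgid(04755) == 0755);	 /* setuid cleared */
	assert(strip_suid_sgid(02755) == 0755);	 /* setgid + group-exec cleared */
	assert(strip_suid_sgid(02644) == 02644); /* setgid without group-exec
						  * (mandatory locking) kept */
	return 0;
}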
+
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
+	u32 clusters;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	loff_t newsize, saved_pos;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	/* happy write of zero bytes */
+	if (count == 0)
+		return 0;
+
+	if (!inode) {
+		mlog(0, "bad inode\n");
+		return -EIO;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	/* ugh, work around some applications which open everything O_DIRECT +
+	 * O_APPEND and really don't mean to use O_DIRECT. */
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
+		filp->f_flags &= ~O_DIRECT;
+#endif
+
+	down(&inode->i_sem);
+	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
+	if (filp->f_flags & O_DIRECT) {
+		have_alloc_sem = 1;
+		down_read(&inode->i_alloc_sem);
+	}
+
+	/* concurrent O_DIRECT writes are allowed */
+	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	ret = ocfs2_rw_lock(inode, rw_level);
+	if (ret < 0) {
+		rw_level = -1;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We sample i_size under a read level meta lock to see if our write
+	 * is extending the file; if it is, we back off and get a write level
+	 * meta lock.
+	 */
+	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
+	for (;;) {
+		ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+		if (ret < 0) {
+			meta_level = -1;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Clear suid / sgid if necessary. We do this here
+		 * instead of later in the write path because
+		 * remove_suid() calls ->setattr without any hint that
+		 * we may have already done our cluster locking. Since
+		 * ocfs2_setattr() *must* take cluster locks to
+		 * proceed, this will lead us to recursively lock the
+		 * inode. There's also the dinode i_size state which
+		 * can be lost via setattr during extending writes (we
+		 * set inode->i_size at the end of a write). */
+		if (ocfs2_write_should_remove_suid(inode)) {
+			if (meta_level == 0) {
+				ocfs2_meta_unlock(inode, meta_level);
+				meta_level = 1;
+				continue;
+			}
+
+			ret = ocfs2_write_remove_suid(inode);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* work on a copy of ppos until we're sure that we won't have
+		 * to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		/* No need for a higher level metadata lock if we're
+		 * never going past i_size. */
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (meta_level == 0) {
+			ocfs2_meta_unlock(inode, meta_level);
+			meta_level = 1;
+			continue;
+		}
+
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+			OCFS2_I(inode)->ip_clusters;
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		mlog(0, "Writing at EOF, may need more allocation: "
+		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+		     i_size_read(inode), newsize, clusters);
+
+		/* We only want to continue the rest of this loop if
+		 * our extend will actually require more
+		 * allocation. */
+		if (!clusters)
+			break;
+
+		ret = ocfs2_extend_allocation(inode, clusters);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/* Fill any holes which would've been created by this
+		 * write. If we're O_APPEND, this will wind up
+		 * (correctly) being a noop. */
+		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/* ok, we're done with i_size and alloc work */
+	iocb->ki_pos = saved_pos;
+	ocfs2_meta_unlock(inode, meta_level);
+	meta_level = -1;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    filp->f_flags & O_DIRECT) {
+		unsigned int saved_flags = filp->f_flags;
+		int sector_size = 1 << osb->s_sectsize_bits;
+
+		if ((saved_pos & (sector_size - 1)) ||
+		    (count & (sector_size - 1)) ||
+		    ((unsigned long)buf & (sector_size - 1))) {
+			filp->f_flags |= O_SYNC;
+			filp->f_flags &= ~O_DIRECT;
+		}
+
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+		filp->f_flags = saved_flags;
+	} else
+#endif
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/*
+	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
+	 * function pointer which is called when o_direct io completes so that
+	 * it can unlock our rw lock.  (it's the clustered equivalent of
+	 * i_alloc_sem; protects truncate from racing with pending ios).
+	 * Unfortunately there are error cases which call end_io and others
+	 * that don't, so we don't have to unlock the rw_lock if either an
+	 * async dio is going to do it in the future or an end_io after an
+	 * error has already done it.
+	 */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+out:
+	if (meta_level != -1)
+		ocfs2_meta_unlock(inode, meta_level);
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+	up(&inode->i_sem);
+
+	mlog_exit(ret);
+	return ret;
+}
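
The for (;;) loop above is a read-to-write lock upgrade: sample i_size under
the shared meta lock, and if the write turns out to need exclusive work
(extending the file, clearing suid), drop the lock, re-take it exclusively,
and redo every check from scratch.  A hypothetical userspace sketch, with a
pthread rwlock in place of the cluster lock and extern stubs standing in for
the real checks:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t meta = PTHREAD_RWLOCK_INITIALIZER;

extern bool write_extends_file(void);	/* must be re-sampled under the lock */
extern void extend_allocation(void);	/* needs the exclusive lock */

void lock_for_write(int *level)
{
	*level = 0;			/* optimistic: shared */
	for (;;) {
		if (*level)
			pthread_rwlock_wrlock(&meta);
		else
			pthread_rwlock_rdlock(&meta);

		if (!write_extends_file())
			break;		/* the shared level was enough */

		if (*level == 0) {	/* upgrade: drop, retry exclusive */
			pthread_rwlock_unlock(&meta);
			*level = 1;
			continue;
		}

		extend_allocation();	/* safe: held exclusively */
		break;
	}
	/* returns with the lock held at *level */
}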
+
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
+{
+	int ret = 0, rw_level = -1, have_alloc_sem = 0;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		if (filp->f_flags & O_DIRECT) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1)) ||
+			    (i_size_read(inode) & (sector_size -1))) {
+				filp->f_flags &= ~O_DIRECT;
+			}
+		}
+	}
+#endif
+
+	/*
+	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
+	 * need locks to protect pending reads from racing with truncate.
+	 */
+	if (filp->f_flags & O_DIRECT) {
+		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
+
+		ret = ocfs2_rw_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+		rw_level = 0;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_rw_locked(iocb);
+	}
+
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	if (ret == -EINVAL)
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* see ocfs2_file_aio_write */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+bail:
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct inode_operations ocfs2_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct inode_operations ocfs2_special_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct file_operations ocfs2_fops = {
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.sendfile	= generic_file_sendfile,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+};
+
+struct file_operations ocfs2_dops = {
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+};

+ 57 - 0
fs/ocfs2/file.h

@@ -0,0 +1,57 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_FILE_H
+#define OCFS2_FILE_H
+
+extern struct file_operations ocfs2_fops;
+extern struct file_operations ocfs2_dops;
+extern struct inode_operations ocfs2_file_iops;
+extern struct inode_operations ocfs2_special_file_iops;
+struct ocfs2_alloc_context;
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
+int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size);
+
+#endif /* OCFS2_FILE_H */

+ 378 - 0
fs/ocfs2/heartbeat.c

@@ -0,0 +1,378 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Register ourselves with the heartbeat service, keep our node maps
+ * up to date, and fire off recovery when needed.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kmod.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
+#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit);
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit);
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb)
+{
+	spin_lock_init(&osb->node_map_lock);
+	ocfs2_node_map_init(&osb->mounted_map);
+	ocfs2_node_map_init(&osb->recovery_map);
+	ocfs2_node_map_init(&osb->umount_map);
+}
+
+static void ocfs2_do_node_down(int node_num,
+			       struct ocfs2_super *osb)
+{
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "ocfs2: node down event for %d\n", node_num);
+
+	if (!osb->dlm) {
+		/*
+		 * No DLM means we're not even ready to participate yet.
+		 * We check the slots after the DLM comes up, so we will
+		 * notice the node death then.  We can safely ignore it
+		 * here.
+		 */
+		return;
+	}
+
+	if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+		/* If a node is in the umount map, then we've been
+		 * expecting him to go down and we know ahead of time
+		 * that recovery is not necessary. */
+		ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+		return;
+	}
+
+	ocfs2_recovery_thread(osb, node_num);
+
+	ocfs2_remove_node_from_vote_queues(osb, node_num);
+}
+
+static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
+				  int node_num,
+				  void *data)
+{
+	ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
+}
+
+/* Called from the dlm when it's about to evict a node. We may also
+ * get a heartbeat callback later. */
+static void ocfs2_dlm_eviction_cb(int node_num,
+				  void *data)
+{
+	struct ocfs2_super *osb = (struct ocfs2_super *) data;
+	struct super_block *sb = osb->sb;
+
+	mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
+	     MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
+
+	ocfs2_do_node_down(node_num, osb);
+}
+
+static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
+				int node_num,
+				void *data)
+{
+	struct ocfs2_super *osb = data;
+
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "node up event for %d\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
+{
+	o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
+			    ocfs2_hb_node_down_cb, osb,
+			    OCFS2_HB_NODE_DOWN_PRI);
+
+	o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
+			    ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
+
+	/* Not exactly a heartbeat callback, but leads to essentially
+	 * the same path so we set it up here. */
+	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
+			      ocfs2_dlm_eviction_cb,
+			      osb);
+}
+
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_register_callback(&osb->osb_hb_down);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = o2hb_register_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	return status;
+}
+
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_unregister_callback(&osb->osb_hb_down);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = o2hb_unregister_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+}
+
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	if (!osb->uuid_str) {
+		/* This can happen if we don't get far enough in mount... */
+		mlog(0, "No UUID with which to stop heartbeat!\n\n");
+		return;
+	}
+
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = osb->uuid_str;
+	argv[4] = NULL;
+
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+	       sizeof(unsigned long));
+}
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit)
+{
+	set_bit(bit, map->map);
+}
+
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	if (bit == -1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit)
+{
+	clear_bit(bit, map->map);
+}
+
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit)
+{
+	if (bit == -1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
+	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
+{
+	int bit;
+	bit = find_next_bit(map->map, map->num_nodes, 0);
+	if (bit < map->num_nodes)
+		return 0;
+	return 1;
+}
+
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map)
+{
+	int ret;
+	BUG_ON(map->num_nodes == 0);
+	spin_lock(&osb->node_map_lock);
+	ret = __ocfs2_node_map_is_empty(map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	BUG_ON(from->num_nodes == 0);
+	ocfs2_node_map_init(target);
+	__ocfs2_node_map_set(target, from);
+}
+
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit)
+{
+	struct ocfs2_node_map temp;
+	int ret;
+
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_dup(&temp, target);
+	__ocfs2_node_map_clear_bit(&temp, bit);
+	ret = __ocfs2_node_map_is_empty(&temp);
+	spin_unlock(&osb->node_map_lock);
+
+	return ret;
+}
+
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	int num_longs, i;
+
+	BUG_ON(target->num_nodes != from->num_nodes);
+	BUG_ON(target->num_nodes == 0);
+
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i = 0; i < num_longs; i++)
+		target->map[i] = from->map[i];
+}
+
+/* Returns whether the recovery bit was actually set - it may not be
+ * if a node is still marked as needing recovery */
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num)
+{
+	int set = 0;
+
+	spin_lock(&osb->node_map_lock);
+
+	__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
+
+	if (!test_bit(num, osb->recovery_map.map)) {
+		__ocfs2_node_map_set_bit(&osb->recovery_map, num);
+		set = 1;
+	}
+
+	spin_unlock(&osb->node_map_lock);
+
+	return set;
+}
+
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
+
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx)
+{
+	int i = idx;
+
+	idx = O2NM_INVALID_NODE_NUM;
+	spin_lock(&osb->node_map_lock);
+	if ((i != O2NM_INVALID_NODE_NUM) &&
+	    (i >= 0) &&
+	    (i < map->num_nodes)) {
+		while (i < map->num_nodes) {
+			if (test_bit(i, map->map)) {
+				idx = i;
+				break;
+			}
+			i++;
+		}
+	}
+	spin_unlock(&osb->node_map_lock);
+	return idx;
+}
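
ocfs2_node_map_iterate() returns the first set bit at or after idx, or
O2NM_INVALID_NODE_NUM once none remain, which gives callers a simple scan
loop.  A usage sketch (fragment, assuming an osb in scope):

	int node = ocfs2_node_map_first_set_bit(osb, &osb->recovery_map);

	while (node != O2NM_INVALID_NODE_NUM) {
		/* ... act on 'node', e.g. queue it for recovery ... */
		node = ocfs2_node_map_iterate(osb, &osb->recovery_map,
					      node + 1);
	}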

+ 67 - 0
fs/ocfs2/heartbeat.h

@@ -0,0 +1,67 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_HEARTBEAT_H
+#define OCFS2_HEARTBEAT_H
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb);
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs2_node_map_init(struct ocfs2_node_map *map);
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map);
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit);
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx);
+static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
+					       struct ocfs2_node_map *map)
+{
+	return ocfs2_node_map_iterate(osb, map, 0);
+}
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num);
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit);
+
+#endif /* OCFS2_HEARTBEAT_H */

+ 1140 - 0
fs/ocfs2/inode.c

@@ -0,0 +1,1140 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c
+ *
+ * vfs' aops, fops, dops and iops
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_FI_FLAG_NOWAIT	0x1
+#define OCFS2_FI_FLAG_DELETE	0x2
+struct ocfs2_find_inode_args
+{
+	u64		fi_blkno;
+	unsigned long	fi_ino;
+	unsigned int	fi_flags;
+};
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args);
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
+static int ocfs2_find_actor(struct inode *inode, void *opaque);
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh);
+
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote)
+{
+	struct ocfs2_find_inode_args args;
+
+	/* ocfs2_ilookup_for_vote should *only* be called from the
+	 * vote thread */
+	BUG_ON(current != osb->vote_task);
+
+	args.fi_blkno = blkno;
+	args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
+	if (delete_vote)
+		args.fi_flags |= OCFS2_FI_FLAG_DELETE;
+	args.fi_ino = ino_from_blkno(osb->sb, blkno);
+	return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
+}
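+
+/* Hypothetical sketch of a vote-thread caller: a delete vote looks
+ * the inode up without blocking on one that is being freed. A
+ * non-NULL return carries a reference the caller must drop. */
+static void example_process_delete_vote(struct ocfs2_super *osb, u64 blkno)
+{
+	struct inode *inode = ocfs2_ilookup_for_vote(osb, blkno, 1);
+
+	if (inode)
+		iput(inode);
+}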
+
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
+{
+	struct inode *inode = NULL;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_find_inode_args args;
+
+	mlog_entry("(blkno = %"MLFu64")\n", blkno);
+
+	/* OK. By now we've either got the offsets passed to us by the
+	 * caller, or we just pulled them off the bh. Let's do some
+	 * sanity checks to make sure they're OK. */
+	if (blkno == 0) {
+		inode = ERR_PTR(-EINVAL);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+
+	inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
+			     ocfs2_init_locked_inode, &args);
+	/* inode was *not* in the inode cache. 2.6.x requires
+	 * us to do our own read_inode call and unlock it
+	 * afterwards. */
+	if (inode && inode->i_state & I_NEW) {
+		mlog(0, "Inode was not in inode cache, reading it.\n");
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+	if (inode == NULL) {
+		inode = ERR_PTR(-ENOMEM);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = ERR_PTR(-ESTALE);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+bail:
+	if (!IS_ERR(inode)) {
+		mlog(0, "returning inode with number %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		mlog_exit_ptr(inode);
+	} else
+		mlog_errno(PTR_ERR(inode));
+
+	return inode;
+}
+
+
+/*
+ * here's how inodes get read from disk:
+ * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
+ * found? : return the in-memory inode
+ * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
+ */
+
+static int ocfs2_find_actor(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
+
+	args = opaque;
+
+	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+
+	if (oi->ip_blkno != args->fi_blkno)
+		goto bail;
+
+	/* OCFS2_FI_FLAG_NOWAIT is *only* set from
+	 * ocfs2_ilookup_for_vote which won't create an inode for one
+	 * that isn't found. The vote thread doesn't want to get an
+	 * inode which is in the process of going away - otherwise
+	 * the call to __wait_on_freeing_inode in find_inode_fast will
+	 * cause it to deadlock on an inode which may be waiting on a
+	 * vote (or lock release) in delete_inode */
+	if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
+	    (inode->i_state & (I_FREEING|I_CLEAR))) {
+		/* As stated above, we're not going to return an
+		 * inode.  In the case of a delete vote, the voting
+		 * code is going to signal the other node to go
+		 * ahead. Mark that state here, so this freeing inode
+		 * has the state when it gets to delete_inode. */
+		if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
+			spin_lock(&oi->ip_lock);
+			ocfs2_mark_inode_remotely_deleted(inode);
+			spin_unlock(&oi->ip_lock);
+		}
+		goto bail;
+	}
+
+	ret = 1;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * initialize the new inode, but don't do anything that would cause
+ * us to sleep.
+ * return 0 on success, 1 on failure
+ */
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = opaque;
+
+	mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+
+	inode->i_ino = args->fi_ino;
+	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+
+	mlog_exit(0);
+	return 0;
+}
+
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+		     	 int create_ino)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	int status = -EINVAL;
+
+	mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
+
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	/* this means that read_inode cannot create a superblock inode
+	 * today. Change if needed. */
+	if (!OCFS2_IS_VALID_DINODE(fe) ||
+	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
+		     "signature = %.*s, flags = 0x%x\n",
+		     inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature, le32_to_cpu(fe->i_flags));
+		goto bail;
+	}
+
+	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
+		mlog(ML_ERROR, "file entry generation does not match "
+		     "superblock! osb->fs_generation=%x, "
+		     "fe->i_fs_generation=%x\n",
+		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
+		goto bail;
+	}
+
+	inode->i_version = 1;
+	inode->i_generation = le32_to_cpu(fe->i_generation);
+	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_blksize = (u32)osb->s_clustersize;
+
+	/* Fast symlinks will have i_size but no allocated clusters. */
+	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+	inode->i_mapping->a_ops = &ocfs2_aops;
+	inode->i_flags |= S_NOATIME;
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
+		mlog(ML_ERROR,
+		     "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
+		     OCFS2_I(inode)->ip_blkno, fe->i_blkno);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+
+	if (create_ino)
+		inode->i_ino = ino_from_blkno(inode->i_sb,
+			       le64_to_cpu(fe->i_blkno));
+
+	mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
+	     fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
+
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
+		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
+		/* we can't actually hit this as read_inode can't
+		 * handle superblocks today ;-) */
+		BUG();
+	}
+
+	switch (inode->i_mode & S_IFMT) {
+	    case S_IFREG:
+		    inode->i_fop = &ocfs2_fops;
+		    inode->i_op = &ocfs2_file_iops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFDIR:
+		    inode->i_op = &ocfs2_dir_iops;
+		    inode->i_fop = &ocfs2_dops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFLNK:
+		    if (ocfs2_inode_is_fast_symlink(inode))
+			inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		    else
+			inode->i_op = &ocfs2_symlink_inode_operations;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    default:
+		    inode->i_op = &ocfs2_special_file_iops;
+		    init_special_inode(inode, inode->i_mode,
+				       inode->i_rdev);
+		    break;
+	}
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
+				  OCFS2_LOCK_TYPE_RW, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+				  OCFS2_LOCK_TYPE_META, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
+				  OCFS2_LOCK_TYPE_DATA, inode);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *fe;
+	struct buffer_head *bh = NULL;
+	int status;
+	int sysfile = 0;
+
+	mlog_entry("(0x%p, 0x%p)\n", inode, args);
+
+	status = -EINVAL;
+	if (inode == NULL || inode->i_sb == NULL) {
+		mlog(ML_ERROR, "bad inode\n");
+		goto bail;
+	}
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	if (!args) {
+		mlog(ML_ERROR, "bad inode args\n");
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	/* Read the FE off disk. This is safe because the kernel only
+	 * does one read_inode2 for a new inode, and if it doesn't
+	 * exist yet then nobody can be working on it! */
+	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+		sysfile = 1;
+
+	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
+	    S_ISBLK(le16_to_cpu(fe->i_mode)))
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+
+	status = -EINVAL;
+	if (ocfs2_populate_inode(inode, fe, 0) < 0) {
+		mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
+		     "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+
+	if (sysfile)
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+
+	status = 0;
+
+bail:
+	if (args && bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_sync_blockdev(struct super_block *sb)
+{
+	sync_blockdev(sb->s_bdev);
+}
+
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh)
+{
+	int status = 0;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_truncate_context *tc = NULL;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/* zero allocation, zero truncate :) */
+	if (!fe->i_clusters)
+		goto bail;
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_remove_inode(struct inode *inode,
+			      struct buffer_head *di_bh,
+			      struct inode *orphan_dir_inode,
+			      struct buffer_head *orphan_dir_bh)
+{
+	int status;
+	struct inode *inode_alloc_inode = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    le16_to_cpu(di->i_suballoc_slot));
+	if (!inode_alloc_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode_alloc_inode->i_sem);
+	status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
+	if (status < 0) {
+		up(&inode_alloc_inode->i_sem);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	/* set the inode's dtime */
+	status = ocfs2_journal_access(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+
+	status = ocfs2_journal_dirty(handle, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	ocfs2_remove_from_cache(inode, di_bh);
+
+	status = ocfs2_free_dinode(handle, inode_alloc_inode,
+				   inode_alloc_bh, di);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode_alloc_inode, 1);
+	up(&inode_alloc_inode->i_sem);
+	brelse(inode_alloc_bh);
+bail:
+	iput(inode_alloc_inode);
+
+	return status;
+}
+
+static int ocfs2_wipe_inode(struct inode *inode,
+			    struct buffer_head *di_bh)
+{
+	int status, orphaned_slot;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We've already voted on this so it should be readonly - no
+	 * spinlock needed. */
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       orphaned_slot);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Lock the orphan dir. The lock will be held for the entire
+	 * delete_inode operation. We do this now to avoid races with
+	 * recovery completion on other nodes. */
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to vote for
+	 * an inode delete on us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
+				    orphan_dir_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_unlock_dir:
+	ocfs2_meta_unlock(orphan_dir_inode, 1);
+	up(&orphan_dir_inode->i_sem);
+	brelse(orphan_dir_bh);
+bail:
+	iput(orphan_dir_inode);
+
+	return status;
+}
+
+/* There is a series of simple checks that should be done before a
+ * vote is even considered. Encapsulate those in this function. */
+static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We shouldn't be getting here for the root directory
+	 * inode.. */
+	if (inode == osb->root_inode) {
+		mlog(ML_ERROR, "Skipping delete of root inode.\n");
+		goto bail;
+	}
+
+	/* If we're coming from process_vote we can't go into our own
+	 * voting [hello, deadlock city!], so unfortunately we just
+	 * have to skip deleting this guy. That's OK though because
+	 * the node that's doing the actual deleting should handle it
+	 * anyway. */
+	if (current == osb->vote_task) {
+		mlog(0, "Skipping delete of %lu because we're currently "
+		     "in process_vote\n", inode->i_ino);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	/* OCFS2 *never* deletes system files. This should technically
+	 * never get here as system file inodes should always have a
+	 * positive link count. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
+		     oi->ip_blkno);
+		goto bail_unlock;
+	}
+
+	/* If we have voted "yes" on the wipe of this inode for
+	 * another node, it will be marked here so we can safely skip
+	 * it. Recovery will clean up any inodes we might inadvertently
+	 * skip here. */
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
+		mlog(0, "Skipping delete of %lu because another node "
+		     "has done this for us.\n", inode->i_ino);
+		goto bail_unlock;
+	}
+
+	ret = 1;
+bail_unlock:
+	spin_unlock(&oi->ip_lock);
+bail:
+	return ret;
+}
+
+/* Query the cluster to determine whether we should wipe an inode from
+ * disk or not.
+ *
+ * Requires the inode to have the cluster lock. */
+static int ocfs2_query_inode_wipe(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  int *wipe)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di;
+
+	*wipe = 0;
+
+	/* While we were waiting for the cluster lock in
+	 * ocfs2_delete_inode, another node might have asked to delete
+	 * the inode. Recheck our flags to catch this. */
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
+		     oi->ip_blkno);
+		goto bail;
+	}
+
+	/* Now that we have an up to date inode, we can double check
+	 * the link count. */
+	if (inode->i_nlink) {
+		mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
+		     oi->ip_blkno, inode->i_nlink);
+		goto bail;
+	}
+
+	/* Do some basic inode verification... */
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+		/* for lack of a better error? */
+		status = -EEXIST;
+		mlog(ML_ERROR,
+		     "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
+		     "Disk flags  0x%x, inode flags 0x%x\n",
+		     oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
+		goto bail;
+	}
+
+	/* has someone already deleted us?! baaad... */
+	if (di->i_dtime) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_request_delete_vote(inode);
+	/* -EBUSY means that other nodes are still using the
+	 * inode. We're done here though, so avoid doing anything on
+	 * disk and let them worry about deleting it. */
+	if (status == -EBUSY) {
+		status = 0;
+		mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
+		     "other nodes\n", oi->ip_blkno);
+		goto bail;
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
+		/* Nobody knew which slot this inode was orphaned
+		 * into. This may happen during node death and
+		 * recovery knows how to clean it up so we can safely
+		 * ignore this inode from now on. */
+		mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
+		     oi->ip_blkno);
+	} else {
+		*wipe = 1;
+
+		mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
+		     oi->ip_blkno, oi->ip_orphaned_slot);
+	}
+	spin_unlock(&oi->ip_lock);
+
+bail:
+	return status;
+}
+
+/* Support function for ocfs2_delete_inode. Will help us keep the
+ * inode data in a consistent state for clear_inode. Always truncates
+ * pages, optionally syncing them first. */
+static void ocfs2_cleanup_delete_inode(struct inode *inode,
+				       int sync_data)
+{
+	mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
+	     OCFS2_I(inode)->ip_blkno, sync_data);
+	if (sync_data)
+		write_inode_now(inode, 1);
+	truncate_inode_pages(&inode->i_data, 0);
+}
+
+void ocfs2_delete_inode(struct inode *inode)
+{
+	int wipe, status;
+	sigset_t blocked, oldset;
+	struct buffer_head *di_bh = NULL;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	if (is_bad_inode(inode)) {
+		mlog(0, "Skipping delete of bad inode\n");
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		/* It's probably not necessary to truncate_inode_pages
+		 * here but we do it for safety anyway (it will most
+		 * likely be a no-op anyway) */
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail;
+	}
+
+	/* We want to block signals in delete_inode as the lock and
+	 * messaging paths may return us -ERESTARTSYS. Which would
+	 * cause us to exit early, resulting in inodes being orphaned
+	 * forever. */
+	sigfillset(&blocked);
+	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail;
+	}
+
+	/* Lock down the inode. This gives us an up to date view of
+	 * its metadata (for verification), and allows us to
+	 * serialize delete_inode votes.
+	 *
+	 * Even though we might be doing a truncate, we don't take the
+	 * allocation lock here as it won't be needed - nobody will
+	 * have the file open.
+	 */
+	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
+
+	/* Query the cluster. This will be the final decision made
+	 * before we go ahead and wipe the inode. */
+	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
+	if (!wipe || status < 0) {
+		/* Error and inode busy vote both mean we won't be
+		 * removing the inode, so they take almost the same
+		 * path. */
+		if (status < 0)
+			mlog_errno(status);
+
+		/* Someone in the cluster has voted to not wipe this
+		 * inode, or it was never completely orphaned. Write
+		 * out the pages and exit now. */
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail_unlock_inode;
+	}
+
+	ocfs2_cleanup_delete_inode(inode, 0);
+
+	status = ocfs2_wipe_inode(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	/* Mark the inode as successfully deleted. This is important
+	 * for ocfs2_clear_inode as it will check this flag and skip
+	 * any checkpointing work */
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
+bail_unlock_inode:
+	ocfs2_meta_unlock(inode, 1);
+	brelse(di_bh);
+bail_unblock:
+	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	clear_inode(inode);
+	mlog_exit_void();
+}
+
+void ocfs2_clear_inode(struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	if (!inode)
+		goto bail;
+
+	mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
+	     OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+
+	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+			"Inode=%lu\n", inode->i_ino);
+
+	/* Do these before all the other work so that we don't bounce
+	 * the vote thread while waiting to destroy the locks. */
+	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+
+	/* We very well may get a clear_inode before all of an inode's
+	 * metadata has hit disk. Of course, we can't drop any cluster
+	 * locks until the journal has finished with it. The only
+	 * exceptions here are successfully wiped inodes - their
+	 * metadata can now be considered to be part of the system
+	 * inode from which it came. */
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+		ocfs2_checkpoint_inode(inode);
+
+	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
+			"Clear inode of %"MLFu64", inode has io markers\n",
+			oi->ip_blkno);
+
+	ocfs2_extent_map_drop(inode, 0);
+	ocfs2_extent_map_init(inode);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_meta_lockres);
+	ocfs2_lock_res_free(&oi->ip_data_lockres);
+
+	ocfs2_metadata_cache_purge(inode);
+
+	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+			"Clear inode of %"MLFu64", inode has %u cache items\n",
+			oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Clear inode of %"MLFu64", inode has a bad flag\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
+			"Clear inode of %"MLFu64", inode is locked\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
+			"Clear inode of %"MLFu64", io_sem is locked\n",
+			oi->ip_blkno);
+	up(&oi->ip_io_sem);
+
+	/*
+	 * Note the asymmetry: down_trylock() returns 0 on success,
+	 * while down_write_trylock() returns 1 on success.
+	 */
+	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
+			"Clear inode of %"MLFu64", alloc_sem is locked\n",
+			oi->ip_blkno);
+	up_write(&oi->ip_alloc_sem);
+
+	mlog_bug_on_msg(oi->ip_open_count,
+			"Clear inode of %"MLFu64" has open count %d\n",
+			oi->ip_blkno, oi->ip_open_count);
+	mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
+			"Clear inode of %"MLFu64" has non empty handle list\n",
+			oi->ip_blkno);
+	mlog_bug_on_msg(oi->ip_handle,
+			"Clear inode of %"MLFu64" has non empty handle pointer\n",
+			oi->ip_blkno);
+
+	/* Clear all other flags. */
+	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
+	oi->ip_created_trans = 0;
+	oi->ip_last_trans = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+bail:
+	mlog_exit_void();
+}
+
+/* Called under inode_lock, with no more references on the
+ * struct inode, so it's safe here to check the flags field
+ * and to manipulate i_nlink without any other locks. */
+void ocfs2_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
+	     oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+
+	/* Testing ip_orphaned_slot here wouldn't work because we may
+	 * not have gotten a delete_inode vote from any other nodes
+	 * yet. */
+	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
+		inode->i_nlink = 0;
+	}
+
+	generic_drop_inode(inode);
+
+	mlog_exit_void();
+}
+
+/*
+ * TODO: this should probably be merged into ocfs2_get_block
+ *
+ * However, you now need to pay attention to the cont_prepare_write()
+ * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
+ * expects never to extend).
+ */
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = OCFS2_BH_CACHED;
+
+#if 0
+	/* only turn this on if we know we can deal with read_block
+	 * returning nothing */
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+#endif
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		return NULL;
+	}
+
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+					     &p_blkno, NULL);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
+				  readflags, inode);
+	if (tmperr < 0)
+		goto fail;
+
+	tmperr = 0;
+
+	*err = 0;
+	return bh;
+
+fail:
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+	*err = -EIO;
+	return NULL;
+}
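+
+/* Minimal sketch (hypothetical caller): read logical block 0 of an
+ * inode with no readahead; 'err' is only meaningful on NULL return. */
+static struct buffer_head *example_read_first_block(struct inode *inode)
+{
+	int err = 0;
+
+	return ocfs2_bread(inode, 0, &err, 0);
+}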
+
+/*
+ * This is called from our getattr.
+ */
+int ocfs2_inode_revalidate(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int status = 0;
+
+	mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
+		   inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
+
+	if (!inode) {
+		mlog(0, "eep, no inode!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		mlog(0, "inode deleted!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Let ocfs2_meta_lock do the work of updating our struct
+	 * inode for us. */
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_meta_unlock(inode, 0);
+bail:
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * Updates a disk inode from a struct inode.
+ * Only takes ip_lock.
+ */
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh)
+{
+	int status;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+
+	mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	fe->i_size = cpu_to_le64(i_size_read(inode));
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0;
+leave:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Updates a struct inode from a disk inode.
+ * Does no I/O; only takes ip_lock.
+ */
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	i_size_write(inode, le64_to_cpu(fe->i_size));
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_blksize = (u32) osb->s_clustersize;
+	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}

+ 145 - 0
fs/ocfs2/inode.h

@@ -0,0 +1,145 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_INODE_H
+#define OCFS2_INODE_H
+
+/* OCFS2 Inode Private Data */
+struct ocfs2_inode_info
+{
+	u64			ip_blkno;
+
+	struct ocfs2_lock_res		ip_rw_lockres;
+	struct ocfs2_lock_res		ip_meta_lockres;
+	struct ocfs2_lock_res		ip_data_lockres;
+
+	/* protects allocation changes on this inode. */
+	struct rw_semaphore		ip_alloc_sem;
+
+	/* These fields are protected by ip_lock */
+	spinlock_t			ip_lock;
+	u32				ip_open_count;
+	u32				ip_clusters;
+	struct ocfs2_extent_map		ip_map;
+	struct list_head		ip_io_markers;
+	int				ip_orphaned_slot;
+
+	struct semaphore		ip_io_sem;
+
+	/* Used by the journalling code to attach an inode to a
+	 * handle.  These are protected by ip_io_sem in order to lock
+	 * out other I/O to the inode until we either commit or
+	 * abort. */
+	struct list_head		ip_handle_list;
+	struct ocfs2_journal_handle	*ip_handle;
+
+	u32				ip_flags; /* see below */
+
+	/* protected by recovery_lock. */
+	struct inode			*ip_next_orphan;
+
+	u32				ip_dir_start_lookup;
+
+	/* next two are protected by trans_inc_lock */
+	/* which transaction were we created on? Zero if none. */
+	unsigned long			ip_created_trans;
+	/* last transaction we were a part of. */
+	unsigned long			ip_last_trans;
+
+	struct ocfs2_caching_info	ip_metadata_cache;
+
+	struct inode			vfs_inode;
+};
+
+/*
+ * Flags for the ip_flags field
+ */
+/* System file inodes  */
+#define OCFS2_INODE_SYSTEM_FILE		0x00000001
+#define OCFS2_INODE_JOURNAL		0x00000002
+#define OCFS2_INODE_BITMAP		0x00000004
+/* This inode has been wiped from disk */
+#define OCFS2_INODE_DELETED		0x00000008
+/* Another node is deleting, so our delete is a nop */
+#define OCFS2_INODE_SKIP_DELETE		0x00000010
+/* Has the inode been orphaned on another node?
+ *
+ * This hints to ocfs2_drop_inode that it should clear i_nlink before
+ * continuing.
+ *
+ * We *only* set this on unlink vote from another node. If the inode
+ * was locally orphaned, then we're sure of the state and don't need
+ * to twiddle i_nlink later - it's either zero or not depending on
+ * whether our unlink succeeded. Otherwise we got this from a node
+ * whose intention was to orphan the inode; however, it may have
+ * crashed, failed, etc., so we let ocfs2_drop_inode zero the value and
+ * rely on ocfs2_delete_inode to sort things out under the proper
+ * cluster locks.
+ */
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
+/* Does someone have the file open O_DIRECT */
+#define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Indicates that the metadata cache should be used as an array. */
+#define OCFS2_INODE_CACHE_INLINE	0x00000080
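+
+/* Hypothetical helper, mirroring the INODE_JOURNAL macros below, to
+ * make the MAYBE_ORPHANED contract above concrete: ocfs2_drop_inode
+ * clears i_nlink only when this test is true. */
+#define INODE_MAYBE_ORPHANED(i) \
+	(OCFS2_I(i)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)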
+
+static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct ocfs2_inode_info, vfs_inode);
+}
+
+#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
+#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
+
+extern kmem_cache_t *ocfs2_inode_cache;
+
+extern struct address_space_operations ocfs2_aops;
+
+struct buffer_head *ocfs2_bread(struct inode *inode, int block,
+				int *err, int reada);
+void ocfs2_clear_inode(struct inode *inode);
+void ocfs2_delete_inode(struct inode *inode);
+void ocfs2_drop_inode(struct inode *inode);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote);
+int ocfs2_inode_init_private(struct inode *inode);
+int ocfs2_inode_revalidate(struct dentry *dentry);
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			 int create_ino);
+void ocfs2_read_inode(struct inode *inode);
+void ocfs2_read_inode2(struct inode *inode, void *opaque);
+ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
+			size_t size, loff_t *offp);
+void ocfs2_sync_blockdev(struct super_block *sb);
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe);
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh);
+int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
+int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+
+#endif /* OCFS2_INODE_H */

+ 1652 - 0
fs/ocfs2/journal.c

@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.c
+ *
+ * Defines functions of the journalling API
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+
+#define MLOG_MASK_PREFIX ML_JOURNAL
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "vote.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+
+static int ocfs2_force_read_journal(struct inode *inode);
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num);
+static int __ocfs2_recovery_thread(void *arg);
+static int ocfs2_commit_cache(struct ocfs2_super *osb);
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle);
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty);
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num);
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot);
+static int ocfs2_commit_thread(void *arg);
+
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
+{
+	int status = 0;
+	unsigned int flushed;
+	unsigned long old_id;
+	struct ocfs2_journal *journal = NULL;
+
+	mlog_entry_void();
+
+	journal = osb->journal;
+
+	/* Flush all pending commits and checkpoint the journal. */
+	down_write(&journal->j_trans_barrier);
+
+	if (atomic_read(&journal->j_num_trans) == 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog(0, "No transactions for me to flush!\n");
+		goto finally;
+	}
+
+	journal_lock_updates(journal->j_journal);
+	status = journal_flush(journal->j_journal);
+	journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog_errno(status);
+		goto finally;
+	}
+
+	old_id = ocfs2_inc_trans_id(journal);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	atomic_set(&journal->j_num_trans, 0);
+	up_write(&journal->j_trans_barrier);
+
+	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
+	     journal->j_trans_id, flushed);
+
+	ocfs2_kick_vote_thread(osb);
+	wake_up(&journal->j_checkpointed);
+finally:
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal_handle *retval = NULL;
+
+	retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
+	if (!retval) {
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		return NULL;
+	}
+
+	retval->max_buffs = 0;
+	retval->num_locks = 0;
+	retval->k_handle = NULL;
+
+	INIT_LIST_HEAD(&retval->locks);
+	INIT_LIST_HEAD(&retval->inode_list);
+	retval->journal = osb->journal;
+
+	return retval;
+}
+
+/* pass it NULL and it will allocate a new handle object for you.  If
+ * you pass it a handle however, it may still return error, in which
+ * case it has freed the passed handle for you. */
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs)
+{
+	int ret;
+	journal_t *journal = osb->journal->j_journal;
+
+	mlog_entry("(max_buffs = %d)\n", max_buffs);
+
+	if (!osb || !osb->journal->j_journal)
+		BUG();
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto done_free;
+	}
+
+	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+	BUG_ON(max_buffs <= 0);
+
+	/* JBD might support this, but our journalling code doesn't yet. */
+	if (journal_current_handle()) {
+		mlog(ML_ERROR, "Recursive transaction attempted!\n");
+		BUG();
+	}
+
+	if (!handle)
+		handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		goto done_free;
+	}
+
+	handle->max_buffs = max_buffs;
+
+	down_read(&osb->journal->j_trans_barrier);
+
+	/* actually start the transaction now */
+	handle->k_handle = journal_start(journal, max_buffs);
+	if (IS_ERR(handle->k_handle)) {
+		up_read(&osb->journal->j_trans_barrier);
+
+		ret = PTR_ERR(handle->k_handle);
+		handle->k_handle = NULL;
+		mlog_errno(ret);
+
+		if (is_journal_aborted(journal)) {
+			ocfs2_abort(osb->sb, "Detected aborted journal");
+			ret = -EROFS;
+		}
+		goto done_free;
+	}
+
+	atomic_inc(&(osb->journal->j_num_trans));
+	handle->flags |= OCFS2_HANDLE_STARTED;
+
+	mlog_exit_ptr(handle);
+	return handle;
+
+done_free:
+	if (handle)
+		ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
+
+	mlog_exit(ret);
+	return ERR_PTR(ret);
+}
+
+void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+			    struct inode *inode)
+{
+	BUG_ON(!handle);
+	BUG_ON(!inode);
+
+	atomic_inc(&inode->i_count);
+
+	/* we're obviously changing it... */
+	down(&inode->i_sem);
+
+	/* sanity check */
+	BUG_ON(OCFS2_I(inode)->ip_handle);
+	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
+
+	OCFS2_I(inode)->ip_handle = handle;
+	list_del(&(OCFS2_I(inode)->ip_handle_list));
+	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+}
+
+static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct inode *inode;
+	struct ocfs2_inode_info *oi;
+
+	list_for_each_safe(p, n, &handle->inode_list) {
+		oi = list_entry(p, struct ocfs2_inode_info,
+				ip_handle_list);
+		inode = &oi->vfs_inode;
+
+		OCFS2_I(inode)->ip_handle = NULL;
+		list_del_init(&OCFS2_I(inode)->ip_handle_list);
+
+		up(&inode->i_sem);
+		iput(inode);
+	}
+}
+
+/* This is trivial so we do it out of the main commit
+ * paths. Beware, it can be called from start_trans too! */
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
+{
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_unlock_inodes(handle);
+	/* You are allowed to add journal locks before the transaction
+	 * has started. */
+	ocfs2_handle_cleanup_locks(handle->journal, handle);
+
+	kfree(handle);
+
+	mlog_exit_void();
+}
+
+void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
+{
+	handle_t *jbd_handle;
+	int retval;
+	struct ocfs2_journal *journal = handle->journal;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
+		ocfs2_commit_unstarted_handle(handle);
+		mlog_exit_void();
+		return;
+	}
+
+	/* release inode semaphores we took during this transaction */
+	ocfs2_handle_unlock_inodes(handle);
+
+	/* ocfs2_extend_trans may have had to call journal_restart
+	 * which will always commit the transaction, but may return
+	 * error for any number of reasons. If this is the case, we
+	 * clear k_handle as it's not valid any more. */
+	if (handle->k_handle) {
+		jbd_handle = handle->k_handle;
+
+		if (handle->flags & OCFS2_HANDLE_SYNC)
+			jbd_handle->h_sync = 1;
+		else
+			jbd_handle->h_sync = 0;
+
+		/* actually stop the transaction. if we've set h_sync,
+		 * it'll have been committed when we return */
+		retval = journal_stop(jbd_handle);
+		if (retval < 0) {
+			mlog_errno(retval);
+			mlog(ML_ERROR, "Could not commit transaction\n");
+			BUG();
+		}
+
+		handle->k_handle = NULL; /* it's been freed in journal_stop */
+	}
+
+	ocfs2_handle_cleanup_locks(journal, handle);
+
+	up_read(&journal->j_trans_barrier);
+
+	kfree(handle);
+	mlog_exit_void();
+}
+
+/*
+ * 'nblocks' is what you want to add to the current
+ * transaction. extend_trans will either extend the current handle by
+ * nblocks, or commit it and start a new one with nblocks credits.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks.
+ *
+ * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
+ */
+int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+		       int nblocks)
+{
+	int status;
+
+	BUG_ON(!handle);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+	BUG_ON(!nblocks);
+
+	mlog_entry_void();
+
+	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+
+	status = journal_extend(handle->k_handle, nblocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (status > 0) {
+		mlog(0, "journal_extend failed, trying journal_restart\n");
+		status = journal_restart(handle->k_handle, nblocks);
+		if (status < 0) {
+			handle->k_handle = NULL;
+			mlog_errno(status);
+			goto bail;
+		}
+		handle->max_buffs = nblocks;
+	} else
+		handle->max_buffs += nblocks;
+
+	status = 0;
+bail:
+
+	mlog_exit(status);
+	return status;
+}
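+
+/* Hypothetical sketch of the pattern described above: cluster locks
+ * were taken before ocfs2_start_trans(), so asking for more credits
+ * mid-transaction is safe even when JBD has to restart the handle. */
+static int example_grow_for_alloc(struct ocfs2_journal_handle *handle)
+{
+	int status;
+
+	/* say we discover we need two more blocks of journal credits */
+	status = ocfs2_extend_trans(handle, 2);
+	if (status < 0)
+		mlog_errno(status);
+	return status;
+}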
+
+int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *bh,
+			 int type)
+{
+	int status;
+
+	BUG_ON(!inode);
+	BUG_ON(!handle);
+	BUG_ON(!bh);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
+		   (unsigned long long)bh->b_blocknr, type,
+		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
+		   "OCFS2_JOURNAL_ACCESS_CREATE" :
+		   "OCFS2_JOURNAL_ACCESS_WRITE",
+		   bh->b_size);
+
+	/* we can safely remove this assertion after testing. */
+	if (!buffer_uptodate(bh)) {
+		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
+		mlog(ML_ERROR, "b_blocknr=%llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		BUG();
+	}
+
+	/* Set the current transaction information on the inode so
+	 * that the locking code knows whether it can drop its locks
+	 * on this inode or not. We're protected from the commit
+	 * thread updating the current transaction id until
+	 * ocfs2_commit_trans() because ocfs2_start_trans() took
+	 * j_trans_barrier for us. */
+	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+	switch (type) {
+	case OCFS2_JOURNAL_ACCESS_CREATE:
+	case OCFS2_JOURNAL_ACCESS_WRITE:
+		status = journal_get_write_access(handle->k_handle, bh);
+		break;
+
+	case OCFS2_JOURNAL_ACCESS_UNDO:
+		status = journal_get_undo_access(handle->k_handle, bh);
+		break;
+
+	default:
+		status = -EINVAL;
+		mlog(ML_ERROR, "Uknown access type!\n");
+	}
+	up(&OCFS2_I(inode)->ip_io_sem);
+
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
+		     status, type);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+			struct buffer_head *bh)
+{
+	int status;
+
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("(bh->b_blocknr=%llu)\n",
+		   (unsigned long long)bh->b_blocknr);
+
+	status = journal_dirty_metadata(handle->k_handle, bh);
+	if (status < 0)
+		mlog(ML_ERROR, "Could not dirty metadata buffer. "
+		     "(bh->b_blocknr=%llu)\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	mlog_exit(status);
+	return status;
+}
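+
+/* A minimal end-to-end sketch (hypothetical caller) of the handle
+ * lifecycle documented above ocfs2_start_trans(): pass NULL to have a
+ * handle allocated; on error the handle has already been freed, so
+ * only the ERR_PTR needs handling. */
+static int example_dirty_one_block(struct ocfs2_super *osb,
+				   struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_journal_handle *handle;
+	int status;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (!status)
+		status = ocfs2_journal_dirty(handle, bh);
+
+	ocfs2_commit_trans(handle);
+	return status;
+}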
+
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		mlog_errno(err);
+	/* TODO: When we can handle it, abort the handle and go RO on
+	 * error here. */
+
+	return err;
+}
+
+/* We always assume you're adding a metadata lock at level 'ex' */
+int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+			  struct inode *inode)
+{
+	int status;
+	struct ocfs2_journal_lock *lock;
+
+	BUG_ON(!inode);
+
+	lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
+	if (!lock) {
+		status = -ENOMEM;
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	if (!igrab(inode))
+		BUG();
+	lock->jl_inode = inode;
+
+	list_add_tail(&(lock->jl_lock_list), &(handle->locks));
+	handle->num_locks++;
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct ocfs2_journal_lock *lock;
+	struct inode *inode;
+
+	list_for_each_safe(p, n, &(handle->locks)) {
+		lock = list_entry(p, struct ocfs2_journal_lock,
+				  jl_lock_list);
+		list_del(&lock->jl_lock_list);
+		handle->num_locks--;
+
+		inode = lock->jl_inode;
+		ocfs2_meta_unlock(inode, 1);
+		if (atomic_read(&inode->i_count) == 1)
+			mlog(ML_ERROR,
+			     "Inode %"MLFu64", I'm doing a last iput for!",
+			     OCFS2_I(inode)->ip_blkno);
+		iput(inode);
+		kmem_cache_free(ocfs2_lock_cache, lock);
+	}
+}
+
+#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * 5)
+
+void ocfs2_set_journal_params(struct ocfs2_super *osb)
+{
+	journal_t *journal = osb->journal->j_journal;
+
+	spin_lock(&journal->j_state_lock);
+	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+		journal->j_flags |= JFS_BARRIER;
+	else
+		journal->j_flags &= ~JFS_BARRIER;
+	spin_unlock(&journal->j_state_lock);
+}
+
+int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+{
+	int status = -1;
+	struct inode *inode = NULL; /* the journal inode */
+	journal_t *j_journal = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_super *osb;
+	int meta_lock = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	/* already have the inode for our journal */
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    osb->slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto done;
+	}
+
+	SET_INODE_JOURNAL(inode);
+	OCFS2_I(inode)->ip_open_count++;
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not get lock on journal!\n");
+		goto done;
+	}
+
+	meta_lock = 1;
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
+		mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
+		     inode->i_size);
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "inode->i_size = %lld\n", inode->i_size);
+	mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
+	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+
+	/* call the kernel's journal init function now */
+	j_journal = journal_init_inode(inode);
+	if (j_journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+
+	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
+		  OCFS2_JOURNAL_DIRTY_FL);
+
+	journal->j_journal = j_journal;
+	journal->j_inode = inode;
+	journal->j_bh = bh;
+
+	ocfs2_set_journal_params(osb);
+
+	journal->j_state = OCFS2_JOURNAL_LOADED;
+
+	status = 0;
+done:
+	if (status < 0) {
+		if (meta_lock)
+			ocfs2_meta_unlock(inode, 1);
+		if (bh != NULL)
+			brelse(bh);
+		if (inode) {
+			OCFS2_I(inode)->ip_open_count--;
+			iput(inode);
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty)
+{
+	int status;
+	unsigned int flags;
+	struct ocfs2_journal *journal = osb->journal;
+	struct buffer_head *bh = journal->j_bh;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		/* This is called from startup/shutdown which will
+		 * handle the errors in a specific manner, so no need
+		 * to call ocfs2_error() here. */
+		mlog(ML_ERROR, "Journal dinode %"MLFu64"  has invalid "
+		     "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
+		status = -EIO;
+		goto out;
+	}
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	if (dirty)
+		flags |= OCFS2_JOURNAL_DIRTY_FL;
+	else
+		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, journal->j_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * If the journal has been kmalloc'd it needs to be freed after this
+ * call.
+ */
+void ocfs2_journal_shutdown(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = NULL;
+	int status = 0;
+	struct inode *inode = NULL;
+	int num_running_trans = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(!osb);
+
+	journal = osb->journal;
+	if (!journal)
+		goto done;
+
+	inode = journal->j_inode;
+
+	if (journal->j_state != OCFS2_JOURNAL_LOADED)
+		goto done;
+
+	/* need to inc inode use count as journal_destroy will iput. */
+	if (!igrab(inode))
+		BUG();
+
+	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+	if (num_running_trans > 0)
+		mlog(0, "Shutting down journal: must wait on %d "
+		     "running transactions!\n",
+		     num_running_trans);
+
+	/* Do a commit_cache here. It will flush our journal, *and*
+	 * release any locks that are still held.
+	 * set the SHUTDOWN flag and release the trans lock.
+	 * the commit thread will take the trans lock for us below. */
+	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
+
+	/* The OCFS2_JOURNAL_IN_SHUTDOWN flag signals commit_cache not to
+	 * drop the trans_lock (which we want to hold until we
+	 * completely destroy the journal). */
+	if (osb->commit_task) {
+		/* Wait for the commit thread */
+		mlog(0, "Waiting for ocfs2commit to exit....\n");
+		kthread_stop(osb->commit_task);
+		osb->commit_task = NULL;
+	}
+
+	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+
+	status = ocfs2_journal_toggle_dirty(osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Shutdown the kernel journal system */
+	journal_destroy(journal->j_journal);
+
+	OCFS2_I(inode)->ip_open_count--;
+
+	/* unlock our journal */
+	ocfs2_meta_unlock(inode, 1);
+
+	brelse(journal->j_bh);
+	journal->j_bh = NULL;
+
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+done:
+	if (inode)
+		iput(inode);
+	mlog_exit_void();
+}
+
+static void ocfs2_clear_journal_error(struct super_block *sb,
+				      journal_t *journal,
+				      int slot)
+{
+	int olderr;
+
+	olderr = journal_errno(journal);
+	if (olderr) {
+		mlog(ML_ERROR, "File system error %d recorded in "
+		     "journal %u.\n", olderr, slot);
+		mlog(ML_ERROR, "File system on device %s needs checking.\n",
+		     sb->s_id);
+
+		journal_ack_err(journal);
+		journal_clear_err(journal);
+	}
+}
+
+int ocfs2_journal_load(struct ocfs2_journal *journal)
+{
+	int status = 0;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	status = journal_load(journal->j_journal);
+	if (status < 0) {
+		mlog(ML_ERROR, "Failed to load journal!\n");
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+
+	status = ocfs2_journal_toggle_dirty(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Launch the commit thread */
+	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
+				       osb->osb_id);
+	if (IS_ERR(osb->commit_task)) {
+		status = PTR_ERR(osb->commit_task);
+		osb->commit_task = NULL;
+		mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
+		     status);
+		goto done;
+	}
+
+done:
+	mlog_exit(status);
+	return status;
+}
+
+
+/* 'full' flag tells us whether we clear out all blocks or if we just
+ * mark the journal clean */
+int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
+{
+	int status;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	status = journal_wipe(journal->j_journal, full);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * JBD might read a cached version of another node's journal file. We
+ * don't want this, as this file changes often and we get no
+ * notification of those changes. The only way to be sure we've got
+ * the most up-to-date version of those blocks is to force-read them
+ * off disk. Just searching through the buffer cache won't work, as
+ * there may be pages backing this file which are still marked up to
+ * date. We know things can't change on this file underneath us, as
+ * we hold the lock by now :)
+ */
+static int ocfs2_force_read_journal(struct inode *inode)
+{
+	int status = 0;
+	int i, p_blocks;
+	u64 v_blkno, p_blkno;
+#define CONCURRENT_JOURNAL_FILL 32
+	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
+
+	mlog_entry_void();
+
+	BUG_ON(inode->i_blocks !=
+		     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
+
+	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+
+	mlog(0, "Force reading %lu blocks\n",
+	     (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
+
+	v_blkno = 0;
+	while (v_blkno <
+	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
+
+		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
+						     1, &p_blkno,
+						     &p_blocks);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (p_blocks > CONCURRENT_JOURNAL_FILL)
+			p_blocks = CONCURRENT_JOURNAL_FILL;
+
+		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					   p_blkno, p_blocks, bhs, 0,
+					   inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for (i = 0; i < p_blocks; i++) {
+			brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+
+		v_blkno += p_blocks;
+	}
+
+bail:
+	for (i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
+		if (bhs[i])
+			brelse(bhs[i]);
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_la_recovery_item {
+	struct list_head	lri_list;
+	int			lri_slot;
+	struct ocfs2_dinode	*lri_la_dinode;
+	struct ocfs2_dinode	*lri_tl_dinode;
+};
+
+/* Does the second half of the recovery process. By this point, the
+ * node is marked clean and can actually be considered recovered,
+ * hence it's no longer in the recovery map, but there's still some
+ * cleanup we can do which shouldn't happen within the recovery thread
+ * as locking in that context becomes very difficult if we are to take
+ * recovering nodes into account.
+ *
+ * NOTE: This function can and will sleep on recovery of other nodes
+ * during cluster locking, just like any other ocfs2 process.
+ */
+void ocfs2_complete_recovery(void *data)
+{
+	int ret;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_journal *journal = osb->journal;
+	struct ocfs2_dinode *la_dinode, *tl_dinode;
+	struct ocfs2_la_recovery_item *item;
+	struct list_head *p, *n;
+	LIST_HEAD(tmp_la_list);
+
+	mlog_entry_void();
+
+	mlog(0, "completing recovery from keventd\n");
+
+	spin_lock(&journal->j_lock);
+	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
+	spin_unlock(&journal->j_lock);
+
+	list_for_each_safe(p, n, &tmp_la_list) {
+		item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
+		list_del_init(&item->lri_list);
+
+		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+
+		la_dinode = item->lri_la_dinode;
+		if (la_dinode) {
+			mlog(0, "Clean up local alloc %"MLFu64"\n",
+			     la_dinode->i_blkno);
+
+			ret = ocfs2_complete_local_alloc_recovery(osb,
+								  la_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(la_dinode);
+		}
+
+		tl_dinode = item->lri_tl_dinode;
+		if (tl_dinode) {
+			mlog(0, "Clean up truncate log %"MLFu64"\n",
+			     tl_dinode->i_blkno);
+
+			ret = ocfs2_complete_truncate_log_recovery(osb,
+								   tl_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(tl_dinode);
+		}
+
+		ret = ocfs2_recover_orphans(osb, item->lri_slot);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		kfree(item);
+	}
+
+	mlog(0, "Recovery completion\n");
+	mlog_exit_void();
+}
+
+/* NOTE: This function always eats your references to la_dinode and
+ * tl_dinode, either manually on error, or by passing them to
+ * ocfs2_complete_recovery */
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode)
+{
+	struct ocfs2_la_recovery_item *item;
+
+	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
+	if (!item) {
+		/* Though we wish to avoid it, we are in fact safe in
+		 * skipping local alloc cleanup as fsck.ocfs2 is more
+		 * than capable of reclaiming unused space. */
+		/* kfree() handles NULL pointers for us */
+		kfree(la_dinode);
+		kfree(tl_dinode);
+
+		mlog_errno(-ENOMEM);
+		return;
+	}
+
+	INIT_LIST_HEAD(&item->lri_list);
+	item->lri_la_dinode = la_dinode;
+	item->lri_slot = slot_num;
+	item->lri_tl_dinode = tl_dinode;
+
+	spin_lock(&journal->j_lock);
+	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
+	queue_work(ocfs2_wq, &journal->j_recovery_work);
+	spin_unlock(&journal->j_lock);
+}
+
+/* Called by the mount code to queue the last part of recovery for
+ * its own slot. */
+void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = osb->journal;
+
+	if (osb->dirty) {
+		/* No need to queue up our truncate_log as regular
+		 * cleanup will catch that. */
+		ocfs2_queue_recovery_completion(journal,
+						osb->slot_num,
+						osb->local_alloc_copy,
+						NULL);
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+
+		osb->local_alloc_copy = NULL;
+		osb->dirty = 0;
+	}
+}
+
+static int __ocfs2_recovery_thread(void *arg)
+{
+	int status, node_num;
+	struct ocfs2_super *osb = arg;
+
+	mlog_entry_void();
+
+	status = ocfs2_wait_on_mount(osb);
+	if (status < 0) {
+		goto bail;
+	}
+
+restart:
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		node_num = ocfs2_node_map_first_set_bit(osb,
+							&osb->recovery_map);
+		if (node_num == O2NM_INVALID_NODE_NUM) {
+			mlog(0, "Out of nodes to recover.\n");
+			break;
+		}
+
+		status = ocfs2_recover_node(osb, node_num);
+		if (status < 0) {
+			mlog(ML_ERROR,
+			     "Error %d recovering node %d on device (%u,%u)!\n",
+			     status, node_num,
+			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+			mlog(ML_ERROR, "Volume requires unmount.\n");
+			continue;
+		}
+
+		ocfs2_recovery_map_clear(osb, node_num);
+	}
+	ocfs2_super_unlock(osb, 1);
+
+	/* We always run recovery on our own orphan dir - the dead
+	 * node(s) may have voted "no" on an inode delete earlier. A
+	 * revote is therefore required. */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL);
+
+bail:
+	down(&osb->recovery_lock);
+	if (!status &&
+	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		up(&osb->recovery_lock);
+		goto restart;
+	}
+
+	osb->recovery_thread_task = NULL;
+	mb(); /* sync with ocfs2_recovery_thread_running */
+	wake_up(&osb->recovery_event);
+
+	up(&osb->recovery_lock);
+
+	mlog_exit(status);
+	/* no one is calling kthread_stop() for us so the kthread() api
+	 * requires that we call do_exit().  And it isn't exported, but
+	 * complete_and_exit() seems to be a minimal wrapper around it. */
+	complete_and_exit(NULL, status);
+	return status;
+}
+
+void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
+{
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	down(&osb->recovery_lock);
+	if (osb->disable_recovery)
+		goto out;
+
+	/* People waiting on recovery will wait on
+	 * the recovery map to empty. */
+	if (!ocfs2_recovery_map_set(osb, node_num))
+		mlog(0, "node %d already in recovery.\n", node_num);
+
+	mlog(0, "starting recovery thread...\n");
+
+	if (osb->recovery_thread_task)
+		goto out;
+
+	osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
+						 "ocfs2rec-%d", osb->osb_id);
+	if (IS_ERR(osb->recovery_thread_task)) {
+		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
+		osb->recovery_thread_task = NULL;
+	}
+
+out:
+	up(&osb->recovery_lock);
+	wake_up(&osb->recovery_event);
+
+	mlog_exit_void();
+}
+
+/* Does the actual journal replay and marks the journal inode as
+ * clean. Will only replay if the journal inode is marked dirty. */
+static int ocfs2_replay_journal(struct ocfs2_super *osb,
+				int node_num,
+				int slot_num)
+{
+	int status;
+	int got_lock = 0;
+	unsigned int flags;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *fe;
+	journal_t *journal = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		status = -EACCES;
+		iput(inode);
+		inode = NULL;
+		mlog_errno(status);
+		goto done;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
+				      OCFS2_META_LOCK_RECOVERY);
+	if (status < 0) {
+		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not lock journal!\n");
+		goto done;
+	}
+	got_lock = 1;
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+
+	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
+		mlog(0, "No recovery required for node %d\n", node_num);
+		goto done;
+	}
+
+	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
+	     node_num, slot_num,
+	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+
+	status = ocfs2_force_read_journal(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	mlog(0, "calling journal_init_inode\n");
+	journal = journal_init_inode(inode);
+	if (journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EIO;
+		goto done;
+	}
+
+	status = journal_load(journal);
+	if (status < 0) {
+		mlog_errno(status);
+		if (!igrab(inode))
+			BUG();
+		journal_destroy(journal);
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
+
+	/* wipe the journal */
+	mlog(0, "flushing the journal.\n");
+	journal_lock_updates(journal);
+	status = journal_flush(journal);
+	journal_unlock_updates(journal);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will mark the node clean */
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	if (!igrab(inode))
+		BUG();
+
+	journal_destroy(journal);
+
+done:
+	/* drop the lock on this nodes journal */
+	if (got_lock)
+		ocfs2_meta_unlock(inode, 1);
+
+	if (inode)
+		iput(inode);
+
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Do the most important parts of node recovery:
+ *  - Replay its journal
+ *  - Stamp a clean local allocator file
+ *  - Stamp a clean truncate log
+ *  - Mark the node clean
+ *
+ * If this function completes without error, a node in OCFS2 can be
+ * said to have been safely recovered. As a result, failure during the
+ * second part of a node's recovery process (local alloc recovery) is
+ * far less concerning.
+ */
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num)
+{
+	int status = 0;
+	int slot_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+	struct ocfs2_dinode *la_copy = NULL;
+	struct ocfs2_dinode *tl_copy = NULL;
+
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	mlog(0, "checking node %d\n", node_num);
+
+	/* Should not ever be called to recover ourselves -- in that
+	 * case we should've called ocfs2_journal_load instead. */
+	BUG_ON(osb->node_num == node_num);
+
+	slot_num = ocfs2_node_num_to_slot(si, node_num);
+	if (slot_num == OCFS2_INVALID_SLOT) {
+		status = 0;
+		mlog(0, "no slot for this node, so no recovery required.\n");
+		goto done;
+	}
+
+	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+	status = ocfs2_replay_journal(osb, node_num, slot_num);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Stamp a clean local alloc file AFTER recovering the journal... */
+	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* An error from begin_truncate_log_recovery is not
+	 * serious enough to warrant halting the rest of
+	 * recovery. */
+	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Likewise, this would be a strange but ultimately not so
+	 * harmful place to get an error... */
+	ocfs2_clear_slot(si, slot_num);
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will kfree the memory pointed to by la_copy and tl_copy */
+	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
+					tl_copy);
+
+	status = 0;
+done:
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Test node liveness by trylocking its journal. If we get the lock,
+ * we drop it here. Return 0 if we got the lock, -EAGAIN if the node
+ * is still alive (we couldn't get the lock) and < 0 on error. */
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num)
+{
+	int status, flags;
+	struct inode *inode = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		mlog(ML_ERROR, "access error\n");
+		status = -EACCES;
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
+	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_meta_unlock(inode, 1);
+bail:
+	if (inode)
+		iput(inode);
+
+	return status;
+}
+
+/* Call this underneath ocfs2_super_lock. It also assumes that the
+ * slot info struct has been updated from disk. */
+int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
+{
+	int status, i, node_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	/* This is called with the super block cluster lock, so we
+	 * know that the slot map can't change underneath us. */
+
+	spin_lock(&si->si_lock);
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (i == osb->slot_num)
+			continue;
+		if (ocfs2_is_empty_slot(si, i))
+			continue;
+
+		node_num = si->si_global_node_nums[i];
+		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+			continue;
+		spin_unlock(&si->si_lock);
+
+		/* Ok, we have a slot occupied by another node which
+		 * is not in the recovery map. We trylock its journal
+		 * file here to test whether it's still alive. */
+		status = ocfs2_trylock_journal(osb, i);
+		if (!status) {
+			/* Since we're called from mount, we know that
+			 * the recovery thread can't race us on
+			 * setting / checking the recovery bits. */
+			ocfs2_recovery_thread(osb, node_num);
+		} else if ((status < 0) && (status != -EAGAIN)) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		spin_lock(&si->si_lock);
+	}
+	spin_unlock(&si->si_lock);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int status = 0;
+	int have_disk_lock = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct inode *orphan_dir_inode = NULL;
+	unsigned long offset, blk, local;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       slot);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto out;
+	}
+
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+		mlog_errno(status);
+		goto out;
+	}
+	have_disk_lock = 1;
+
+	offset = 0;
+	iter = NULL;
+	while (offset < i_size_read(orphan_dir_inode)) {
+		blk = offset >> sb->s_blocksize_bits;
+
+		bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
+		if (!bh)
+			status = -EINVAL;
+		if (status < 0) {
+			up(&orphan_dir_inode->i_sem);
+			if (bh)
+				brelse(bh);
+			mlog_errno(status);
+			goto out;
+		}
+
+		local = 0;
+		while (offset < i_size_read(orphan_dir_inode)
+		      && local < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + local);
+
+			if (!ocfs2_check_dir_entry(orphan_dir_inode,
+						  de, bh, local)) {
+				up(&orphan_dir_inode->i_sem);
+				status = -EINVAL;
+				mlog_errno(status);
+				brelse(bh);
+				goto out;
+			}
+
+			local += le16_to_cpu(de->rec_len);
+			offset += le16_to_cpu(de->rec_len);
+
+			/* I guess we silently fail on no inode? */
+			if (!le64_to_cpu(de->inode))
+				continue;
+			if (de->file_type > OCFS2_FT_MAX) {
+				mlog(ML_ERROR,
+				     "block %llu contains invalid de: "
+				     "inode = %"MLFu64", rec_len = %u, "
+				     "name_len = %u, file_type = %u, "
+				     "name='%.*s'\n",
+				     (unsigned long long)bh->b_blocknr,
+				     le64_to_cpu(de->inode),
+				     le16_to_cpu(de->rec_len),
+				     de->name_len,
+				     de->file_type,
+				     de->name_len,
+				     de->name);
+				continue;
+			}
+			if (de->name_len == 1 && !strncmp(".", de->name, 1))
+				continue;
+			if (de->name_len == 2 && !strncmp("..", de->name, 2))
+				continue;
+
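+			/* Chain each orphan onto a singly linked list
+			 * through ip_next_orphan; the list is walked
+			 * below (and each inode iput) once the orphan
+			 * dir lock has been dropped. */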
+			iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
+			if (IS_ERR(iter))
+				continue;
+
+			mlog(0, "queue orphan %"MLFu64"\n",
+			     OCFS2_I(iter)->ip_blkno);
+			OCFS2_I(iter)->ip_next_orphan = inode;
+			inode = iter;
+		}
+		brelse(bh);
+	}
+	up(&orphan_dir_inode->i_sem);
+
+	ocfs2_meta_unlock(orphan_dir_inode, 0);
+	have_disk_lock = 0;
+
+	iput(orphan_dir_inode);
+	orphan_dir_inode = NULL;
+
+	while (inode) {
+		oi = OCFS2_I(inode);
+		mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
+
+		iter = oi->ip_next_orphan;
+
+		spin_lock(&oi->ip_lock);
+		/* Delete voting may have set these on the assumption
+		 * that the other node would wipe them successfully.
+		 * If they are still in the node's orphan dir, we need
+		 * to reset that state. */
+		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
+
+		/* Set the proper information to get us going into
+		 * ocfs2_delete_inode. */
+		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+		oi->ip_orphaned_slot = slot;
+		spin_unlock(&oi->ip_lock);
+
+		iput(inode);
+
+		inode = iter;
+	}
+
+out:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(orphan_dir_inode, 0);
+
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	return status;
+}
+
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	/* This check is good because ocfs2 will wait on our recovery
+	 * thread before changing it to something other than MOUNTED
+	 * or DISABLED. */
+	wait_event(osb->osb_mount_event,
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
+
+	/* If there's an error on mount, then we may never get to the
+	 * MOUNTED flag, but this is set right before
+	 * dismount_volume() so we can trust it. */
+	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+		mlog(0, "mount error, exiting!\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int ocfs2_commit_thread(void *arg)
+{
+	int status;
+	struct ocfs2_super *osb = arg;
+	struct ocfs2_journal *journal = osb->journal;
+
+	/* we can trust j_num_trans here because _should_stop() is only set in
+	 * shutdown and nobody other than ourselves should be able to start
+	 * transactions.  committing on shutdown might take a few iterations
+	 * as final transactions put deleted inodes on the list */
+	while (!(kthread_should_stop() &&
+		 atomic_read(&journal->j_num_trans) == 0)) {
+
+		wait_event_interruptible_timeout(osb->checkpoint_event,
+						 atomic_read(&journal->j_num_trans)
+						 || kthread_should_stop(),
+						 OCFS2_CHECKPOINT_INTERVAL);
+
+		status = ocfs2_commit_cache(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)) {
+			mlog(ML_KTHREAD,
+			     "commit_thread: %u transactions pending on "
+			     "shutdown\n",
+			     atomic_read(&journal->j_num_trans));
+		}
+	}
+
+	return 0;
+}
+
+/* Look for a dirty journal without taking any cluster locks. Used for
+ * hard readonly access to determine whether the file system journals
+ * require recovery. */
+int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	unsigned int slot;
+	struct buffer_head *di_bh;
+	struct ocfs2_dinode *di;
+	struct inode *journal = NULL;
+
+	for (slot = 0; slot < osb->max_slots; slot++) {
+		journal = ocfs2_get_system_file_inode(osb,
+						      JOURNAL_SYSTEM_INODE,
+						      slot);
+		if (!journal || is_bad_inode(journal)) {
+			ret = -EACCES;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di_bh = NULL;
+		ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
+				       0, journal);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di = (struct ocfs2_dinode *) di_bh->b_data;
+
+		if (le32_to_cpu(di->id1.journal1.ij_flags) &
+		    OCFS2_JOURNAL_DIRTY_FL)
+			ret = -EROFS;
+
+		brelse(di_bh);
+		if (ret)
+			break;
+	}
+
+out:
+	if (journal)
+		iput(journal);
+
+	return ret;
+}

+ 457 - 0
fs/ocfs2/journal.h

@@ -0,0 +1,457 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.h
+ *
+ * Defines journalling api and structures.
+ *
+ * Copyright (C) 2003, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_JOURNAL_H
+#define OCFS2_JOURNAL_H
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+
+#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
+
+enum ocfs2_journal_state {
+	OCFS2_JOURNAL_FREE = 0,
+	OCFS2_JOURNAL_LOADED,
+	OCFS2_JOURNAL_IN_SHUTDOWN,
+};
+
+struct ocfs2_super;
+struct ocfs2_dinode;
+struct ocfs2_journal_handle;
+
+struct ocfs2_journal {
+	enum ocfs2_journal_state   j_state;    /* Journal's current state  */
+
+	journal_t                 *j_journal; /* The kernel's journal type */
+	struct inode              *j_inode;   /* Kernel inode pointing to
+					       * this journal             */
+	struct ocfs2_super        *j_osb;     /* pointer to the super
+					       * block for the node
+					       * we're currently
+					       * running on -- not
+					       * necessarily the super
+					       * block from the node
+					       * which we usually run
+					       * from (recovery,
+					       * etc)                     */
+	struct buffer_head        *j_bh;      /* Journal disk inode block */
+	atomic_t                  j_num_trans; /* Number of transactions
+					        * currently in the system. */
+	unsigned long             j_trans_id;
+	struct rw_semaphore       j_trans_barrier;
+	wait_queue_head_t         j_checkpointed;
+
+	spinlock_t                j_lock;
+	struct list_head          j_la_cleanups;
+	struct work_struct        j_recovery_work;
+};
+
+extern spinlock_t trans_inc_lock;
+
+/* wrap j_trans_id so we never have it equal to zero. */
+static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
+{
+	unsigned long old_id;
+	spin_lock(&trans_inc_lock);
+	old_id = j->j_trans_id++;
+	if (unlikely(!j->j_trans_id))
+		j->j_trans_id = 1;
+	spin_unlock(&trans_inc_lock);
+	return old_id;
+}
+
+static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
+					      struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * inode. Returns true if all the inode's changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+{
+	int ret;
+	struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+
+	spin_lock(&trans_inc_lock);
+	ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* Convenience function to check if an inode is still new (has never
+ * hit disk). Will do you a favor and set created_trans = 0 when
+ * you've been checkpointed. Returns '1' if the inode is still new. */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
+	int ret;
+
+	/* System files are never "new" as they're written out by
+	 * mkfs. This helps us early during mount, before we have the
+	 * journal open and j_trans_id could be junk. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		return 0;
+	spin_lock(&trans_inc_lock);
+	ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
+			   OCFS2_I(inode)->ip_created_trans));
+	if (!ret)
+		OCFS2_I(inode)->ip_created_trans = 0;
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
+				       struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+extern kmem_cache_t *ocfs2_lock_cache;
+
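+/* Tracks one cluster lock to drop at handle commit time: the commit
+ * path meta-unlocks jl_inode, iputs it and frees the journal lock
+ * back to ocfs2_lock_cache. */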
+struct ocfs2_journal_lock {
+	struct inode     *jl_inode;
+	struct list_head  jl_lock_list;
+};
+
+struct ocfs2_journal_handle {
+	handle_t            *k_handle; /* kernel handle.                */
+	struct ocfs2_journal        *journal;
+	u32                 flags;     /* see flags below.              */
+	int                 max_buffs; /* Buffs reserved by this handle */
+
+	/* The following two fields are for ocfs2_handle_add_lock */
+	int                 num_locks;
+	struct list_head    locks;     /* A list of cluster locks
+					* to release on commit. */
+
+	struct list_head     inode_list;
+};
+
+#define OCFS2_HANDLE_STARTED			1
+/* should we sync-commit this handle? */
+#define OCFS2_HANDLE_SYNC			2
+static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
+{
+	return handle->flags & OCFS2_HANDLE_STARTED;
+}
+
+static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
+{
+	if (sync)
+		handle->flags |= OCFS2_HANDLE_SYNC;
+	else
+		handle->flags &= ~OCFS2_HANDLE_SYNC;
+}
+
+/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_complete_recovery(void *data);
+
+/*
+ *  Journal Control:
+ *  Initialize, Load, Shutdown, Wipe a journal.
+ *
+ *  ocfs2_journal_init     - Initialize journal structures in the OSB.
+ *  ocfs2_journal_load     - Load the given journal off disk. Replay it if
+ *                          there's transactions still in there.
+ *  ocfs2_journal_shutdown - Shutdown a journal, this will flush all
+ *                          uncommitted, uncheckpointed transactions.
+ *  ocfs2_journal_wipe     - Wipe transactions from a journal. Optionally
+ *                          zero out each block.
+ *  ocfs2_recovery_thread  - Perform recovery on a node. osb is our own osb.
+ *  ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                          event on.
+ *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
+ */
+void   ocfs2_set_journal_params(struct ocfs2_super *osb);
+int    ocfs2_journal_init(struct ocfs2_journal *journal,
+			  int *dirty);
+void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
+int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
+			  int full);
+int    ocfs2_journal_load(struct ocfs2_journal *journal);
+int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
+void   ocfs2_recovery_thread(struct ocfs2_super *osb,
+			     int node_num);
+int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
+void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+
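+/*
+ * A minimal mount-time sketch of the control calls above (illustrative
+ * only; error handling is elided and osb->journal is assumed to be
+ * allocated already):
+ *
+ *	int dirty;
+ *
+ *	status = ocfs2_journal_init(osb->journal, &dirty);
+ *	status = ocfs2_journal_load(osb->journal);
+ *	<use the journal>
+ *	ocfs2_journal_shutdown(osb);
+ */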
+static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
+{
+	atomic_set(&osb->needs_checkpoint, 1);
+	wake_up(&osb->checkpoint_event);
+}
+
+static inline void ocfs2_checkpoint_inode(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!ocfs2_inode_fully_checkpointed(inode)) {
+		/* WARNING: This only kicks off a single
+		 * checkpoint. If someone races you and adds more
+		 * metadata to the journal, you won't know, and will
+		 * wind up waiting a lot longer than necessary. Right
+		 * now we only use this in clear_inode so that's
+		 * OK. */
+		ocfs2_start_checkpoint(osb);
+
+		wait_event(osb->journal->j_checkpointed,
+			   ocfs2_inode_fully_checkpointed(inode));
+	}
+}
+
+/*
+ *  Transaction Handling:
+ *  Manage the lifetime of a transaction handle.
+ *
+ *  ocfs2_alloc_handle     - Only allocate a handle so we can start putting
+ *                          cluster locks on it. To actually change blocks,
+ *                          call ocfs2_start_trans with the handle returned
+ *                          from this function. You may call ocfs2_commit_trans
+ *                           at any time in the lifetime of a handle.
+ *  ocfs2_start_trans      - Begin a transaction. Give it an upper estimate of
+ *                          the number of blocks that will be changed during
+ *                          this handle.
+ *  ocfs2_commit_trans     - Complete a handle.
+ *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
+ *                          commit the handle to disk in the process, but will
+ *                          not release any locks taken during the transaction.
+ *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *                          buffer. Will have to call ocfs2_journal_dirty once
+ *                          we've actually dirtied it. Type is one of
+ *                          OCFS2_JOURNAL_ACCESS_CREATE or
+ *                          OCFS2_JOURNAL_ACCESS_WRITE.
+ *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                             the current handle commits.
+ *  ocfs2_handle_add_lock  - Sometimes we need to delay lock release
+ *                          until after a transaction has been completed. Use
+ *                          ocfs2_handle_add_lock to indicate that a lock needs
+ *                          to be released at the end of that handle. Locks
+ *                          will be released in the order that they are added.
+ *  ocfs2_handle_add_inode - Add a locked inode to a transaction.
+ */
+
+/* You must always start_trans with a number of buffs > 0, but it's
+ * perfectly legal to go through an entire transaction without having
+ * dirtied any buffers. */
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs);
+void			     ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
+int			     ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+						int nblocks);
+
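+/*
+ * A minimal handle lifecycle sketch (illustrative only; error handling
+ * is elided and 'osb', 'inode' and 'bh' are assumed to be set up by
+ * the caller):
+ *
+ *	handle = ocfs2_alloc_handle(osb);
+ *	ocfs2_handle_add_inode(handle, inode);
+ *	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
+ *	status = ocfs2_journal_access(handle, inode, bh,
+ *				      OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ *	status = ocfs2_journal_dirty(handle, bh);
+ *	ocfs2_commit_trans(handle);
+ */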
+/*
+ * Create access is for when we get a newly created buffer and we're
+ * not gonna read it off disk, but rather fill it ourselves.  Right
+ * now, we don't do anything special with this (it turns into a write
+ * request), but this is a good placeholder in case we do...
+ *
+ * Write access is for when we read a block off disk and are going to
+ * modify it. This way the journalling layer knows it may need to make
+ * a copy of that block (if it's part of another, uncommitted
+ * transaction) before we do so.
+ */
+#define OCFS2_JOURNAL_ACCESS_CREATE 0
+#define OCFS2_JOURNAL_ACCESS_WRITE  1
+#define OCFS2_JOURNAL_ACCESS_UNDO   2
+
+int                  ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  int type);
+/*
+ * A word about the journal_access/journal_dirty "dance". It is
+ * entirely legal to journal_access a buffer more than once (as long
+ * as the access type is the same -- I'm not sure what will happen if
+ * access type is different but this should never happen anyway) It is
+ * also legal to journal_dirty a buffer more than once. In fact, you
+ * can even journal_access a buffer after you've done a
+ * journal_access/journal_dirty pair. The only thing you cannot do
+ * however, is journal_dirty a buffer which you haven't yet passed to
+ * journal_access at least once.
+ *
+ * That said, 99% of the time this doesn't matter and this is what the
+ * path looks like:
+ *
+ *	<read a bh>
+ *	ocfs2_journal_access(handle, bh,	OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ * 	ocfs2_journal_dirty(handle, bh);
+ */
+int                  ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+					 struct buffer_head *bh);
+int                  ocfs2_journal_dirty_data(handle_t *handle,
+					      struct buffer_head *bh);
+int                  ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+					   struct inode *inode);
+/*
+ * Use this to protect from other processes reading buffer state while
+ * it's in flight.
+ */
+void                 ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+					    struct inode *inode);
+
+/*
+ *  Credit Macros:
+ *  Convenience macros to calculate number of credits needed.
+ *
+ *  For convenience sake, I have a set of macros here which calculate
+ *  the *maximum* number of sectors which will be changed for various
+ *  metadata updates.
+ */
+
+/* simple file updates like chmod, etc. */
+#define OCFS2_INODE_UPDATE_CREDITS 1
+
+/* get one bit out of a suballocator: dinode + group descriptor +
+ * prev. group desc. if we relink. */
+#define OCFS2_SUBALLOC_ALLOC (3)
+
+/* dinode + group descriptor update. We don't relink on free yet. */
+#define OCFS2_SUBALLOC_FREE  (2)
+
+#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
+#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
+					 + OCFS2_TRUNCATE_LOG_UPDATE)
+
+/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
+ * bitmap block for the new bit) */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+
+/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
+ * group descriptor + mkdir/symlink blocks */
+#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
+			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
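+/* with the definitions above this works out to 3 + 3 + (1 + 2) = 9 */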
+
+/* local alloc metadata change + main bitmap updates */
+#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
+				  + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
+
+/* used when we don't need an allocation change for a dir extend. One
+ * for the dinode, one for the new block. */
+#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
+
+/* file update (nlink, etc) + dir entry block */
+#define OCFS2_LINK_CREDITS  (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
+ * dir inode link */
+#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
+			      + OCFS2_LINK_CREDITS)
+
+/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+ * inode alloc group descriptor */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+
+/* dinode update, old dir dinode update, new dir dinode update, old
+ * dir dir entry, new dir dir entry, dir entry update for renaming
+ * directory + target unlink */
+#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
+			     + OCFS2_UNLINK_CREDITS)
+
+static inline int ocfs2_calc_extend_credits(struct super_block *sb,
+					    struct ocfs2_dinode *fe,
+					    u32 bits_wanted)
+{
+	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+
+	/* bitmap dinode, group desc. + relinked group. */
+	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
+
+	/* we might need to shift tree depth so let's assume an
+	 * absolute worst case of complete fragmentation.  Even with
+	 * that, we only need one update for the dinode, and then
+	 * however many metadata chunks needed * a remaining suballoc
+	 * alloc. */
+	sysfile_bitmap_blocks = 1 +
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+
+	/* this does not include *new* metadata blocks, which are
+	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * prev. last_eb_blk + blocks along edge of tree.
+	 * calc_symlink_credits passes because we just need 1
+	 * credit for the dinode there. */
+	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+
+	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+}
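+
+/* As a worked example (with hypothetical values): for a dinode with
+ * l_tree_depth == 1 and ocfs2_extend_meta_needed() returning 2, the
+ * above yields 3 + (1 + 2 * 2) + (1 + 1 + 1) = 11 credits. */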
+
+static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
+{
+	int blocks = OCFS2_MKNOD_CREDITS;
+
+	/* links can be longer than one block so we may update many
+	 * within our single allocated extent. */
+	blocks += ocfs2_clusters_to_blocks(sb, 1);
+
+	return blocks;
+}
+
+static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
+						 unsigned int cpg)
+{
+	int blocks;
+	int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
+	/* parent inode update + new block group header + bitmap inode update
+	   + bitmap blocks affected */
+	blocks = 1 + 1 + 1 + bitmap_blocks;
+	return blocks;
+}
+
+static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
+						unsigned int clusters_to_del,
+						struct ocfs2_dinode *fe,
+						struct ocfs2_extent_list *last_el)
+{
+	/* for dinode + all headers in this pass + update to next leaf */
+	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
+	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	int credits = 1 + tree_depth + 1;
+	int i;
+
+	i = next_free - 1;
+	BUG_ON(i < 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (tree_depth && next_free == 1 &&
+	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+		credits += 1 + tree_depth;
+
+	/* update to the truncate log. */
+	credits += OCFS2_TRUNCATE_LOG_UPDATE;
+
+	return credits;
+}
+
+#endif /* OCFS2_JOURNAL_H */

+ 983 - 0
fs/ocfs2/localalloc.c

@@ -0,0 +1,983 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
+
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits);
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
+
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh);
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh);
+
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac);
+
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode);
+
+/*
+ * Determine how large our local alloc window should be, in bits.
+ *
+ * These values (and the behavior in ocfs2_alloc_should_use_local) have
+ * been chosen so that most allocations, including new block groups go
+ * through local alloc.
+ */
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+{
+	BUG_ON(osb->s_clustersize_bits < 12);
+
+	return 2048 >> (osb->s_clustersize_bits - 12);
+}
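+
+/* E.g. with 4K clusters (s_clustersize_bits == 12) the window is 2048
+ * bits and with 8K clusters it is 1024 bits -- either way it covers a
+ * constant 8 megabytes of disk space. */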
+
+/*
+ * Tell us whether a given allocation should use the local alloc
+ * file. Otherwise, it has to go to the main bitmap.
+ */
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
+{
+	int la_bits = ocfs2_local_alloc_window_bits(osb);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+		return 0;
+
+	/* la_bits should be at least twice the size (in clusters) of
+	 * a new block group. We want to be sure block group
+	 * allocations go through the local alloc, so allow an
+	 * allocation to take up to half the bitmap. */
+	if (bits > (la_bits / 2))
+		return 0;
+
+	return 1;
+}
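+
+/* E.g. with 4K clusters la_bits is 2048, so requests of up to 1024
+ * clusters (4 megabytes) are served from the local alloc window. */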
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb)
+{
+	int status = 0;
+	struct ocfs2_dinode *alloc = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	u32 num_used;
+	struct inode *inode = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	/* read the alloc off disk */
+	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (!(le32_to_cpu(alloc->i_flags) &
+	    (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
+		mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if ((la->la_size == 0) ||
+	    (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
+		mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
+		     le16_to_cpu(la->la_size));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* do a little verification. */
+	num_used = ocfs2_local_alloc_count_bits(alloc);
+
+	/* hopefully the local alloc has always been recovered before
+	 * we load it. */
+	if (num_used
+	    || alloc->id1.bitmap1.i_used
+	    || alloc->id1.bitmap1.i_total
+	    || la->la_bm_off)
+		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+		     "found = %u, set = %u, taken = %u, off = %u\n",
+		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
+		     le32_to_cpu(alloc->id1.bitmap1.i_total),
+		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+
+	osb->local_alloc_bh = alloc_bh;
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+
+bail:
+	if (status < 0 && alloc_bh)
+		brelse(alloc_bh);
+	if (inode)
+		iput(inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Return any unused bits to the bitmap and write out a clean
+ * local_alloc.
+ *
+ * This uses the local alloc buffer cached off the osb
+ * (osb->local_alloc_bh); be warned that it *will* be brelse'd and
+ * NULL'd out.
+ */
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *local_alloc_inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_dinode *alloc = NULL;
+
+	mlog_entry_void();
+
+	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
+		goto bail;
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->local_alloc_state = OCFS2_LA_DISABLED;
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* WINDOW_MOVE_CREDITS is a bit heavy... */
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		handle = NULL;
+		goto bail;
+	}
+
+	bh = osb->local_alloc_bh;
+	alloc = (struct ocfs2_dinode *) bh->b_data;
+
+	alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	brelse(bh);
+	osb->local_alloc_bh = NULL;
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	kfree(alloc_copy);
+
+	mlog_exit_void();
+}
+
+/*
+ * We want to free the bitmap bits outside of any recovery context as
+ * we'll need a cluster lock to do so, but we must clear the local
+ * alloc before giving up the recovered node's journal. To solve this,
+ * we kmalloc a copy of the local alloc before it's changed, for the
+ * caller to process with ocfs2_complete_local_alloc_recovery.
+ */
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int slot_num,
+				     struct ocfs2_dinode **alloc_copy)
+{
+	int status = 0;
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *alloc;
+
+	mlog_entry("(slot_num = %d)\n", slot_num);
+
+	*alloc_copy = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode->i_sem);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
+	if (!(*alloc_copy)) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_write_block(osb, alloc_bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if ((status < 0) && (*alloc_copy)) {
+		kfree(*alloc_copy);
+		*alloc_copy = NULL;
+	}
+
+	if (alloc_bh)
+		brelse(alloc_bh);
+
+	if (inode) {
+		up(&inode->i_sem);
+		iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Step 2: By now, we've completed the journal recovery, we've stamped
+ * a clean local alloc on disk and dropped the node out of the
+ * recovery map. Dlm locks will no longer stall, so lets clear out the
+ * main bitmap.
+ */
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we want the bitmap change to be recorded on disk asap */
+	ocfs2_handle_set_sync(handle, 1);
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Make sure we've got at least bits_wanted contiguous bits in the
+ * local alloc. You lose them when you drop i_sem.
+ *
+ * We will add ourselves to the transaction passed in, but may start
+ * our own in order to shift windows.
+ */
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac)
+{
+	int status;
+	struct ocfs2_dinode *alloc;
+	struct inode *local_alloc_inode;
+	unsigned int free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(!passed_handle);
+	BUG_ON(!ac);
+	BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
+		mlog(0, "Asking for more than my max window size!\n");
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
+	    ocfs2_local_alloc_count_bits(alloc)) {
+		ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
+			    "%u used bits, but a count shows %u",
+			    le64_to_cpu(alloc->i_blkno),
+			    le32_to_cpu(alloc->id1.bitmap1.i_used),
+			    ocfs2_local_alloc_count_bits(alloc));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(alloc->id1.bitmap1.i_used);
+	if (bits_wanted > free_bits) {
+		/* uhoh, window change time. */
+		status =
+			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	ac->ac_inode = igrab(local_alloc_inode);
+	get_bh(osb->local_alloc_bh);
+	ac->ac_bh = osb->local_alloc_bh;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
+	status = 0;
+bail:
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits)
+{
+	int status, start;
+	struct inode *local_alloc_inode;
+	u32 bits_wanted;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		/* TODO: Shouldn't we just BUG here? */
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bitmap = la->la_bitmap;
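+	/* Translate the window-relative offset into a cluster number in
+	 * the global bitmap -- the window begins at cluster la_bm_off. */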
+	*bit_off = le32_to_cpu(la->la_bm_off) + start;
+	/* local alloc is always contiguous by nature -- we never
+	 * delete bits from it! */
+	*num_bits = bits_wanted;
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (bits_wanted--)
+		ocfs2_set_bit(start++, bitmap);
+
+	alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
+				le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
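+/*
+ * Count the bits set in the local alloc bitmap by summing the
+ * population count of each byte; used to cross-check i_used.
+ */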
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
+{
+	int i;
+	u8 *buffer;
+	u32 count = 0;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry_void();
+
+	buffer = la->la_bitmap;
+	for (i = 0; i < le16_to_cpu(la->la_size); i++)
+		count += hweight8(buffer[i]);
+
+	mlog_exit(count);
+	return count;
+}
+
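+/*
+ * Linear scan of the window bitmap for a run of 'numbits' contiguous
+ * zero bits. Returns the bit offset at which the run starts, or -1 if
+ * no large enough run exists.
+ */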
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits)
+{
+	int numfound, bitoff, left, startoff, lastzero;
+	void *bitmap = NULL;
+
+	mlog_entry("(numbits wanted = %u)\n", numbits);
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "No bits in my window!\n");
+		bitoff = -1;
+		goto bail;
+	}
+
+	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
+
+	numfound = bitoff = startoff = 0;
+	lastzero = -1;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff))
+	       != -1) {
+		if (bitoff == left) {
+			/* mlog(0, "bitoff (%d) == left", bitoff); */
+			break;
+		}
+		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
+		   "numfound = %d\n", bitoff, startoff, numfound);*/
+
+		/* Ok, we found a zero bit... is it contiguous with the
+		 * current run, or do we start over? */
+		if (bitoff == startoff) {
+			/* we found a zero */
+			numfound++;
+			startoff++;
+		} else {
+			/* got a zero after some ones */
+			numfound = 1;
+			startoff = bitoff+1;
+		}
+		/* we got everything we needed */
+		if (numfound == numbits) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
+	     numfound);
+
+	if (numfound == numbits)
+		bitoff = startoff - numfound;
+	else
+		bitoff = -1;
+
+bail:
+	mlog_exit(bitoff);
+	return bitoff;
+}
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
+{
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+	int i;
+	mlog_entry_void();
+
+	alloc->id1.bitmap1.i_total = 0;
+	alloc->id1.bitmap1.i_used = 0;
+	la->la_bm_off = 0;
+	for (i = 0; i < le16_to_cpu(la->la_size); i++)
+		la->la_bitmap[i] = 0;
+
+	mlog_exit_void();
+}
+
+#if 0
+/* turn this on and uncomment below to aid debugging window shifts. */
+static void ocfs2_verify_zero_bits(unsigned long *bitmap,
+				   unsigned int start,
+				   unsigned int count)
+{
+	unsigned int tmp = count;
+	while (tmp--) {
+		if (ocfs2_test_bit(start + tmp, bitmap)) {
+			printk("ocfs2_verify_zero_bits: start = %u, count = "
+			       "%u\n", start, count);
+			printk("ocfs2_verify_zero_bits: bit %u is set!",
+			       start + tmp);
+			BUG();
+		}
+	}
+}
+#endif
+
+/*
+ * sync the local alloc to main bitmap.
+ *
+ * assumes you've already locked the main bitmap -- the bitmap inode
+ * passed is used for caching.
+ */
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh)
+{
+	int status = 0;
+	int bit_off, left, count, start;
+	u64 la_start_blk;
+	u64 blkno;
+	void *bitmap;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry("total = %u, COUNT = %u, used = %u\n",
+		   le32_to_cpu(alloc->id1.bitmap1.i_total),
+		   ocfs2_local_alloc_count_bits(alloc),
+		   le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "nothing to sync!\n");
+		goto bail;
+	}
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
+	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
+		mlog(0, "all bits were taken!\n");
+		goto bail;
+	}
+
+	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(la->la_bm_off));
+	bitmap = la->la_bitmap;
+	start = count = bit_off = 0;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+
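+	/* Zero bits in the window are clusters which were reserved from
+	 * the global bitmap but never handed out, so each contiguous
+	 * run of zero bits gets freed back to the main bitmap. */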
+	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
+	       != -1) {
+		if ((bit_off < left) && (bit_off == start)) {
+			count++;
+			start++;
+			continue;
+		}
+		if (count) {
+			blkno = la_start_blk +
+				ocfs2_clusters_to_blocks(osb->sb,
+							 start - count);
+
+			mlog(0, "freeing %u bits starting at local "
+			     "alloc bit %u (la_start_blk = %"MLFu64", "
+			     "blkno = %"MLFu64")\n", count, start - count,
+			     la_start_blk, blkno);
+
+			status = ocfs2_free_clusters(handle, main_bm_inode,
+						     main_bm_bh, blkno, count);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		if (bit_off >= left)
+			break;
+		count = 1;
+		start = bit_off + 1;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh)
+{
+	int status;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+
+	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	*bitmap_inode = (*ac)->ac_inode;
+	igrab(*bitmap_inode);
+	*bitmap_bh = (*ac)->ac_bh;
+	get_bh(*bitmap_bh);
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Claim clusters for a new window. The main bitmap must already be
+ * locked -- ocfs2_local_alloc_reserve_for_window() takes that lock
+ * for us.
+ */
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	u32 cluster_off, cluster_count;
+	struct ocfs2_dinode *alloc = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (alloc->id1.bitmap1.i_total)
+		mlog(0, "asking me to alloc a new window over a non-empty "
+		     "one\n");
+
+	mlog(0, "Allocating %u clusters for a new window.\n",
+	     ocfs2_local_alloc_window_bits(osb));
+	/* we used the generic suballoc reserve function, but we set
+	 * everything up nicely, so there's no reason why we can't use
+	 * the more specific cluster api to claim bits. */
+	status = ocfs2_claim_clusters(osb, handle, ac,
+				      ocfs2_local_alloc_window_bits(osb),
+				      &cluster_off, &cluster_count);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	la->la_bm_off = cpu_to_le32(cluster_off);
+	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
+	/* just in case... In the future when we find space ourselves,
+	 * we don't have to get all contiguous -- but we'll have to
+	 * set all previously used bits in bitmap and update
+	 * la_bits_set before setting the bits in the main bitmap. */
+	alloc->id1.bitmap1.i_used = 0;
+	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
+	       le16_to_cpu(la->la_size));
+
+	mlog(0, "New window allocated:\n");
+	mlog(0, "window la_bm_off = %u\n",
+	     le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off));
+	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Note that we do *NOT* lock the local alloc inode here as
+ * it's been locked already for us. */
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode)
+{
+	int status = 0;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_alloc_context *ac = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* This will lock the main bitmap for us. */
+	status = ocfs2_local_alloc_reserve_for_window(osb,
+						      handle,
+						      &ac,
+						      &main_bm_inode,
+						      &main_bm_bh);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	/* We want to clear the local alloc before doing anything
+	 * else, so that if we error later during this operation,
+	 * local alloc shutdown won't try to double free main bitmap
+	 * bits. Make a copy so the sync function knows which bits to
+	 * free. */
+	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_local_alloc_new_window(osb, handle, ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_inc(&osb->alloc_stats.moves);
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	kfree(alloc_copy);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	mlog_exit(status);
+	return status;
+}
+

+ 56 - 0
fs/ocfs2/localalloc.h

@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_LOCALALLOC_H
+#define OCFS2_LOCALALLOC_H
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb);
+
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int node_num,
+				     struct ocfs2_dinode **alloc_copy);
+
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc);
+
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
+				 u64 bits);
+
+struct ocfs2_alloc_context;
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac);
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits);
+
+#endif /* OCFS2_LOCALALLOC_H */

+ 102 - 0
fs/ocfs2/mmap.c

@@ -0,0 +1,102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mmap.c
+ *
+ * Code to deal with the mess that is clustered mmap.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/signal.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int *type)
+{
+	struct inode *inode = area->vm_file->f_dentry->d_inode;
+	struct page *page = NOPAGE_SIGBUS;
+	sigset_t blocked, oldset;
+	int ret;
+
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+
+	/* The best way to deal with signals in this path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(&blocked);
+
+	/* sigprocmask() should technically never fail here, but
+	 * check the return value anyway. */
+	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	page = filemap_nopage(area, address, type);
+
+	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+out:
+	mlog_exit_ptr(page);
+	return page;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage = ocfs2_nopage,
+};
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	/* We don't want to support shared writable mappings yet. */
+	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
+	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
+		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
+		/* This is -EINVAL because generic_file_readonly_mmap
+		 * returns it in a similar situation. */
+		return -EINVAL;
+	}
+
+	update_atime(inode);
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+

+ 6 - 0
fs/ocfs2/mmap.h

@@ -0,0 +1,6 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif  /* OCFS2_MMAP_H */

+ 2264 - 0
fs/ocfs2/namei.c

@@ -0,0 +1,2264 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise Pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
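+/* Directory search readahead (as in ext3): read ahead NAMEI_RA_BLOCKS
+ * blocks at a time, keeping up to NAMEI_RA_CHUNKS chunks in flight. */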
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir);
+
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh);
+
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh);
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+				  struct dentry *dentry,
+				  struct inode *inode, u64 blkno,
+				  struct buffer_head *parent_fe_bh,
+				  struct buffer_head *insert_bh)
+{
+	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+				 dentry->d_name.name, dentry->d_name.len,
+				 inode, blkno, parent_fe_bh, insert_bh);
+}
+
+/* An orphan dir name is an 8 byte value, printed as a hex string */
+#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+
+static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	int status;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct inode *inode = NULL;
+	struct dentry *ret;
+	struct ocfs2_dir_entry *dirent;
+	struct ocfs2_inode_info *oi;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
+		ret = ERR_PTR(-ENAMETOOLONG);
+		goto bail;
+	}
+
+	mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
+	     dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0)
+		goto bail_add;
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		ret = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	oi = OCFS2_I(inode);
+	/* Clear any orphaned state... If we were able to look up the
+	 * inode from a directory, it certainly can't be orphaned. We
+	 * might have the bad state from a node which intended to
+	 * orphan this inode but crashed before it could commit the
+	 * unlink. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&oi->ip_lock);
+
+bail_add:
+
+	dentry->d_op = &ocfs2_dentry_ops;
+	ret = d_splice_alias(inode, dentry);
+
+bail_unlock:
+	/* Don't drop the cluster lock until *after* the d_add --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the vote thread and have a stale dentry. */
+	ocfs2_meta_unlock(dir, 0);
+
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit_ptr(ret);
+
+	return ret;
+}
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+	status = ocfs2_journal_access(handle, inode, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
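+	/* Hand-roll the "." and ".." entries: "." takes a minimal
+	 * rec_len while ".." consumes the remainder of the block. */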
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode,
+		       dev_t dev)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dirfe;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	/* get our super block */
+	osb = OCFS2_SB(dir->i_sb);
+
+	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+		mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
+		     OCFS2_I(dir)->ip_blkno, dir->i_nlink);
+		status = -EMLINK;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* are we making a directory? If so, reserve a cluster for its
+	 * 1st extent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(mode)) {
+		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
+					    new_fe_bh, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+
+		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		le16_add_cpu(&dirfe->i_links_count, 1);
+		status = ocfs2_journal_dirty(handle, parent_fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		dir->i_nlink++;
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+leave:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+
+	if (de_bh)
+		brelse(de_bh);
+
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	if ((status < 0) && inode)
+		iput(inode);
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_extent_list *fel;
+	u64 fe_blkno = 0;
+	u16 suballoc_bit;
+	struct inode *inode = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	*new_fe_bh = NULL;
+	*ret_inode = NULL;
+
+	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+				       &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		/* new_inode() returns NULL on failure, not an ERR_PTR */
+		status = -ENOMEM;
+		mlog(ML_ERROR, "new_inode failed!\n");
+		goto leave;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
+	OCFS2_I(inode)->ip_blkno = fe_blkno;
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_mode = mode;
+	spin_lock(&osb->osb_lock);
+	inode->i_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
+	if (!*new_fe_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+
+	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
+	memset(fe, 0, osb->sb->s_blocksize);
+
+	fe->i_generation = cpu_to_le32(inode->i_generation);
+	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
+	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
+	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+	fe->i_uid = cpu_to_le32(current->fsuid);
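+	/* Inherit the gid (and, for directories, the setgid bit) from
+	 * a setgid parent, per the usual Unix semantics. */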
+	if (dir->i_mode & S_ISGID) {
+		fe->i_gid = cpu_to_le32(dir->i_gid);
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		fe->i_gid = cpu_to_le32(current->fsgid);
+	fe->i_mode = cpu_to_le16(mode);
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
+
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	fe->i_last_eb_blk = 0;
+	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
+	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_atime = fe->i_ctime = fe->i_mtime =
+		cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
+		cpu_to_le32(CURRENT_TIME.tv_nsec);
+	fe->i_dtime = 0;
+
+	fel = &fe->id2.i_list;
+	fel->l_tree_depth = 0;
+	fel->l_next_free_rec = 0;
+	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+
+	status = ocfs2_journal_dirty(handle, *new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
+		     "i_blkno=%"MLFu64", i_ino=%lu\n",
+		     (unsigned long long) (*new_fe_bh)->b_blocknr,
+		     fe->i_blkno, inode->i_ino);
+		BUG();
+	}
+
+	ocfs2_inode_set_new(osb, inode);
+	status = ocfs2_create_new_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0; /* error in ocfs2_create_new_inode_locks is not
+		     * critical */
+
+	*ret_inode = inode;
+leave:
+	if (status < 0) {
+		if (*new_fe_bh) {
+			brelse(*new_fe_bh);
+			*new_fe_bh = NULL;
+		}
+		if (inode)
+			iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mkdir(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (S_ISDIR(inode->i_mode)) {
+		err = -EPERM;
+		goto bail;
+	}
+
+	if (inode->i_nlink >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		err = -ENOMEM;
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					dentry->d_name.len);
+	if (err)
+		goto bail;
+
+	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					   dentry->d_name.name,
+					   dentry->d_name.len, &de_bh);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_journal_access(handle, inode, fe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	inode->i_nlink++;
+	inode->i_ctime = CURRENT_TIME;
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	err = ocfs2_journal_dirty(handle, fe_bh);
+	if (err < 0) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_add_entry(handle, dentry, inode,
+			      OCFS2_I(inode)->ip_blkno,
+			      parent_fe_bh, de_bh);
+	if (err) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	atomic_inc(&inode->i_count);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (de_bh)
+		brelse(de_bh);
+	if (fe_bh)
+		brelse(fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	unsigned int saved_nlink = 0;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	u64 blkno;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_node_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+	struct buffer_head *dirent_bh = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	BUG_ON(dentry->d_parent->d_inode != dir);
+
+	mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	if (inode == osb->root_inode) {
+		mlog(0, "Cannot delete the root directory\n");
+		status = -EPERM;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (OCFS2_I(inode)->ip_blkno != blkno) {
+		status = -ENOENT;
+
+		mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
+		     "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
+		     OCFS2_I(inode)->ip_flags);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
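+	/* A directory may only be unlinked once it is empty: nothing
+	 * but "." and ".." remain, which also implies i_nlink == 2. */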
+	if (S_ISDIR(inode->i_mode)) {
+		if (!ocfs2_empty_dir(inode)) {
+			status = -ENOTEMPTY;
+			goto leave;
+		} else if (inode->i_nlink != 2) {
+			status = -ENOTEMPTY;
+			goto leave;
+		}
+	}
+
+	/* There are still a few steps left until we can consider the
+	 * unlink to have succeeded. Save off nlink here before
+	 * modification so we can set it back in case we hit an issue
+	 * before commit. */
+	saved_nlink = inode->i_nlink;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 0;
+	else
+		inode->i_nlink--;
+
+	status = ocfs2_request_unlink_vote(inode, dentry,
+					   (unsigned int) inode->i_nlink);
+	if (status < 0) {
+		/* This vote should succeed under all normal
+		 * circumstances. */
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!inode->i_nlink) {
+		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
+						  orphan_name,
+						  &orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (!inode->i_nlink) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+					  orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* delete the name from the parent dir */
+	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We can set nlink on the dinode now. clear the saved version
+	 * so that it doesn't get set later. */
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	saved_nlink = 0;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		dir->i_nlink--;
+		status = ocfs2_mark_inode_dirty(handle, dir,
+						parent_node_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			dir->i_nlink++;
+		}
+	}
+
+leave:
+	if (status < 0 && saved_nlink)
+		inode->i_nlink = saved_nlink;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (fe_bh)
+		brelse(fe_bh);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	if (parent_node_bh)
+		brelse(parent_node_bh);
+
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * The only place this should be used is rename!
+ * if they have the same id, then the 1st one is the only one locked.
+ */
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2)
+{
+	int status;
+	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
+	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
+	struct buffer_head **tmpbh;
+	struct inode *tmpinode;
+
+	mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
+		   oi1->ip_blkno, oi2->ip_blkno);
+
+	BUG_ON(!handle);
+
+	if (*bh1)
+		*bh1 = NULL;
+	if (*bh2)
+		*bh2 = NULL;
+
+	/* we always want to lock the one with the lower lockid first. */
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		if (oi1->ip_blkno < oi2->ip_blkno) {
+			/* switch id1 and id2 around */
+			mlog(0, "switching them around...\n");
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2 */
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* lock id1 */
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
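+/*
+ * In the first block of a directory, ".." immediately follows ".", so
+ * the parent inode number is found by skipping over the first entry's
+ * rec_len.
+ */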
+#define PARENT_INO(buffer) \
+	((struct ocfs2_dir_entry *) \
+	 ((char *)buffer + \
+	  le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
+							       // and new_dentry
+	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
+	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
+						    // this is the 1st dirent bh
+	nlink_t old_dir_nlink = old_dir->i_nlink;
+	nlink_t new_dir_nlink = new_dir->i_nlink;
+	unsigned int links_count;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+		   old_dir, old_dentry, new_dir, new_dentry,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	if (atomic_read(&old_dentry->d_count) > 2) {
+		shrink_dcache_parent(old_dentry);
+		if (atomic_read(&old_dentry->d_count) > 2) {
+			status = -EBUSY;
+			goto bail;
+		}
+	}
+
+	/* Assume a directory hierarchy like so:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* if old and new are the same, this'll just do one lock. */
+	status = ocfs2_double_lock(osb, handle,
+				  &old_dir_bh, old_dir,
+				  &new_dir_bh, new_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		/* Directories actually require metadata updates to
+		 * the directory info so we can't get away with not
+		 * doing node locking on it. */
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = -EIO;
+		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
+		if (!old_inode_de_bh)
+			goto bail;
+
+		status = -EIO;
+		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
+		    OCFS2_I(old_dir)->ip_blkno)
+			goto bail;
+		status = -EMLINK;
+		if (!new_inode && new_dir != old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+			goto bail;
+	} else {
+		/* Ah, the simple case - we're a file so just send a
+		 * message. */
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = -ENOENT;
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh)
+		goto bail;
+
+	/*
+	 *  This inode number check is _not_ here to catch IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+		goto bail;
+
+	/* check if the target already exists (in which case we need
+	 * to delete it) */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!new_de && new_inode)
+		mlog(ML_ERROR, "inode %lu does not exist in its parent "
+		     "directory!\n", new_inode->i_ino);
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (new_de) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
+			     "disagree. ip_flags = %x\n",
+			     OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
+			     OCFS2_I(new_inode)->ip_flags);
+			goto bail;
+		}
+
+		status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			links_count = 0;
+		else
+			links_count = (unsigned int) (new_inode->i_nlink - 1);
+
+		status = ocfs2_request_unlink_vote(new_inode, new_dentry,
+						   links_count);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
+
+		mlog(0, "aha rename over existing... new_de=%p "
+		     "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
+		     new_de, newfe_blkno, newfe_bh, newfe_bh ?
+		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+
+		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
+			status = ocfs2_prepare_orphan_dir(osb, handle,
+							  new_inode,
+							  orphan_name,
+							  &orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+	} else {
+		BUG_ON(new_dentry->d_parent->d_inode != new_dir);
+
+		status = ocfs2_check_dir_for_entry(new_dir,
+						   new_dentry->d_name.name,
+						   new_dentry->d_name.len);
+		if (status)
+			goto bail;
+
+		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
+						      new_dentry->d_name.name,
+						      new_dentry->d_name.len,
+						      &insert_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_de) {
+		if (S_ISDIR(new_inode->i_mode)) {
+			if (!ocfs2_empty_dir(new_inode) ||
+			    new_inode->i_nlink != 2) {
+				status = -ENOTEMPTY;
+				goto bail;
+			}
+		}
+		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode) ||
+		    (newfe->i_links_count == cpu_to_le16(1))){
+			status = ocfs2_orphan_add(osb, handle, new_inode,
+						  newfe, orphan_name,
+						  orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		/* change the dirent to point to the correct inode */
+		status = ocfs2_journal_access(handle, new_dir, new_de_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
+		new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		status = ocfs2_journal_dirty(handle, new_de_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			newfe->i_links_count = 0;
+		else
+			le16_add_cpu(&newfe->i_links_count, -1);
+
+		status = ocfs2_journal_dirty(handle, newfe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	} else {
+		/* if the name was not found in new_dir, add it now */
+		status = ocfs2_add_entry(handle, new_dentry, old_inode,
+					 OCFS2_I(old_inode)->ip_blkno,
+					 new_dir_bh, insert_entry_bh);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_inode);
+
+	/* now that the name has been added to new_dir, remove the old name */
+	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_inode) {
+		new_inode->i_nlink--;
+		new_inode->i_ctime = CURRENT_TIME;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+	if (old_inode_de_bh) {
+		status = ocfs2_journal_access(handle, old_inode,
+					     old_inode_de_bh,
+					     OCFS2_JOURNAL_ACCESS_WRITE);
+		PARENT_INO(old_inode_de_bh->b_data) =
+			cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
+		status = ocfs2_journal_dirty(handle, old_inode_de_bh);
+		old_dir->i_nlink--;
+		if (new_inode) {
+			new_inode->i_nlink--;
+		} else {
+			new_dir->i_nlink++;
+			mark_inode_dirty(new_dir);
+		}
+	}
+	mark_inode_dirty(old_dir);
+	if (new_inode)
+		mark_inode_dirty(new_inode);
+
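+	/* If the rename changed either parent's link count, push the
+	 * new value out to the on-disk dinode as well. */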
+	if (old_dir != new_dir)
+		if (new_dir_nlink != new_dir->i_nlink) {
+			if (!new_dir_bh) {
+				mlog(ML_ERROR, "need to change nlink for new "
+				     "dir %"MLFu64" from %d to %d but bh is "
+				     "NULL\n", OCFS2_I(new_dir)->ip_blkno,
+				     (int)new_dir_nlink, new_dir->i_nlink);
+			} else {
+				struct ocfs2_dinode *fe;
+				status = ocfs2_journal_access(handle,
+							      new_dir,
+							      new_dir_bh,
+							      OCFS2_JOURNAL_ACCESS_WRITE);
+				fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
+				fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
+				status = ocfs2_journal_dirty(handle, new_dir_bh);
+			}
+		}
+
+	if (old_dir_nlink != old_dir->i_nlink) {
+		if (!old_dir_bh) {
+			mlog(ML_ERROR, "need to change nlink for old dir "
+			     "%"MLFu64" from %d to %d but bh is NULL!\n",
+			     OCFS2_I(old_dir)->ip_blkno,
+			     (int)old_dir_nlink,
+			     old_dir->i_nlink);
+		} else {
+			struct ocfs2_dinode *fe;
+			status = ocfs2_journal_access(handle, old_dir,
+						      old_dir_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
+			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+			status = ocfs2_journal_dirty(handle, old_dir_bh);
+		}
+	}
+
+	status = 0;
+bail:
+	if (rename_lock)
+		ocfs2_rename_unlock(osb);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (new_inode)
+		sync_mapping_buffers(old_inode->i_mapping);
+
+	if (new_inode)
+		iput(new_inode);
+	if (newfe_bh)
+		brelse(newfe_bh);
+	if (old_dir_bh)
+		brelse(old_dir_bh);
+	if (new_dir_bh)
+		brelse(new_dir_bh);
+	if (new_de_bh)
+		brelse(new_de_bh);
+	if (old_de_bh)
+		brelse(old_de_bh);
+	if (old_inode_de_bh)
+		brelse(old_inode_de_bh);
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+	if (insert_entry_bh)
+		brelse(insert_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * we expect i_size = strlen(symname). Copy symname into the file
+ * data, including the null terminator.
+ */
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname)
+{
+	struct buffer_head **bhs = NULL;
+	const char *c;
+	struct super_block *sb = osb->sb;
+	u64 p_blkno;
+	int p_blocks;
+	int virtual, blocks, status, i, bytes_left;
+
+	bytes_left = i_size_read(inode) + 1;
+	/* we can't trust i_blocks because we're actually going to
+	 * write i_size + 1 bytes. */
+	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
+		       inode->i_blocks, i_size_read(inode), blocks);
+
+	/* Sanity check -- make sure we're going to fit. */
+	if (bytes_left >
+	    ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
+					     &p_blocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* links can never be larger than one cluster so we know this
+	 * is all going to be contiguous, but do a sanity check
+	 * anyway. */
+	if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
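+	/* Copy the target into the file one block at a time; the
+	 * allocation is contiguous, so p_blkno is simply incremented. */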
+	virtual = 0;
+	while (bytes_left > 0) {
+		c = &symname[virtual * sb->s_blocksize];
+
+		bhs[virtual] = sb_getblk(sb, p_blkno);
+		if (!bhs[virtual]) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+
+		status = ocfs2_journal_access(handle, inode, bhs[virtual],
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
+
+		memcpy(bhs[virtual]->b_data, c,
+		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
+		       bytes_left);
+
+		status = ocfs2_journal_dirty(handle, bhs[virtual]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		virtual++;
+		p_blkno++;
+		bytes_left -= sb->s_blocksize;
+	}
+
+	status = 0;
+bail:
+
+	if (bhs) {
+		for (i = 0; i < blocks; i++)
+			if (bhs[i])
+				brelse(bhs[i]);
+		kfree(bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	int status, l, credits;
+	u64 newsize;
+	struct ocfs2_super *osb = NULL;
+	struct inode *inode = NULL;
+	struct super_block *sb;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_dinode *dirfe;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
+		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
+
+	sb = dir->i_sb;
+	osb = OCFS2_SB(sb);
+
+	l = strlen(symname) + 1;
+
+	credits = ocfs2_calc_symlink_credits(sb);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* lock the parent directory */
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto bail;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto bail;
+
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_mknod_locked(osb, dir, dentry,
+				    S_IFLNK | S_IRWXUGO, 0,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
+	inode->i_rdev = 0;
+	newsize = l - 1;
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		inode->i_op = &ocfs2_symlink_inode_operations;
+		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+						    handle, data_ac, NULL,
+						    NULL);
+		if (status < 0) {
+			if (status != -ENOSPC && status != -EINTR) {
+				mlog(ML_ERROR, "Failed to extend file to "
+					       "%"MLFu64"\n",
+				     newsize);
+				mlog_errno(status);
+				status = -ENOSPC;
+			}
+			goto bail;
+		}
+		i_size_write(inode, newsize);
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+	} else {
+		inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		memcpy((char *) fe->id2.i_symlink, symname, l);
+		i_size_write(inode, newsize);
+		inode->i_blocks = 0;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_fast_symlink(inode)) {
+		status = ocfs2_create_symlink_data(osb, handle, inode,
+						   symname);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+	if (de_bh)
+		brelse(de_bh);
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if ((status < 0) && inode)
+		iput(inode);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+int ocfs2_check_dir_entry(struct inode *dir,
+			  struct ocfs2_dir_entry *de,
+			  struct buffer_head *bh,
+			  unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->rec_len);
+
+	if (rlen < OCFS2_DIR_REC_LEN(1))
+		error_msg = "rec_len is smaller than minimal";
+	else if (rlen % 4 != 0)
+		error_msg = "rec_len % 4 != 0";
+	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+		error_msg = "rec_len is too small for name_len";
+	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+		error_msg = "directory entry across blocks";
+
+	if (error_msg != NULL)
+		mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
+		     "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
+		     OCFS2_I(dir)->ip_blkno, error_msg, offset,
+		     le64_to_cpu(de->inode), rlen, de->name_len);
+	return error_msg == NULL ? 1 : 0;
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * If you pass me insert_bh, I'll skip the search of the other dir
+ * blocks and put the record in there.
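+ *
+ * An in-use record with surplus space gets split: e.g. a record with
+ * a 3-char name and rec_len 64 is trimmed to OCFS2_DIR_REC_LEN(3) ==
+ * 16 and the trailing 48 bytes become the new entry's record.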
+ */
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh)
+{
+	unsigned long offset;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de, *de1;
+	struct super_block *sb;
+	int retval, status;
+
+	mlog_entry_void();
+
+	sb = dir->i_sb;
+
+	if (!namelen)
+		return -EINVAL;
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
+	while (1) {
+		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
+		/* These checks should've already been done by the
+		 * prepare function, but we leave them here as a
+		 * sanity check anyway. */
+		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+			retval = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			retval = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			status = ocfs2_journal_access(handle, dir, insert_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			/* By now the buffer is marked for journaling */
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				de1 = (struct ocfs2_dir_entry *)((char *) de +
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de1->rec_len =
+					cpu_to_le16(le16_to_cpu(de->rec_len) -
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+				de = de1;
+			}
+			de->file_type = OCFS2_FT_UNKNOWN;
+			if (blkno) {
+				de->inode = cpu_to_le64(blkno);
+				ocfs2_set_de_type(de, inode->i_mode);
+			} else
+				de->inode = 0;
+			de->name_len = namelen;
+			memcpy(de->name, name, namelen);
+
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, insert_bh);
+			retval = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	/* when you think about it, the assert above should prevent us
+	 * from ever getting here. */
+	retval = -ENOSPC;
+bail:
+
+	mlog_exit(retval);
+	return retval;
+}
+
+
+/*
+ * ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
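+ * (e.g. deleting B from [A rec_len=16][B rec_len=48] grows A's
+ * rec_len to 64); when the victim is the first record in the block,
+ * its inode is simply set to 0 instead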
+ */
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh)
+{
+	struct ocfs2_dir_entry *de, *pde;
+	int i, status = -ENOENT;
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+
+	i = 0;
+	pde = NULL;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+			status = -EIO;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (de == de_del)  {
+			status = ocfs2_journal_access(handle, dir, bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			if (pde)
+				pde->rec_len =
+					cpu_to_le16(le16_to_cpu(pde->rec_len) +
+						    le16_to_cpu(de->rec_len));
+			else
+				de->inode = 0;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, bh);
+			goto bail;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir)
+{
+	struct ocfs2_dir_entry *de;
+	char *dlimit, *de_buf;
+	int de_len;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	de_buf = bh->b_data;
+	dlimit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		de = (struct ocfs2_dir_entry *) de_buf;
+
+		if (de_buf + namelen <= dlimit &&
+		    ocfs2_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+				ret = -1;
+				goto bail;
+			}
+			*res_dir = de;
+			ret = 1;
+			goto bail;
+		}
+
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0) {
+			ret = -1;
+			goto bail;
+		}
+
+		de_buf += de_len;
+		offset += de_len;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+
+	mlog_entry_void();
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	start = OCFS2_I(dir)->ip_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+
+				/* XXX: questionable readahead stuff here */
+				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh_use[ra_max] = bh;
+#if 0		/* ??? */
+				if (bh)
+					ll_rw_block(READ, 1, &bh);
+#endif
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			brelse(bh);
+			goto next;
+		}
+		i = ocfs2_search_dirblock(bh, dir, name, namelen,
+					  block << sb->s_blocksize_bits,
+					  res_dir);
+		if (i == 1) {
+			OCFS2_I(dir)->ip_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
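+/* Orphan directory entries are named by the inode's block number,
+ * zero-padded to 16 hex characters; e.g. block 0x1234 becomes
+ * "0000000000001234". */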
+static int ocfs2_blkno_stringify(u64 blkno, char *name)
+{
+	int status, namelen;
+
+	mlog_entry_void();
+
+	namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
+			   blkno);
+	if (namelen <= 0) {
+		if (namelen)
+			status = namelen;
+		else
+			status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	if (namelen != OCFS2_ORPHAN_NAMELEN) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
+	     namelen);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_handle_add_inode(handle, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+					      orphan_dir_bh, name,
+					      OCFS2_ORPHAN_NAMELEN, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	struct ocfs2_dinode *orphan_fe;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_read_block(osb,
+				  OCFS2_I(orphan_dir_inode)->ip_blkno,
+				  &orphan_dir_bh, OCFS2_BH_CACHED,
+				  orphan_dir_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* we're a cluster filesystem, so nlink can change on disk
+	 * underneath us... */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, 1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+				   OCFS2_ORPHAN_NAMELEN, inode,
+				   OCFS2_I(inode)->ip_blkno,
+				   orphan_dir_bh, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+
+	/* Record which orphan dir our inode now resides
+	 * in. delete_inode will use this to determine which orphan
+	 * dir to lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
+	     OCFS2_I(inode)->ip_blkno, osb->slot_num);
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh)
+{
+	char name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct ocfs2_dinode *orphan_fe;
+	int status = 0;
+	struct buffer_head *target_de_bh = NULL;
+	struct ocfs2_dir_entry *target_de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
+	     name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
+
+	/* find its spot in the orphan directory */
+	target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
+					orphan_dir_inode, &target_de);
+	if (!target_de_bh) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* remove it from the orphan directory */
+	status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
+				    target_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the i_nlink dance! :) */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, -1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (target_de_bh)
+		brelse(target_de_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+struct inode_operations ocfs2_dir_iops = {
+	.create		= ocfs2_create,
+	.lookup		= ocfs2_lookup,
+	.link		= ocfs2_link,
+	.unlink		= ocfs2_unlink,
+	.rmdir		= ocfs2_unlink,
+	.symlink	= ocfs2_symlink,
+	.mkdir		= ocfs2_mkdir,
+	.mknod		= ocfs2_mknod,
+	.rename		= ocfs2_rename,
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};

+ 58 - 0
fs/ocfs2/namei.h

@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_NAMEI_H
+#define OCFS2_NAMEI_H
+
+extern struct inode_operations ocfs2_dir_iops;
+
+struct dentry *ocfs2_get_parent(struct dentry *child);
+
+int ocfs2_check_dir_entry(struct inode *dir,
+			  struct ocfs2_dir_entry *de,
+			  struct buffer_head *bh,
+			  unsigned long offset);
+struct buffer_head *ocfs2_find_entry(const char *name,
+				     int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir);
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh);
+
+static inline int ocfs2_match(int len,
+			      const char * const name,
+			      struct ocfs2_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+#endif /* OCFS2_NAMEI_H */

+ 109 - 0
fs/ocfs2/ocfs1_fs_compat.h

@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs1_fs_compat.h
+ *
+ * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
+ * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
+ * This allows an OCFS1 volume to see the partition and cleanly fail to
+ * mount it.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS1_FS_COMPAT_H
+#define _OCFS1_FS_COMPAT_H
+
+#define OCFS1_MAX_VOL_SIGNATURE_LEN          128
+#define OCFS1_MAX_MOUNT_POINT_LEN            128
+#define OCFS1_MAX_VOL_ID_LENGTH               16
+#define OCFS1_MAX_VOL_LABEL_LEN               64
+#define OCFS1_MAX_CLUSTER_NAME_LEN            64
+
+#define OCFS1_MAJOR_VERSION              (2)
+#define OCFS1_MINOR_VERSION              (0)
+#define OCFS1_VOLUME_SIGNATURE		 "OracleCFS"
+
+/*
+ * OCFS1 superblock.  Lives at sector 0.
+ */
+struct ocfs1_vol_disk_hdr
+{
+/*00*/	__u32 minor_version;
+	__u32 major_version;
+/*08*/	__u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
+/*88*/	__u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
+/*108*/	__u64 serial_num;
+/*110*/	__u64 device_size;
+	__u64 start_off;
+/*120*/	__u64 bitmap_off;
+	__u64 publ_off;
+/*130*/	__u64 vote_off;
+	__u64 root_bitmap_off;
+/*140*/	__u64 data_start_off;
+	__u64 root_bitmap_size;
+/*150*/	__u64 root_off;
+	__u64 root_size;
+/*160*/	__u64 cluster_size;
+	__u64 num_nodes;
+/*170*/	__u64 num_clusters;
+	__u64 dir_node_size;
+/*180*/	__u64 file_node_size;
+	__u64 internal_off;
+/*190*/	__u64 node_cfg_off;
+	__u64 node_cfg_size;
+/*1A0*/	__u64 new_cfg_off;
+	__u32 prot_bits;
+	__s32 excl_mount;
+/*1B0*/
+};
+
+
+struct ocfs1_disk_lock
+{
+/*00*/	__u32 curr_master;
+	__u8 file_lock;
+	__u8 compat_pad[3];  /* Not in original definition.  Used to
+				make the already existing alignment
+				explicit */
+	__u64 last_write_time;
+/*10*/	__u64 last_read_time;
+	__u32 writer_node_num;
+	__u32 reader_node_num;
+/*20*/	__u64 oin_node_map;
+	__u64 dlock_seq_num;
+/*30*/
+};
+
+/*
+ * OCFS1 volume label.  Lives at sector 1.
+ */
+struct ocfs1_vol_label
+{
+/*00*/	struct ocfs1_disk_lock disk_lock;
+/*30*/	__u8 label[OCFS1_MAX_VOL_LABEL_LEN];
+/*70*/	__u16 label_len;
+/*72*/	__u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
+/*82*/	__u16 vol_id_len;
+/*84*/	__u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
+/*A4*/	__u16 cluster_name_len;
+/*A6*/
+};
+
+
+#endif /* _OCFS1_FS_COMPAT_H */
+

+ 464 - 0
fs/ocfs2/ocfs2.h

@@ -0,0 +1,464 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2.h
+ *
+ * Defines macros and structures used in OCFS2
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_H
+#define OCFS2_H
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "ocfs2_fs.h"
+#include "endian.h"
+#include "ocfs2_lockid.h"
+
+struct ocfs2_extent_map {
+	u32		em_clusters;
+	struct rb_root	em_extents;
+};
+
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * amount of inlined blocks to be stored on an array and grow the
+ * structure into a rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+
+struct ocfs2_caching_info {
+	unsigned int		ci_num_cached;
+	union {
+		sector_t	ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+		struct rb_root	ci_tree;
+	} ci_cache;
+};
+
+/* this limits us to 256 nodes
+ * if we need more, we can do a kmalloc for the map */
+#define OCFS2_NODE_MAP_MAX_NODES    256
+struct ocfs2_node_map {
+	u16 num_nodes;
+	unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
+};
+		for (i = 0; i < blocks; i++)
+enum ocfs2_ast_action {
+	OCFS2_AST_INVALID = 0,
+	OCFS2_AST_ATTACH,
+	OCFS2_AST_CONVERT,
+	OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+	OCFS2_UNLOCK_INVALID = 0,
+	OCFS2_UNLOCK_CANCEL_CONVERT,
+	OCFS2_UNLOCK_DROP_LOCK,
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define OCFS2_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					       * downconvert*/
+#define OCFS2_LOCK_LOCAL         (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING    (0x00000020)
+#define OCFS2_LOCK_INITIALIZED   (0x00000040) /* track initialization
+					       * for shutdown paths */
+#define OCFS2_LOCK_FREEING       (0x00000080) /* help dlmglue track
+					       * when to skip queueing
+					       * a lock because it's
+					       * about to be
+					       * dropped. */
+#define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+
+struct ocfs2_lock_res_ops;
+
+typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+
+struct ocfs2_lock_res {
+	void                    *l_priv;
+	struct ocfs2_lock_res_ops *l_ops;
+	spinlock_t               l_lock;
+
+	struct list_head         l_blocked_list;
+	struct list_head         l_mask_waiters;
+
+	enum ocfs2_lock_type     l_type;
+	unsigned long		 l_flags;
+	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	/* used from AST/BAST funcs. */
+	enum ocfs2_ast_action    l_action;
+	enum ocfs2_unlock_action l_unlock_action;
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct list_head         l_debug_list;
+};
+
+struct ocfs2_dlm_debug {
+	struct kref d_refcnt;
+	struct dentry *d_locking_state;
+	struct list_head d_lockres_tracking;
+};
+
+enum ocfs2_vol_state
+{
+	VOLUME_INIT = 0,
+	VOLUME_MOUNTED,
+	VOLUME_DISMOUNTED,
+	VOLUME_DISABLED
+};
+
+struct ocfs2_alloc_stats
+{
+	atomic_t moves;
+	atomic_t local_data;
+	atomic_t bitmap_data;
+	atomic_t bg_allocs;
+	atomic_t bg_extends;
+};
+
+enum ocfs2_local_alloc_state
+{
+	OCFS2_LA_UNUSED = 0,
+	OCFS2_LA_ENABLED,
+	OCFS2_LA_DISABLED
+};
+
+enum ocfs2_mount_options
+{
+	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
+	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
+	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
+	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
+#endif
+};
+
+#define OCFS2_OSB_SOFT_RO	0x0001
+#define OCFS2_OSB_HARD_RO	0x0002
+#define OCFS2_OSB_ERROR_FS	0x0004
+
+struct ocfs2_journal;
+struct ocfs2_journal_handle;
+struct ocfs2_super
+{
+	u32 osb_id;		/* id used by the proc interface */
+	struct task_struct *commit_task;
+	struct super_block *sb;
+	struct inode *root_inode;
+	struct inode *sys_root_inode;
+	struct inode *system_inodes[NUM_SYSTEM_INODES];
+
+	struct ocfs2_slot_info *slot_info;
+
+	spinlock_t node_map_lock;
+	struct ocfs2_node_map mounted_map;
+	struct ocfs2_node_map recovery_map;
+	struct ocfs2_node_map umount_map;
+
+	u32 num_clusters;
+	u64 root_blkno;
+	u64 system_dir_blkno;
+	u64 bitmap_blkno;
+	u32 bitmap_cpg;
+	u8 *uuid;
+	char *uuid_str;
+	u8 *vol_label;
+	u64 first_cluster_group_blkno;
+	u32 fs_generation;
+
+	u32 s_feature_compat;
+	u32 s_feature_incompat;
+	u32 s_feature_ro_compat;
+
+	/* Protects s_next_generation, osb_flags. Could protect more on
+	 * osb as it's very short lived. */
+	spinlock_t osb_lock;
+	u32 s_next_generation;
+	unsigned long osb_flags;
+
+	unsigned long s_mount_opt;
+
+	u16 max_slots;
+	u16 num_nodes;
+	s16 node_num;
+	s16 slot_num;
+	int s_sectsize_bits;
+	int s_clustersize;
+	int s_clustersize_bits;
+	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
+
+	atomic_t vol_state;
+	struct semaphore recovery_lock;
+	struct task_struct *recovery_thread_task;
+	int disable_recovery;
+	wait_queue_head_t checkpoint_event;
+	atomic_t needs_checkpoint;
+	struct ocfs2_journal *journal;
+
+	enum ocfs2_local_alloc_state local_alloc_state;
+	struct buffer_head *local_alloc_bh;
+
+	/* Next two fields are for local node slot recovery during
+	 * mount. */
+	int dirty;
+	struct ocfs2_dinode *local_alloc_copy;
+
+	struct ocfs2_alloc_stats alloc_stats;
+	char dev_str[20];		/* "major,minor" of the device */
+
+	struct dlm_ctxt *dlm;
+	struct ocfs2_lock_res osb_super_lockres;
+	struct ocfs2_lock_res osb_rename_lockres;
+	struct dlm_eviction_cb osb_eviction_cb;
+	struct ocfs2_dlm_debug *osb_dlm_debug;
+
+	struct dentry *osb_debug_root;
+
+	wait_queue_head_t recovery_event;
+
+	spinlock_t vote_task_lock;
+	struct task_struct *vote_task;
+	wait_queue_head_t vote_event;
+	unsigned long vote_wake_sequence;
+	unsigned long vote_work_sequence;
+
+	struct list_head blocked_lock_list;
+	unsigned long blocked_lock_count;
+
+	struct list_head vote_list;
+	int vote_count;
+
+	u32 net_key;
+	spinlock_t net_response_lock;
+	unsigned int net_response_ids;
+	struct list_head net_response_list;
+
+	struct o2hb_callback_func osb_hb_up;
+	struct o2hb_callback_func osb_hb_down;
+
+	struct list_head	osb_net_handlers;
+
+	wait_queue_head_t		osb_mount_event;
+
+	/* Truncate log info */
+	struct inode			*osb_tl_inode;
+	struct buffer_head		*osb_tl_bh;
+	struct work_struct		osb_truncate_log_wq;
+};
+
+#define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
+#define OCFS2_MAX_OSB_ID             65536
+
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
+		return 0;
+	return 1;
+}
+
+/* set / clear functions because cluster events can make these happen
+ * in parallel so we want the transitions to be atomic. this also
+ * means that any future flags osb_flags must be protected by spinlock
+ * too! */
+static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
+				      unsigned long flag)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags |= flag;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
+				     int hard)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
+	if (hard)
+		osb->osb_flags |= OCFS2_OSB_HARD_RO;
+	else
+		osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+#define OCFS2_IS_VALID_DINODE(ptr)					\
+	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
+	typeof(__di) ____di = (__di);					\
+	ocfs2_error((__sb), 						\
+		"Dinode # %"MLFu64" has bad signature %.*s",		\
+		(____di)->i_blkno, 7,					\
+		(____di)->i_signature);					\
+} while (0)
+
+#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
+	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
+	typeof(__eb) ____eb = (__eb);					\
+	ocfs2_error((__sb), 						\
+		"Extent Block # %"MLFu64" has bad signature %.*s",	\
+		(____eb)->h_blkno, 7,					\
+		(____eb)->h_signature);					\
+} while (0)
+
+#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
+	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
+	typeof(__gd) ____gd = (__gd);					\
+	ocfs2_error((__sb),						\
+		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
+		(____gd)->bg_blkno, 7,					\
+		(____gd)->bg_signature);				\
+} while (0)
+
+static inline unsigned long ino_from_blkno(struct super_block *sb,
+					   u64 blkno)
+{
+	return (unsigned long)(blkno & (u64)ULONG_MAX);
+}
+
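+/* For the conversion helpers below: e.g. with 4k clusters
+ * (s_clustersize_bits == 12) and 1k blocks (s_blocksize_bits == 10),
+ * c_to_b_bits is 2, so one cluster spans four blocks. */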
+static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
+					   u32 clusters)
+{
+	int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u64)clusters << c_to_b_bits;
+}
+
+static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
+					   u64 blocks)
+{
+	int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u32)(blocks >> b_to_c_bits);
+}
+
+static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
+						    u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	bytes += OCFS2_SB(sb)->s_clustersize - 1;
+	/* OCFS2 just cannot have enough clusters to overflow this */
+	clusters = (unsigned int)(bytes >> cl_bits);
+
+	return clusters;
+}
+
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
+static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
+					  u32 clusters)
+{
+	return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
+						u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	clusters = ocfs2_clusters_for_bytes(sb, bytes);
+	return (u64)clusters << cl_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
+{
+	u64 blocks;
+
+	blocks = ocfs2_blocks_for_bytes(sb, bytes);
+	return blocks << sb->s_blocksize_bits;
+}
+
+static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
+{
+	return (unsigned long)((bytes + 511) >> 9);
+}
+
+#define ocfs2_set_bit ext2_set_bit
+#define ocfs2_clear_bit ext2_clear_bit
+#define ocfs2_test_bit ext2_test_bit
+#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#endif  /* OCFS2_H */
+

+ 638 - 0
fs/ocfs2/ocfs2_fs.h

@@ -0,0 +1,638 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_fs.h
+ *
+ * On-disk structures for OCFS2.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS2_FS_H
+#define _OCFS2_FS_H
+
+/* Version */
+#define OCFS2_MAJOR_REV_LEVEL		0
+#define OCFS2_MINOR_REV_LEVEL          	90
+
+/*
+ * An OCFS2 volume starts this way:
+ * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
+ * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
+ * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
+ *
+ * All other structures are found from the superblock information.
+ *
+ * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors.  eg, for a
+ * blocksize of 2K, it is 4096 bytes into disk.
+ */
+#define OCFS2_SUPER_BLOCK_BLKNO		2
+
+/*
+ * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
+ * grow if needed.
+ */
+#define OCFS2_MIN_CLUSTERSIZE		4096
+#define OCFS2_MAX_CLUSTERSIZE		1048576
+
+/*
+ * Blocks cannot be bigger than clusters, so the maximum blocksize is the
+ * minimum cluster size.
+ */
+#define OCFS2_MIN_BLOCKSIZE		512
+#define OCFS2_MAX_BLOCKSIZE		OCFS2_MIN_CLUSTERSIZE
+
+/* Filesystem magic number */
+#define OCFS2_SUPER_MAGIC		0x7461636f
+
+/* Object signatures */
+#define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
+#define OCFS2_INODE_SIGNATURE		"INODE01"
+#define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+
+/* Compatibility flags */
+#define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_compat & (mask) )
+#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
+#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_incompat & (mask) )
+#define OCFS2_SET_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat |= (mask)
+#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
+#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat |= (mask)
+#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat &= ~(mask)
+#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
+#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
+
+#define OCFS2_FEATURE_COMPAT_SUPP	0
+#define OCFS2_FEATURE_INCOMPAT_SUPP	0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	0
+
+/*
+ * Heartbeat-only devices are missing journals and other files.  The
+ * filesystem driver can't load them, but the library can.  Never put
+ * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
+ */
+#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV	0x0002
+
+
+/*
+ * Flags on ocfs2_dinode.i_flags
+ */
+#define OCFS2_VALID_FL		(0x00000001)	/* Inode is valid */
+#define OCFS2_UNUSED2_FL	(0x00000002)
+#define OCFS2_ORPHANED_FL	(0x00000004)	/* On the orphan list */
+#define OCFS2_UNUSED3_FL	(0x00000008)
+/* System inode flags */
+#define OCFS2_SYSTEM_FL		(0x00000010)	/* System inode */
+#define OCFS2_SUPER_BLOCK_FL	(0x00000020)	/* Super block */
+#define OCFS2_LOCAL_ALLOC_FL	(0x00000040)	/* Slot local alloc bitmap */
+#define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
+#define OCFS2_JOURNAL_FL	(0x00000100)	/* Slot local journal */
+#define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
+#define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
+#define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
+
+/*
+ * superblock s_state flags
+ */
+#define OCFS2_ERROR_FS		(0x00000001)	/* FS saw errors */
+
+/* Limit of space in ocfs2_dir_entry */
+#define OCFS2_MAX_FILENAME_LEN		255
+
+/* Maximum slots on an ocfs2 file system */
+#define OCFS2_MAX_SLOTS			255
+
+/* Slot map indicator for an empty slot */
+#define OCFS2_INVALID_SLOT		-1
+
+#define OCFS2_VOL_UUID_LEN		16
+#define OCFS2_MAX_VOL_LABEL_LEN		64
+
+/* Journal limits (in bytes) */
+#define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
+#define OCFS2_MAX_JOURNAL_SIZE		(500 * 1024 * 1024)
+
+struct ocfs2_system_inode_info {
+	char	*si_name;
+	int	si_iflags;
+	int	si_mode;
+};
+
+/* System file index */
+enum {
+	BAD_BLOCK_SYSTEM_INODE = 0,
+	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+	SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
+	HEARTBEAT_SYSTEM_INODE,
+	GLOBAL_BITMAP_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	ORPHAN_DIR_SYSTEM_INODE,
+	EXTENT_ALLOC_SYSTEM_INODE,
+	INODE_ALLOC_SYSTEM_INODE,
+	JOURNAL_SYSTEM_INODE,
+	LOCAL_ALLOC_SYSTEM_INODE,
+	TRUNCATE_LOG_SYSTEM_INODE,
+	NUM_SYSTEM_INODES
+};
+
+static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
+	/* Global system inodes (single copy) */
+	/* The first two are only used from userspace mkfs/tunefs */
+	[BAD_BLOCK_SYSTEM_INODE]		= { "bad_blocks", 0, S_IFREG | 0644 },
+	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	= { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+
+	/* These are used by the running filesystem */
+	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
+	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
+	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+
+	/* Slot-specific system inodes (one copy per slot) */
+	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
+	[EXTENT_ALLOC_SYSTEM_INODE]		= { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
+	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+};
+
+/* Parameter passed from mount.ocfs2 to module */
+#define OCFS2_HB_NONE			"heartbeat=none"
+#define OCFS2_HB_LOCAL			"heartbeat=local"
+
+/*
+ * OCFS2 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define OCFS2_FT_UNKNOWN	0
+#define OCFS2_FT_REG_FILE	1
+#define OCFS2_FT_DIR		2
+#define OCFS2_FT_CHRDEV		3
+#define OCFS2_FT_BLKDEV		4
+#define OCFS2_FT_FIFO		5
+#define OCFS2_FT_SOCK		6
+#define OCFS2_FT_SYMLINK	7
+
+#define OCFS2_FT_MAX		8
+
+/*
+ * OCFS2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define OCFS2_DIR_PAD			4
+#define OCFS2_DIR_ROUND			(OCFS2_DIR_PAD - 1)
+#define OCFS2_DIR_MEMBER_LEN 		offsetof(struct ocfs2_dir_entry, name)
+#define OCFS2_DIR_REC_LEN(name_len)	(((name_len) + OCFS2_DIR_MEMBER_LEN + \
+                                          OCFS2_DIR_ROUND) & \
+					 ~OCFS2_DIR_ROUND)
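+/* e.g. with the 12-byte fixed header of struct ocfs2_dir_entry,
+ * OCFS2_DIR_REC_LEN(1) == (1 + 12 + 3) & ~3 == 16 */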
+
+#define OCFS2_LINK_MAX		32000
+
+#define S_SHIFT			12
+static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
+	[S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
+};
+
+
+/*
+ * Convenience casts
+ */
+#define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
+
+/*
+ * On disk extent record for OCFS2
+ * It describes a range of clusters on disk.
+ */
+struct ocfs2_extent_rec {
+/*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
+	__le32 e_clusters;	/* Clusters covered by this extent */
+	__le64 e_blkno;		/* Physical disk offset, in blocks */
+/*10*/
+};
+
+struct ocfs2_chain_rec {
+	__le32 c_free;	/* Number of free bits in this chain. */
+	__le32 c_total;	/* Number of total bits in this chain */
+	__le64 c_blkno;	/* Physical disk offset (blocks) of 1st group */
+};
+
+struct ocfs2_truncate_rec {
+	__le32 t_start;		/* 1st cluster in this log */
+	__le32 t_clusters;	/* Number of total clusters covered */
+};
+
+/*
+ * On disk extent list for OCFS2 (node in the tree).  Note that this
+ * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
+ * offsets are relative to ocfs2_dinode.id2.i_list or
+ * ocfs2_extent_block.h_list, respectively.
+ */
+struct ocfs2_extent_list {
+/*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
+					   point.  0 means data extents
+					   hang directly off this
+					   header (a leaf) */
+	__le16 l_count;			/* Number of extent records */
+	__le16 l_next_free_rec;		/* Next unused extent slot */
+	__le16 l_reserved1;
+	__le64 l_reserved2;		/* Pad to
+					   sizeof(ocfs2_extent_rec) */
+/*10*/	struct ocfs2_extent_rec l_recs[0];	/* Extent records */
+};
+
+/*
+ * On disk allocation chain list for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_chain.
+ */
+struct ocfs2_chain_list {
+/*00*/	__le16 cl_cpg;			/* Clusters per Block Group */
+	__le16 cl_bpc;			/* Bits per cluster */
+	__le16 cl_count;		/* Total chains in this list */
+	__le16 cl_next_free_rec;	/* Next unused chain slot */
+	__le64 cl_reserved1;
+/*10*/	struct ocfs2_chain_rec cl_recs[0];	/* Chain records */
+};
+
+/*
+ * On disk deallocation log for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_dealloc.
+ */
+struct ocfs2_truncate_log {
+/*00*/	__le16 tl_count;		/* Total records in this log */
+	__le16 tl_used;			/* Number of records in use */
+	__le32 tl_reserved1;
+/*08*/	struct ocfs2_truncate_rec tl_recs[0];	/* Truncate records */
+};
+
+/*
+ * On disk extent block (indirect block) for OCFS2
+ */
+struct ocfs2_extent_block
+{
+/*00*/	__u8 h_signature[8];		/* Signature for verification */
+	__le64 h_reserved1;
+/*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
+					   extent_header belongs to */
+	__le16 h_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 h_fs_generation;		/* Must match super block */
+	__le64 h_blkno;			/* Offset on disk, in blocks */
+/*20*/	__le64 h_reserved3;
+	__le64 h_next_leaf_blk;		/* Offset on disk, in blocks,
+					   of next leaf header pointing
+					   to data */
+/*30*/	struct ocfs2_extent_list h_list;	/* Extent record list */
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk superblock for OCFS2
+ * Note that it is contained inside an ocfs2_dinode, so all offsets
+ * are relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_super_block {
+/*00*/	__le16 s_major_rev_level;
+	__le16 s_minor_rev_level;
+	__le16 s_mnt_count;
+	__le16 s_max_mnt_count;
+	__le16 s_state;			/* File system state */
+	__le16 s_errors;			/* Behaviour when detecting errors */
+	__le32 s_checkinterval;		/* Max time between checks */
+/*10*/	__le64 s_lastcheck;		/* Time of last check */
+	__le32 s_creator_os;		/* OS */
+	__le32 s_feature_compat;		/* Compatible feature set */
+/*20*/	__le32 s_feature_incompat;	/* Incompatible feature set */
+	__le32 s_feature_ro_compat;	/* Readonly-compatible feature set */
+	__le64 s_root_blkno;		/* Offset, in blocks, of root directory
+					   dinode */
+/*30*/	__le64 s_system_dir_blkno;	/* Offset, in blocks, of system
+					   directory dinode */
+	__le32 s_blocksize_bits;		/* Blocksize for this fs */
+	__le32 s_clustersize_bits;	/* Clustersize for this fs */
+/*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
+					   before tunefs required */
+	__le16 s_reserved1;
+	__le32 s_reserved2;
+	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
+					 * group header */
+/*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
+/*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
+/*A0*/
+};
+
+/*
+ * Local allocation bitmap for OCFS2 slots
+ * Note that it exists inside an ocfs2_dinode, so all offsets are
+ * relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_local_alloc
+{
+/*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
+	__le16 la_size;		/* Size of included bitmap, in bytes */
+	__le16 la_reserved1;
+	__le64 la_reserved2;
+/*10*/	__u8   la_bitmap[0];
+};
+
+/*
+ * On disk inode for OCFS2
+ */
+struct ocfs2_dinode {
+/*00*/	__u8 i_signature[8];		/* Signature for validation */
+	__le32 i_generation;		/* Generation number */
+	__le16 i_suballoc_slot;		/* Slot suballocator this inode
+					   belongs to */
+	__le16 i_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+/*10*/	__le32 i_reserved0;
+	__le32 i_clusters;		/* Cluster count */
+	__le32 i_uid;			/* Owner UID */
+	__le32 i_gid;			/* Owning GID */
+/*20*/	__le64 i_size;			/* Size in bytes */
+	__le16 i_mode;			/* File mode */
+	__le16 i_links_count;		/* Links count */
+	__le32 i_flags;			/* File flags */
+/*30*/	__le64 i_atime;			/* Access time */
+	__le64 i_ctime;			/* Creation time */
+/*40*/	__le64 i_mtime;			/* Modification time */
+	__le64 i_dtime;			/* Deletion time */
+/*50*/	__le64 i_blkno;			/* Offset on disk, in blocks */
+	__le64 i_last_eb_blk;		/* Pointer to last extent
+					   block */
+/*60*/	__le32 i_fs_generation;		/* Generation per fs-instance */
+	__le32 i_atime_nsec;
+	__le32 i_ctime_nsec;
+	__le32 i_mtime_nsec;
+/*70*/	__le64 i_reserved1[9];
+/*B8*/	union {
+		__le64 i_pad1;		/* Generic way to refer to this
+					   64bit union */
+		struct {
+			__le64 i_rdev;	/* Device number */
+		} dev1;
+		struct {		/* Info for bitmap system
+					   inodes */
+			__le32 i_used;	/* Bits (ie, clusters) used  */
+			__le32 i_total;	/* Total bits (clusters)
+					   available */
+		} bitmap1;
+		struct {		/* Info for journal system
+					   inodes */
+			__le32 ij_flags;	/* Mounted, version, etc. */
+			__le32 ij_pad;
+		} journal1;
+	} id1;				/* Inode type dependent 1 */
+/*C0*/	union {
+		struct ocfs2_super_block	i_super;
+		struct ocfs2_local_alloc	i_lab;
+		struct ocfs2_chain_list		i_chain;
+		struct ocfs2_extent_list	i_list;
+		struct ocfs2_truncate_log	i_dealloc;
+		__u8               		i_symlink[0];
+	} id2;
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On-disk directory entry structure for OCFS2
+ *
+ * Packed as this structure could be accessed unaligned on 64-bit platforms
+ */
+struct ocfs2_dir_entry {
+/*00*/	__le64   inode;                  /* Inode number */
+	__le16   rec_len;                /* Directory entry length */
+	__u8    name_len;               /* Name length */
+	__u8    file_type;
+/*0C*/	char    name[OCFS2_MAX_FILENAME_LEN];   /* File name */
+/* Actual on-disk length specified by rec_len */
+} __attribute__ ((packed));
+
+/*
+ * On disk allocator group structure for OCFS2
+ */
+struct ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__le16   bg_size;                /* Size of included bitmap in
+					   bytes. */
+	__le16   bg_bits;                /* Bits represented by this
+					   group. */
+	__le16	bg_free_bits_count;     /* Free bits count */
+	__le16   bg_chain;               /* What chain I am in. */
+/*10*/	__le32   bg_generation;
+	__le32	bg_reserved1;
+	__le64   bg_next_group;          /* Next group in my list, in
+					   blocks */
+/*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
+					   blocks */
+	__le64   bg_blkno;               /* Offset on disk, in blocks */
+/*30*/	__le64   bg_reserved2[2];
+/*40*/	__u8    bg_bitmap[0];
+};
+
+#ifdef __KERNEL__
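+/* The fast symlink area is whatever remains of the dinode block after
+ * the id2 union (offset 0xC0); e.g. a 4k block leaves
+ * 4096 - 192 == 3904 inline characters. */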
+static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
+{
+	return  sb->s_blocksize -
+		 offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
+{
+	u16 size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#else
+static inline int ocfs2_fast_symlink_chars(int blocksize)
+{
+	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline int ocfs2_extent_recs_per_eb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_local_alloc_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#endif  /* __KERNEL__ */
+
+
+static inline int ocfs2_system_inode_is_global(int type)
+{
+	return ((type >= 0) &&
+		(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
+}
+
+static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
+						  int type, int slot)
+{
+	int chars;
+
+        /*
+         * Global system inodes can only have one copy.  Everything
+         * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
+         * list has a copy per slot.
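+         *
+         * For example, type ORPHAN_DIR_SYSTEM_INODE with slot 3
+         * expands "orphan_dir:%04d" into "orphan_dir:0003".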
+         */
+	if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name);
+	else
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name,
+				 slot);
+
+	return chars;
+}
+
+static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
+				    umode_t mode)
+{
+	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+#endif  /* _OCFS2_FS_H */
+

+ 73 - 0
fs/ocfs2/ocfs2_lockid.h

@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockid.h
+ *
+ * Defines OCFS2 lockid bits.
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKID_H
+#define OCFS2_LOCKID_H
+
+/* lock ids are made up in the following manner:
+ * name[0]     --> type
+ * name[1-6]   --> 6 pad characters, reserved for now
+ * name[7-22]  --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex 8 chars
+ * name[31]    --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN  32
+#define OCFS2_LOCK_ID_PAD "000000"
+
+enum ocfs2_lock_type {
+	OCFS2_LOCK_TYPE_META = 0,
+	OCFS2_LOCK_TYPE_DATA,
+	OCFS2_LOCK_TYPE_SUPER,
+	OCFS2_LOCK_TYPE_RENAME,
+	OCFS2_LOCK_TYPE_RW,
+	OCFS2_NUM_LOCK_TYPES
+};
+
+static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
+{
+	char c;
+	switch (type) {
+		case OCFS2_LOCK_TYPE_META:
+			c = 'M';
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			c = 'D';
+			break;
+		case OCFS2_LOCK_TYPE_SUPER:
+			c = 'S';
+			break;
+		case OCFS2_LOCK_TYPE_RENAME:
+			c = 'R';
+			break;
+		case OCFS2_LOCK_TYPE_RW:
+			c = 'W';
+			break;
+		default:
+			c = '\0';
+	}
+
+	return c;
+}
+
+#endif  /* OCFS2_LOCKID_H */

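The name layout documented at the top of ocfs2_lockid.h can be reproduced with a single snprintf(). A hedged sketch follows; the format string is inferred from the layout comment (the kernel's actual formatting lives elsewhere, in dlmglue.c), while the two macros mirror the definitions above:

#include <stdio.h>

#define OCFS2_LOCK_ID_MAX_LEN	32	/* mirrors ocfs2_lockid.h */
#define OCFS2_LOCK_ID_PAD	"000000"

/* Build a lock name: type char, 6 pad chars, 16 hex digits of block
 * number, 8 hex digits of generation, then the trailing NUL. */
static void build_lock_name(char *name, char type,
			    unsigned long long blkno, unsigned int gen)
{
	snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
		 type, OCFS2_LOCK_ID_PAD, blkno, gen);
}

int main(void)
{
	char name[OCFS2_LOCK_ID_MAX_LEN];

	build_lock_name(name, 'M', 0x1234ULL, 0xdeadbeef);
	printf("%s\n", name);	/* 'M', 18 zeros, "1234", "deadbeef" */
	return 0;
}

The 1 + 6 + 16 + 8 characters plus the NUL account for exactly OCFS2_LOCK_ID_MAX_LEN bytes, which is why the comment reserves name[31] for '\0'.
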
+ 303 - 0
fs/ocfs2/slot_map.c

@@ -0,0 +1,303 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ *
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global);
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num);
+
+/* Use the slot information we've collected to create a map of mounted
+ * nodes. Should be holding an EX lock on the super block; this assumes
+ * the slot info is up to date. Note that we call this *after* we find
+ * a slot, so our own node should be set in the map too... */
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
+{
+	int i;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	spin_lock(&si->si_lock);
+
+	for (i = 0; i < si->si_size; i++)
+		if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
+			ocfs2_node_map_set_bit(osb, &osb->mounted_map,
+					      si->si_global_node_nums[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* copy the slot information on disk into our in-memory slot_info struct. */
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+	int i;
+	__le16 *disk_info;
+
+	/* we don't read the slot block here as ocfs2_super_lock
+	 * should've made sure we have the most recent copy. */
+	spin_lock(&si->si_lock);
+	disk_info = (__le16 *) si->si_bh->b_data;
+
+	for (i = 0; i < si->si_size; i++)
+		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* post our slot info into its destination bh and write it
+ * out. */
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si)
+{
+	int status, i;
+	__le16 *disk_info = (__le16 *) si->si_bh->b_data;
+
+	spin_lock(&si->si_lock);
+	for (i = 0; i < si->si_size; i++)
+		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/* try to find global node in the slot info. Returns
+ * OCFS2_INVALID_SLOT if nothing is found. */
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global)
+{
+	int i;
+	s16 ret = OCFS2_INVALID_SLOT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (global == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
+{
+	int i;
+	s16 ret = OCFS2_INVALID_SLOT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global)
+{
+	s16 ret;
+
+	spin_lock(&si->si_lock);
+	ret = __ocfs2_node_num_to_slot(si, global);
+	spin_unlock(&si->si_lock);
+	return ret;
+}
+
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	BUG_ON(slot_num >= si->si_num_slots);
+	BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
+	       (node_num >= O2NM_MAX_NODES));
+
+	si->si_global_node_nums[slot_num] = node_num;
+}
+
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num)
+{
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
+	spin_unlock(&si->si_lock);
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+	int status, i;
+	u64 blkno;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_slot_info *si;
+
+	si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+	if (!si) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock_init(&si->si_lock);
+	si->si_num_slots = osb->max_slots;
+	si->si_size = OCFS2_MAX_SLOTS;
+
+	for(i = 0; i < si->si_num_slots; i++)
+		si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+
+	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	si->si_inode = inode;
+	si->si_bh = bh;
+	osb->slot_info = si;
+bail:
+	if (status < 0 && si)
+		ocfs2_free_slot_info(si);
+
+	return status;
+}
+
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+	if (si->si_inode)
+		iput(si->si_inode);
+	if (si->si_bh)
+		brelse(si->si_bh);
+	kfree(si);
+}
+
+int ocfs2_find_slot(struct ocfs2_super *osb)
+{
+	int status;
+	s16 slot;
+	struct ocfs2_slot_info *si;
+
+	mlog_entry_void();
+
+	si = osb->slot_info;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn the user */
+	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+	if (slot == OCFS2_INVALID_SLOT) {
+		/* if no slot yet, then just take 1st available
+		 * one. */
+		slot = __ocfs2_find_empty_slot(si);
+		if (slot == OCFS2_INVALID_SLOT) {
+			spin_unlock(&si->si_lock);
+			mlog(ML_ERROR, "no free slots available!\n");
+			status = -EINVAL;
+			goto bail;
+		}
+	} else
+		mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
+		     slot);
+
+	__ocfs2_fill_slot(si, slot, osb->node_num);
+	osb->slot_num = slot;
+	spin_unlock(&si->si_lock);
+
+	mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_put_slot(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (!si)
+		return;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+	osb->slot_num = OCFS2_INVALID_SLOT;
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	osb->slot_info = NULL;
+	ocfs2_free_slot_info(si);
+}
+

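The on-disk slot map that ocfs2_update_slot_info() and ocfs2_update_disk_slots() shuttle back and forth is nothing more than an array of little-endian 16-bit node numbers indexed by slot. A toy userspace decoder, assuming -1 is the value behind OCFS2_INVALID_SLOT (an assumption here; the kernel uses le16_to_cpu() rather than open-coded byte math):

#include <stdio.h>
#include <stdint.h>

#define DEMO_SLOTS	4	/* stand-in; the real cap is OCFS2_MAX_SLOTS */
#define INVALID_SLOT	(-1)	/* assumed value of OCFS2_INVALID_SLOT */

/* Decode one little-endian 16-bit entry from a raw slot map block,
 * the same layout ocfs2_update_slot_info() reads. */
static int16_t slot_entry(const uint8_t *block, int slot)
{
	return (int16_t)(block[2 * slot] | (block[2 * slot + 1] << 8));
}

int main(void)
{
	uint8_t block[2 * DEMO_SLOTS] = { 0xff, 0xff, 0x03, 0x00,
					  0xff, 0xff, 0xff, 0xff };
	for (int i = 0; i < DEMO_SLOTS; i++) {
		int16_t node = slot_entry(block, i);
		if (node == INVALID_SLOT)
			printf("slot %d: empty\n", i);
		else
			printf("slot %d: node %d\n", i, node);
	}
	return 0;
}

Here slot 1 is owned by node 3 and every other slot is free, which is the state a single-node mount would leave behind.
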
+ 66 - 0
fs/ocfs2/slot_map.h

@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.h
+ *
+ * Routines and structures for managing the OCFS2 node slot map.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+struct ocfs2_slot_info {
+	spinlock_t si_lock;
+
+	struct inode *si_inode;
+	struct buffer_head *si_bh;
+	unsigned int si_num_slots;
+	unsigned int si_size;
+	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+};
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb);
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+
+int ocfs2_find_slot(struct ocfs2_super *osb);
+void ocfs2_put_slot(struct ocfs2_super *osb);
+
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si);
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global);
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num);
+
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
+
+static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
+				      int slot_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	assert_spin_locked(&si->si_lock);
+
+	return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
+}
+
+#endif

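Putting the two files together, mount-time slot claiming boils down to: reuse our slot if the map still lists our node, otherwise take the first empty one. A userspace model of that policy (stand-in types; the kernel version does this under si->si_lock and then writes the map back with ocfs2_update_disk_slots()):

#include <stdio.h>

#define INVALID (-1)

/* Toy model of the slot-claiming policy in ocfs2_find_slot(): reuse
 * our old slot if the map still lists us, otherwise grab the first
 * empty slot. */
static int find_slot(int *map, int nslots, int me)
{
	int i, empty = INVALID;

	for (i = 0; i < nslots; i++) {
		if (map[i] == me)
			return i;		/* already ours: reuse it */
		if (empty == INVALID && map[i] == INVALID)
			empty = i;		/* remember first free slot */
	}
	if (empty != INVALID)
		map[empty] = me;		/* claim it */
	return empty;				/* INVALID if map is full */
}

int main(void)
{
	int map[4] = { 7, INVALID, INVALID, 2 };

	printf("%d\n", find_slot(map, 4, 2));	/* 3: node 2 reuses slot 3 */
	printf("%d\n", find_slot(map, 4, 9));	/* 1: first empty slot */
	return 0;
}

One difference from this sketch: when the map is full, ocfs2_find_slot() fails the mount with -EINVAL instead of handing back an invalid slot.
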
+ 1651 - 0
fs/ocfs2/suballoc.c

@@ -0,0 +1,1651 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ * Inspired by ext3 block groups.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh);
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac);
+
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found);
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found);
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno);
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr);
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found);
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits);
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits);
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain);
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted);
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count);
+static inline u64 ocfs2_which_suballoc_group(u64 block,
+					     unsigned int bit);
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off);
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster);
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off);
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+	if (ac->ac_inode)
+		iput(ac->ac_inode);
+	if (ac->ac_bh)
+		brelse(ac->ac_bh);
+	kfree(ac);
+}
+
+static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
+{
+	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
+}
+
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl)
+{
+	int status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+
+	mlog_entry_void();
+
+	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
+		ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
+			    "!= b_blocknr (%llu)", group_blkno,
+			    (unsigned long long) bg_bh->b_blocknr);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      bg_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	memset(bg, 0, sb->s_blocksize);
+	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
+	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	bg->bg_chain = cpu_to_le16(my_chain);
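+	/* c_blkno and bg_next_group are both on-disk little-endian
+	 * values, so this copy needs no byte-swapping */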
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
+	bg->bg_blkno = cpu_to_le64(group_blkno);
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
+	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* There is no need to zero out or otherwise initialize the
+	 * other blocks in a group - All valid FS metadata in a block
+	 * group stores the superblock fs_generation value at
+	 * allocation time. */
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_count)) {
+		if (le32_to_cpu(cl->cl_recs[best].c_total) >
+		    le32_to_cpu(cl->cl_recs[curr].c_total))
+			best = curr;
+		curr++;
+	}
+	return best;
+}
+
+/*
+ * We expect the block group allocator to already be locked.
+ */
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh)
+{
+	int status, credits;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_alloc_context *ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+
+	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	cl = &fe->id2.i_chain;
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					le16_to_cpu(cl->cl_cpg),
+					&ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	credits = ocfs2_calc_group_alloc_credits(osb->sb,
+						 le16_to_cpu(cl->cl_cpg));
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_claim_clusters(osb,
+				      handle,
+				      ac,
+				      le16_to_cpu(cl->cl_cpg),
+				      &bit_off,
+				      &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
+	     alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
+
+	status = ocfs2_block_group_fill(handle,
+					alloc_inode,
+					bg_bh,
+					bg_blkno,
+					alloc_rec,
+					cl);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs2_journal_access(handle, alloc_inode,
+				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
+		     le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
+	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
+	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
+					le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
+	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
+	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
+					     le32_to_cpu(fe->i_clusters)));
+	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
+	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
+	alloc_inode->i_blocks =
+		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac)
+{
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct ocfs2_dinode *fe;
+	u32 free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_add_inode(handle, alloc_inode);
+	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
+		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
+			    "# %"MLFu64, le64_to_cpu(fe->i_blkno));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
+		le32_to_cpu(fe->id1.bitmap1.i_used);
+
+	if (bits_wanted > free_bits) {
+		/* cluster bitmap never grows */
+		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
+			     bits_wanted, free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		atomic_inc(&osb->alloc_stats.bg_extends);
+
+		/* You should never ask for this much metadata */
+		BUG_ON(bits_wanted >
+		       (le32_to_cpu(fe->id1.bitmap1.i_total)
+			- le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+
+	get_bh(bh);
+	ac->ac_bh = bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_META;
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  0);
+#else
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+#endif
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, (*ac));
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = 1;
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_INODE;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  INODE_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* The local alloc code has to do the same thing, so rather than
+ * duplicate it, we share this helper. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac)
+{
+	int status;
+
+	ac->ac_inode = ocfs2_get_system_file_inode(osb,
+						   GLOBAL_BITMAP_SYSTEM_INODE,
+						   OCFS2_INVALID_SLOT);
+	if (!ac->ac_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+	ac->ac_which = OCFS2_AC_USE_MAIN;
+	ac->ac_group_search = ocfs2_cluster_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, ac);
+	if (status < 0 && status != -ENOSPC)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/* Callers don't need to care which bitmap (local alloc or main) to
+ * use so we figure it out for them, but unfortunately this clutters
+ * things a bit. */
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	int status;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_handle = handle;
+
+	status = -ENOSPC;
+	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+		status = ocfs2_reserve_local_alloc_bits(osb,
+							handle,
+							bits_wanted,
+							*ac);
+		if ((status < 0) && (status != -ENOSPC)) {
+			mlog_errno(status);
+			goto bail;
+		} else if (status == -ENOSPC) {
+			/* ocfs2_reserve_local_alloc_bits() will return
+			 * -ENOSPC with the local alloc inode still
+			 * locked, so we can change this safely here. */
+			mlog(0, "Disabling local alloc\n");
+			/* We set to OCFS2_LA_DISABLED so that umount
+			 * can clean up what's left of the local
+			 * allocation */
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+	}
+
+	if (status == -ENOSPC) {
+		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * More or less lifted from ext3. I'll leave their description below:
+ *
+ * "For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes."
+ *
+ * Note: OCFS2 already does this differently for metadata vs data
+ * allocations, as those bitmaps are separate and undo access is never
+ * called on a metadata group descriptor.
+ */
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr)
+{
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
+		return 0;
+	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+		return 1;
+
+	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+}
+
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found)
+{
+	void *bitmap;
+	u16 best_offset, best_size;
+	int offset, start, found, status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
+		return -EIO;
+	}
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+
+	while((offset = ocfs2_find_next_zero_bit(bitmap,
+						 le16_to_cpu(bg->bg_bits),
+						 start)) != -1) {
+		if (offset == le16_to_cpu(bg->bg_bits))
+			break;
+
+		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
+			/* We found a zero, but we can't use it as it
+			 * hasn't been put to disk yet! */
+			found = 0;
+			start = offset + 1;
+		} else if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+		/* we got everything we needed */
+		if (found == bits_wanted) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	/* XXX: I think the first clause is equivalent to the second
+	 * 	- jlbec */
+	if (found == bits_wanted) {
+		*bit_off = start - found;
+		*bits_found = found;
+	} else if (best_size) {
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		/* No error log here -- see the comment above
+		 * ocfs2_test_bg_bit_allocatable */
+	}
+
+	return status;
+}
+
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits)
+{
+	int status;
+	void *bitmap = bg->bg_bitmap;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+	     num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+
+	while(num_bits--)
+		ocfs2_set_bit(bit_off++, bitmap);
+
+	status = ocfs2_journal_dirty(handle,
+				     group_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* find the one with the most empty bits */
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	BUG_ON(!cl->cl_next_free_rec);
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
+		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
+		    le32_to_cpu(cl->cl_recs[best].c_free))
+			best = curr;
+		curr++;
+	}
+
+	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
+	return best;
+}
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain)
+{
+	int status;
+	/* there is a really tiny chance the journal calls could fail,
+	 * but we wouldn't want inconsistent blocks in *any* case. */
+	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
+		status = -EIO;
+		goto out;
+	}
+
+	mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
+	     "top, prev = %"MLFu64"\n",
+	     fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
+
+	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
+	bg_ptr = le64_to_cpu(bg->bg_next_group);
+	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
+
+	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	prev_bg->bg_next_group = bg->bg_next_group;
+
+	status = ocfs2_journal_dirty(handle, prev_bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = 0;
+out_rollback:
+	if (status < 0) {
+		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
+		bg->bg_next_group = cpu_to_le64(bg_ptr);
+		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	}
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted)
+{
+	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
+}
+
+/* return 0 on success, -ENOSPC to keep searching and any other < 0
+ * value on error. */
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found)
+{
+	int search = -ENOSPC;
+	int ret;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	u16 tmp_off, tmp_found;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count) {
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							&tmp_off, &tmp_found);
+		if (ret)
+			return ret;
+
+		/* ocfs2_block_group_find_clear_bits() might
+		 * return success, but we still want to return
+		 * -ENOSPC unless it found the minimum number
+		 * of bits. */
+		if (min_bits <= tmp_found) {
+			*bit_off = tmp_off;
+			*bits_found = tmp_found;
+			search = 0; /* success */
+		}
+	}
+
+	return search;
+}
+
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found)
+{
+	int ret = -ENOSPC;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+
+	BUG_ON(min_bits != 1);
+	BUG_ON(ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count)
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							bit_off, bits_found);
+
+	return ret;
+}
+
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno)
+{
+	int status;
+	u16 chain, tmp_bits;
+	u32 tmp_used;
+	u64 next_group;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+	struct ocfs2_group_desc *bg;
+
+	chain = ac->ac_chain;
+	mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
+	     bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
+
+	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
+				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = -ENOSPC;
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	while ((status = ac->ac_group_search(alloc_inode, group_bh,
+					     bits_wanted, min_bits, bit_off,
+					     &tmp_bits)) == -ENOSPC) {
+		if (!bg->bg_next_group)
+			break;
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = le64_to_cpu(bg->bg_next_group);
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+					  next_group, &group_bh,
+					  OCFS2_BH_CACHED, alloc_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		bg = (struct ocfs2_group_desc *) group_bh->b_data;
+		if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+			OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+			status = -EIO;
+			goto bail;
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
+	     tmp_bits, bg->bg_blkno);
+
+	*num_bits = tmp_bits;
+
+	BUG_ON(*num_bits == 0);
+
+	/*
+	 * Keep track of the previous block descriptor read. When we
+	 * find a target, if we have read more than X descriptors and
+	 * the target is reasonably empty, relink it to the top of its
+	 * chain.
+	 *
+	 * We've read no extra blocks and only send one more to the
+	 * transaction, yet the next node to search has a much easier
+	 * time.
+	 *
+	 * Do this *after* figuring out how many bits we're taking out
+	 * of our target group.
+	 */
+	if (ac->ac_allow_chain_relink &&
+	    (prev_group_bh) &&
+	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+		status = ocfs2_relink_block_group(handle, alloc_inode,
+						  ac->ac_bh, group_bh,
+						  prev_group_bh, chain);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Ok, claim our bits now: set the info on dinode, chainlist
+	 * and then the group */
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      ac->ac_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+
+	status = ocfs2_journal_dirty(handle,
+				     ac->ac_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_set_bits(handle,
+					    alloc_inode,
+					    bg,
+					    group_bh,
+					    *bit_off,
+					    *num_bits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
+	     *num_bits, fe->i_blkno);
+
+	*bg_blkno = le64_to_cpu(bg->bg_blkno);
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* will give out up to bits_wanted contiguous bits. */
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno)
+{
+	int status;
+	u16 victim, i;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
+	BUG_ON(!ac->ac_bh);
+
+	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
+	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
+		ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u "
+			    "used bits but only %u total.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->id1.bitmap1.i_used),
+			    le32_to_cpu(fe->id1.bitmap1.i_total));
+		status = -EIO;
+		goto bail;
+	}
+
+	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+
+	victim = ocfs2_find_victim_chain(cl);
+	ac->ac_chain = victim;
+	ac->ac_allow_chain_relink = 1;
+
+	status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
+				    num_bits, bg_blkno);
+	if (!status)
+		goto bail;
+	if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Search of victim chain %u came up with nothing, "
+	     "trying all chains now.\n", victim);
+
+	/* If we didn't pick a good victim, then just default to
+	 * searching each chain in order. Don't allow chain relinking
+	 * because we only calculate enough journal credits for one
+	 * relink per alloc. */
+	ac->ac_allow_chain_relink = 0;
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
+		if (i == victim)
+			continue;
+		if (!cl->cl_recs[i].c_free)
+			continue;
+
+		ac->ac_chain = i;
+		status = ocfs2_search_chain(ac, bits_wanted, min_bits,
+					    bit_off, num_bits,
+					    bg_blkno);
+		if (!status)
+			break;
+		if (status < 0 && status != -ENOSPC) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 unsigned int *num_bits,
+			 u64 *blkno_start)
+{
+	int status;
+	u64 bg_blkno;
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
+	BUG_ON(ac->ac_handle != handle);
+
+	status = ocfs2_claim_suballoc_bits(osb,
+					   ac,
+					   bits_wanted,
+					   1,
+					   suballoc_bit_start,
+					   num_bits,
+					   &bg_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
+	ac->ac_bits_given += (*num_bits);
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno)
+{
+	int status;
+	unsigned int num_bits;
+	u64 bg_blkno;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+	BUG_ON(ac->ac_handle != handle);
+
+	status = ocfs2_claim_suballoc_bits(osb,
+					   ac,
+					   1,
+					   1,
+					   suballoc_bit,
+					   &num_bits,
+					   &bg_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	BUG_ON(num_bits != 1);
+
+	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+	ac->ac_bits_given++;
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* translate a group descriptor's blkno and its bitmap offset into a
+ * disk cluster offset. */
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 cluster = 0;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg_blkno != osb->first_cluster_group_blkno)
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
+	cluster += (u32) bg_bit_off;
+	return cluster;
+}
+
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 group_no;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	group_no = cluster / osb->bitmap_cpg;
+	if (!group_no)
+		return osb->first_cluster_group_blkno;
+	return ocfs2_clusters_to_blocks(inode->i_sb,
+					group_no * osb->bitmap_cpg);
+}
+
+/* given the block number of a cluster start, calculate which cluster
+ * group and descriptor bitmap offset that corresponds to. */
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	*bg_blkno = ocfs2_which_cluster_group(inode,
+					      data_cluster);
+
+	if (*bg_blkno == osb->first_cluster_group_blkno)
+		*bg_bit_off = (u16) data_cluster;
+	else
+		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
+							     data_blkno - *bg_blkno);
+}
+
+/*
+ * min_bits - the minimum contiguous chunk from this total allocation
+ * that we can handle. Set to what we asked for originally for a fully
+ * contiguous allocation; set to '1' to indicate we can deal with
+ * extents of any size.
+ */
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	int status;
+	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	u64 bg_blkno;
+	u16 bg_bit_off;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
+	       && ac->ac_which != OCFS2_AC_USE_MAIN);
+	BUG_ON(ac->ac_handle != handle);
+
+	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		status = ocfs2_claim_local_alloc_bits(osb,
+						      handle,
+						      ac,
+						      bits_wanted,
+						      cluster_start,
+						      num_clusters);
+		if (!status)
+			atomic_inc(&osb->alloc_stats.local_data);
+	} else {
+		if (min_clusters > (osb->bitmap_cpg - 1)) {
+			/* The only paths asking for contiguousness
+			 * should know about this already. */
+			mlog(ML_ERROR, "minimum allocation requested exceeds "
+				       "group bitmap size!");
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* clamp the current request down to a realistic size. */
+		if (bits_wanted > (osb->bitmap_cpg - 1))
+			bits_wanted = osb->bitmap_cpg - 1;
+
+		status = ocfs2_claim_suballoc_bits(osb,
+						   ac,
+						   bits_wanted,
+						   min_clusters,
+						   &bg_bit_off,
+						   num_clusters,
+						   &bg_blkno);
+		if (!status) {
+			*cluster_start =
+				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
+								 bg_blkno,
+								 bg_bit_off);
+			atomic_inc(&osb->alloc_stats.bitmap_data);
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ac->ac_bits_given += *num_clusters;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits)
+{
+	int status;
+	unsigned int tmp;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	struct ocfs2_group_desc *undo_bg = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
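+	/* for the cluster bitmap, freed bits are re-set in the
+	 * last-committed copy below, so they stay unallocatable until
+	 * this transaction commits; see the comment above
+	 * ocfs2_test_bg_bit_allocatable() */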
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+
+	tmp = num_bits;
+	while(tmp--) {
+		ocfs2_clear_bit((bit_off + tmp),
+				(unsigned long *) bg->bg_bitmap);
+		if (ocfs2_is_cluster_bitmap(alloc_inode))
+			ocfs2_set_bit(bit_off + tmp,
+				      (unsigned long *) undo_bg->bg_bitmap);
+	}
+	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+
+	status = ocfs2_journal_dirty(handle, group_bh);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/*
+ * expects the suballoc inode to already be locked.
+ */
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count)
+{
+	int status = 0;
+	u32 tmp_used;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
+
+	mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
+	     ", starting at %u\n",
+	     OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
+	     start_bit);
+
+	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
+				  alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+
+	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
+					      group, group_bh,
+					      start_bit, count);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
+		     count);
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
+
+	status = ocfs2_journal_dirty(handle, alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (group_bh)
+		brelse(group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+	u64 group = block - (u64) bit;
+
+	return group;
+}
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di)
+{
+	u64 blk = le64_to_cpu(di->i_blkno);
+	u16 bit = le16_to_cpu(di->i_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
+					inode_alloc_bh, bit, bg_blkno, 1);
+}
+
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb)
+{
+	u64 blk = le64_to_cpu(eb->h_blkno);
+	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
+					bit, bg_blkno, 1);
+}
+
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+		       struct inode *bitmap_inode,
+		       struct buffer_head *bitmap_bh,
+		       u64 start_blk,
+		       unsigned int num_clusters)
+{
+	int status;
+	u16 bg_start_bit;
+	u64 bg_blkno;
+	struct ocfs2_dinode *fe;
+
+	/* You can't ever have a contiguous set of clusters
+	 * bigger than a block group bitmap so we never have to worry
+	 * about looping on them. */
+
+	mlog_entry_void();
+
+	/* This check is expensive. We can safely remove it once this
+	 * code has been tested really well. */
+	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+			ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
+						 start_blk)));
+
+	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
+
+	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
+				     &bg_start_bit);
+
+	mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
+	     num_clusters, start_blk);
+	mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
+	     bg_blkno, bg_start_bit);
+
+	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					  bg_start_bit, bg_blkno,
+					  num_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");
+	printk("bg_signature:       %s\n", bg->bg_signature);
+	printk("bg_size:            %u\n", bg->bg_size);
+	printk("bg_bits:            %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+	printk("bg_chain:           %u\n", bg->bg_chain);
+	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
+	printk("bg_next_group:      %"MLFu64"\n", bg->bg_next_group);
+	printk("bg_parent_dinode:   %"MLFu64"\n", bg->bg_parent_dinode);
+	printk("bg_blkno:           %"MLFu64"\n", bg->bg_blkno);
+}
+
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
+{
+	int i;
+
+	printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
+	printk("i_signature:                  %s\n", fe->i_signature);
+	printk("i_size:                       %"MLFu64"\n", fe->i_size);
+	printk("i_clusters:                   %u\n", fe->i_clusters);
+	printk("i_generation:                 %u\n",
+	       le32_to_cpu(fe->i_generation));
+	printk("id1.bitmap1.i_used:           %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_used));
+	printk("id1.bitmap1.i_total:          %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_total));
+	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
+	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
+	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n",
+	       fe->id2.i_chain.cl_next_free_rec);
+	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_blkno);
+	}
+}

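The helpers near the bottom of suballoc.c reduce to simple arithmetic: a suballocated block's group descriptor sits exactly suballoc-bit blocks before it, and cluster groups repeat every bitmap_cpg clusters after a special first group. A standalone sketch with made-up geometry (the blocks-per-cluster ratio, bitmap_cpg, and the first group's block number are illustrative assumptions, not real superblock values):

#include <stdio.h>
#include <stdint.h>

/* Assumed geometry; the real values come from the superblock and
 * osb->bitmap_cpg. */
#define BLOCKS_PER_CLUSTER	4
#define BITMAP_CPG		32256
#define FIRST_GROUP_BLKNO	8

/* mirrors ocfs2_which_suballoc_group(): a suballocated block claimed
 * with suballoc bit 'bit' lives in the group whose descriptor block
 * is exactly 'bit' blocks before it. */
static uint64_t which_suballoc_group(uint64_t block, unsigned int bit)
{
	return block - (uint64_t)bit;
}

/* mirrors ocfs2_which_cluster_group(): cluster groups are bitmap_cpg
 * clusters apart, except group 0, which starts at a fixed block. */
static uint64_t which_cluster_group(uint32_t cluster)
{
	uint32_t group_no = cluster / BITMAP_CPG;

	if (!group_no)
		return FIRST_GROUP_BLKNO;
	return (uint64_t)group_no * BITMAP_CPG * BLOCKS_PER_CLUSTER;
}

int main(void)
{
	/* an inode at block 5000 claimed with suballoc bit 120 */
	printf("group descriptor at block %llu\n",
	       (unsigned long long)which_suballoc_group(5000, 120));
	/* cluster 40000 falls in the second cluster group */
	printf("cluster group at block %llu\n",
	       (unsigned long long)which_cluster_group(40000));
	return 0;
}

This is also why ocfs2_free_dinode() and ocfs2_free_extent_block() above need only the block number and suballoc bit to find the right group before clearing bits.
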
+ 132 - 0
fs/ocfs2/suballoc.h

@@ -0,0 +1,132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+typedef int (group_search_t)(struct inode *,		/* allocator inode */
+			     struct buffer_head *,	/* group descriptor bh */
+			     u32,			/* bits_wanted */
+			     u32,			/* min_bits */
+			     u16 *,			/* bit_off */
+			     u16 *);			/* bits_found */
+
+struct ocfs2_alloc_context {
+	struct inode *ac_inode;    /* which bitmap are we allocating from? */
+	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_bits_wanted;
+	u32    ac_bits_given;
+#define OCFS2_AC_USE_LOCAL 1
+#define OCFS2_AC_USE_MAIN  2
+#define OCFS2_AC_USE_INODE 3
+#define OCFS2_AC_USE_META  4
+	u32    ac_which;
+	struct ocfs2_journal_handle *ac_handle;
+
+	/* these are used by the chain search */
+	u16    ac_chain;
+	int    ac_allow_chain_relink;
+	group_search_t *ac_group_search;
+};
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
+static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
+{
+	return ac->ac_bits_wanted - ac->ac_bits_given;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac);
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 u32 *num_bits,
+			 u64 *blkno_start);
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno);
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters);
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di);
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb);
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters);
+
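Taken together, the prototypes above imply a two-phase discipline: a reserve
call picks a suitable allocator inode and guarantees that enough bits exist,
a claim call actually consumes bits inside the journaled transaction, and the
context is released afterwards. A minimal sketch of that flow, assuming an
osb and journal handle are already in hand; wanted and min_wanted are
illustrative names, and all error handling beyond the bail label is elided:

	struct ocfs2_alloc_context *ac = NULL;
	u32 cluster_start, num_clusters;
	int status;

	/* Phase 1: reserve -- pick an allocator, ensure capacity. */
	status = ocfs2_reserve_clusters(osb, handle, wanted, &ac);
	if (status < 0)
		goto bail;

	/* Phase 2: claim -- consume bits under the transaction. */
	status = ocfs2_claim_clusters(osb, handle, ac, min_wanted,
				      &cluster_start, &num_clusters);
bail:
	if (ac)
		ocfs2_free_alloc_context(ac);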
+static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
+					  u64 bg_blkno)
+{
+	/* This works for every block group descriptor: only the first
+	 * group descriptor of the cluster bitmap is placed
+	 * specially. */
+
+	if (bg_blkno == osb->first_cluster_group_blkno)
+		return 0;
+
+	/* the rest of the block groups are located at the beginning
+	 * of their 1st cluster, so a direct translation just
+	 * works. */
+	return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
+}
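A quick worked example of the direct translation above, with illustrative
geometry not taken from this patch: at 4K blocks and 32K clusters there are
8 blocks per cluster, so a non-first group descriptor sitting at block 2048
lives at cluster 2048 / 8 = 256, which is exactly what
ocfs2_blocks_to_clusters() returns; only the descriptor at
first_cluster_group_blkno needs the special case to cluster 0.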
+
+static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
+}
+
+/* This is for the local alloc ONLY. Other callers should use the
+ * task-specific APIs above. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac);
+
+#endif /* _CHAINALLOC_H_ */

Some files were not shown because too many files changed in this diff