# Add DLM to the build system

diff -urN -p linux-2.6.8.1/cluster/Kconfig linux/cluster/Kconfig
--- linux-2.6.8.1/cluster/Kconfig	2004-08-24 13:23:09.000000000 +0800
+++ linux/cluster/Kconfig	2004-08-24 13:23:32.000000000 +0800
@@ -10,4 +10,22 @@ config CLUSTER
 	needed by all the other components.  It provides membership services
 	for those other subsystems.
 
+config CLUSTER_DLM
+	tristate "Distributed Lock Manager"
+	depends on CLUSTER
+	---help---
+	A fully distributed lock manager, providing cluster-wide locking services
+	and protected lock namespaces for kernel and userland applications.
+
+config CLUSTER_DLM_PROCLOCKS
+	boolean "/proc/locks support for DLM"
+	depends on CLUSTER_DLM
+	depends on PROC_FS
+	---help---
+	If this option is enabled a file will appear in /proc/cluster/dlm_locks.
+	Write the name of a lockspace known to the DLM into this file, then
+	read back a list of all the resources and locks in that lockspace that
+	are known to the local node.  Note that because the DLM is distributed,
+	this may not be the full lock picture.
+
 endmenu
diff -urN -p linux-2.6.8.1/cluster/Makefile linux/cluster/Makefile
--- linux-2.6.8.1/cluster/Makefile	2004-08-24 13:23:09.000000000 +0800
+++ linux/cluster/Makefile	2004-08-24 13:23:32.000000000 +0800
@@ -1,3 +1,4 @@
 obj-y := nocluster.o
 
 obj-$(CONFIG_CLUSTER) += cman/
+obj-$(CONFIG_CLUSTER_DLM) += dlm/
diff -urN -p linux-2.6.8.1/cluster/dlm/Makefile linux/cluster/dlm/Makefile
--- linux-2.6.8.1/cluster/dlm/Makefile	1970-01-01 07:30:00.000000000 +0730
+++ linux/cluster/dlm/Makefile	2004-08-24 13:23:32.000000000 +0800
@@ -0,0 +1,23 @@
+dlm-objs :=	ast.o \
+		config.o \
+		device.o \
+		dir.o \
+		lkb.o \
+		locking.o \
+		lockqueue.o \
+		lockspace.o \
+		lowcomms.o \
+		main.o \
+		memory.o \
+		midcomms.o \
+		nodes.o \
+		proc.o \
+		queries.o \
+		rebuild.o \
+		reccomms.o \
+		recover.o \
+		recoverd.o \
+		rsb.o \
+		util.o \
+
+obj-$(CONFIG_CLUSTER_DLM) += dlm.o
diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
--- linux-orig/cluster/dlm/ast.c	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/dlm/ast.c	2004-11-03 11:31:56.000000000 +0800
@@ -0,0 +1,618 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * This delivers ASTs and checks for dead remote requests and deadlocks.
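+ *
+ * A single kernel thread, dlm_astd, does the work: queue_ast() places an
+ * lkb on ast_queue and wake_astd() sets WAKE_ASTS to have it delivered,
+ * while a periodic timer sets WAKE_TIMER so the same thread also scans
+ * the lockqueue for requests that have waited too long and the
+ * deadlockqueue for conversion deadlocks.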
+ */ + +#include + +#include "dlm_internal.h" +#include "rsb.h" +#include "lockqueue.h" +#include "dir.h" +#include "locking.h" +#include "lkb.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "ast.h" +#include "nodes.h" +#include "config.h" +#include "util.h" + +/* Wake up flags for astd */ +#define WAKE_ASTS 1 +#define WAKE_TIMER 2 + +static struct list_head ast_queue; +static struct semaphore ast_queue_lock; +static wait_queue_head_t astd_waitchan; +struct task_struct * astd_task; +static unsigned long astd_wakeflags; + +static struct list_head _deadlockqueue; +static struct semaphore _deadlockqueue_lock; +static struct list_head _lockqueue; +static struct semaphore _lockqueue_lock; +static struct timer_list _lockqueue_timer; + +void add_to_lockqueue(struct dlm_lkb *lkb) +{ + /* Time stamp the entry so we know if it's been waiting too long */ + lkb->lkb_lockqueue_time = jiffies; + + down(&_lockqueue_lock); + list_add(&lkb->lkb_lockqueue, &_lockqueue); + up(&_lockqueue_lock); +} + +void remove_from_lockqueue(struct dlm_lkb *lkb) +{ + down(&_lockqueue_lock); + list_del(&lkb->lkb_lockqueue); + up(&_lockqueue_lock); + +#ifdef CONFIG_DLM_STATS + dlm_stats.lockqueue_time[lkb->lkb_lockqueue_state] += (jiffies - lkb->lkb_lockqueue_time); + dlm_stats.lockqueue_locks[lkb->lkb_lockqueue_state]++; +#endif + lkb->lkb_lockqueue_state = 0; +} + +void add_to_deadlockqueue(struct dlm_lkb *lkb) +{ + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags)) + return; + lkb->lkb_duetime = jiffies; + down(&_deadlockqueue_lock); + list_add(&lkb->lkb_deadlockq, &_deadlockqueue); + up(&_deadlockqueue_lock); +} + +void remove_from_deadlockqueue(struct dlm_lkb *lkb) +{ + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags)) + return; + + down(&_deadlockqueue_lock); + list_del(&lkb->lkb_deadlockq); + up(&_deadlockqueue_lock); + + /* Invalidate the due time */ + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime)); +} + +/* + * Queue an AST for delivery, this will only deal with + * kernel ASTs, usermode API will piggyback on top of this. + * + * This can be called in either the user or DLM context. + * ASTs are queued EVEN IF we are already running in dlm_astd + * context as we don't know what other locks are held (eg we could + * be being called from a lock operation that was called from + * another AST! + * If the AST is to be queued remotely then a message is sent to + * the target system via midcomms. + */ + +void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode) +{ + struct dlm_request req; + + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + /* + * Send a message to have an ast queued remotely. Note: we do + * not send remote completion asts, they are handled as part of + * remote lock granting. + */ + if (flags & AST_BAST) { + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST; + req.rr_header.rh_length = sizeof(req); + req.rr_header.rh_flags = 0; + req.rr_header.rh_lkid = lkb->lkb_id; + req.rr_header.rh_lockspace = + lkb->lkb_resource->res_ls->ls_global_id; + req.rr_status = lkb->lkb_retstatus; + req.rr_remlkid = lkb->lkb_remid; + req.rr_rqmode = rqmode; + + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header, + lkb->lkb_resource->res_ls->ls_allocation); + } else if (lkb->lkb_retstatus == -EDEADLOCK) { + /* + * We only queue remote Completion ASTs here for error + * completions that happen out of band. + * DEADLOCK is one such. 
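+ * Normal (successful) completions are never sent from here;
+ * they reach the owning node as part of the master's reply
+ * to the original request.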
+ */ + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST; + req.rr_header.rh_length = sizeof(req); + req.rr_header.rh_flags = 0; + req.rr_header.rh_lkid = lkb->lkb_id; + req.rr_header.rh_lockspace = + lkb->lkb_resource->res_ls->ls_global_id; + req.rr_status = lkb->lkb_retstatus; + req.rr_remlkid = lkb->lkb_remid; + req.rr_rqmode = rqmode; + + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header, + lkb->lkb_resource->res_ls->ls_allocation); + } + } else { + /* + * Prepare info that will be returned in ast/bast. + */ + + if (flags & AST_BAST) { + lkb->lkb_bastmode = rqmode; + } else { + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus; + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED; + else + lkb->lkb_lksb->sb_flags = 0; + } + + down(&ast_queue_lock); + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST))) + list_add_tail(&lkb->lkb_astqueue, &ast_queue); + lkb->lkb_astflags |= flags; + up(&ast_queue_lock); + + /* It is the responsibility of the caller to call wake_astd() + * after it has finished other locking operations that request + * the ASTs to be delivered after */ + } +} + +/* + * Process any LKBs on the AST queue. + */ + +static void process_asts(void) +{ + struct dlm_ls *ls; + struct dlm_rsb *rsb; + struct dlm_lkb *lkb; + void (*cast) (long param); + void (*bast) (long param, int mode); + long astparam; + uint16_t flags; + + for (;;) { + down(&ast_queue_lock); + if (list_empty(&ast_queue)) { + up(&ast_queue_lock); + break; + } + + lkb = list_entry(ast_queue.next, struct dlm_lkb, lkb_astqueue); + list_del(&lkb->lkb_astqueue); + flags = lkb->lkb_astflags; + lkb->lkb_astflags = 0; + up(&ast_queue_lock); + + cast = lkb->lkb_astaddr; + bast = lkb->lkb_bastaddr; + astparam = lkb->lkb_astparam; + rsb = lkb->lkb_resource; + ls = rsb->res_ls; + + if (flags & AST_COMP) { + if (flags & AST_DEL) { + DLM_ASSERT(lkb->lkb_astflags == 0,); + + /* FIXME: we don't want to block asts for other + lockspaces while one is being recovered */ + + down_read(&ls->ls_in_recovery); + release_lkb(ls, lkb); + release_rsb(rsb); + up_read(&ls->ls_in_recovery); + } + + if (cast) { +#ifdef CONFIG_DLM_STATS + dlm_stats.cast++; +#endif + cast(astparam); + } + } + + if (flags & AST_BAST && !(flags & AST_DEL)) { + int bmode = lkb->lkb_bastmode; + + /* gr or rq mode of the lock may have changed since the + ast was queued making the delivery unnecessary */ + + if (!bast || dlm_modes_compat(lkb->lkb_grmode, bmode)) + continue; + + if (lkb->lkb_rqmode == DLM_LOCK_IV || + !dlm_modes_compat(lkb->lkb_rqmode, bmode)) { + bast(astparam, bmode); +#ifdef CONFIG_DLM_STATS + dlm_stats.bast++; +#endif + } + } + + schedule(); + } +} + +void lockqueue_lkb_mark(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + int count = 0; + + log_all(ls, "mark waiting requests"); + + down(&_lockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { + + if (lkb->lkb_resource->res_ls != ls) + continue; + + log_debug(ls, "mark %x lq %d nodeid %d", lkb->lkb_id, + lkb->lkb_lockqueue_state, lkb->lkb_nodeid); + + /* + * These lkb's are new and the master is being looked up. Mark + * the lkb request to be resent. Even if the destination node + * for the request is still living and has our request, it will + * purge all resdir requests in purge_requestqueue. If there's + * a reply to the LOOKUP request in our requestqueue (the reply + * arrived after ls_stop), it is invalid and will be discarded + * in purge_requestqueue, too. 
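+ * The lookup itself is then resent to the (possibly new)
+ * resource directory node in resend_cluster_requests().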
+ */ + + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) { + DLM_ASSERT(lkb->lkb_nodeid == -1, + print_lkb(lkb); + print_rsb(lkb->lkb_resource);); + + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND; + count++; + continue; + } + + /* + * We're waiting for an unlock reply and the master node from + * whom we're expecting the reply has failed. If there's a + * reply in the requestqueue do nothing and process it later in + * process_requestqueue. If there's no reply, don't rebuild + * the lkb on a new master, but just assume we've gotten an + * unlock completion reply from the prev master (this also + * means not resending the unlock request). If the unlock is + * for the last lkb on the rsb, the rsb has nodeid of -1 and + * the rsb won't be rebuilt on the new master either. + * + * If we're waiting for an unlock reply and the master node is + * still alive, we should either have a reply in the + * requestqueue from the master already, or we should get one + * from the master once recovery is complete. There is no + * rebuilding of the rsb/lkb in this case and no resending of + * the request. + */ + + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) { + if (in_nodes_gone(ls, lkb->lkb_nodeid)) { + if (reply_in_requestqueue(ls, lkb->lkb_id)) { + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; + log_debug(ls, "mark %x unlock have rep", + lkb->lkb_id); + } else { + /* assume we got reply fr old master */ + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; + lkb->lkb_flags |= GDLM_LKFLG_UNLOCKDONE; + log_debug(ls, "mark %x unlock no rep", + lkb->lkb_id); + } + } + count++; + continue; + } + + /* + * These lkb's have an outstanding request to a bygone node. + * The request will be redirected to the new master node in + * resend_cluster_requests(). Don't mark the request for + * resending if there's a reply for it saved in the + * requestqueue. + */ + + if (in_nodes_gone(ls, lkb->lkb_nodeid) && + !reply_in_requestqueue(ls, lkb->lkb_id)) { + + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND; + + /* + * Don't rebuild this lkb on a new rsb in + * rebuild_rsbs_send(). + */ + + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) { + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING, + print_lkb(lkb); + print_rsb(lkb->lkb_resource);); + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; + } + + /* + * This flag indicates to the new master that his lkb + * is in the midst of a convert request and should be + * placed on the granted queue rather than the convert + * queue. We will resend this convert request to the + * new master. 
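+ * Such an lkb is still on its local convert queue, hence
+ * the GDLM_LKSTS_CONVERT assertion below.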
+ */ + + else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) { + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT, + print_lkb(lkb); + print_rsb(lkb->lkb_resource);); + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT; + } + + count++; + } + } + up(&_lockqueue_lock); + + log_all(ls, "marked %d requests", count); +} + +int resend_cluster_requests(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_rsb *r; + int error = 0, state, count = 0; + + log_all(ls, "resend marked requests"); + + down(&_lockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + log_debug(ls, "resend_cluster_requests: aborted"); + error = -EINTR; + break; + } + + r = lkb->lkb_resource; + + if (r->res_ls != ls) + continue; + + log_debug(ls, "resend %x lq %d flg %x node %d/%d \"%s\"", + lkb->lkb_id, lkb->lkb_lockqueue_state, lkb->lkb_flags, + lkb->lkb_nodeid, r->res_nodeid, r->res_name); + + if (lkb->lkb_flags & GDLM_LKFLG_UNLOCKDONE) { + log_debug(ls, "unlock done %x", lkb->lkb_id); + list_del(&lkb->lkb_lockqueue); + res_lkb_dequeue(lkb); + lkb->lkb_retstatus = -DLM_EUNLOCK; + queue_ast(lkb, AST_COMP | AST_DEL, 0); + count++; + continue; + } + + /* + * Resend/process the lockqueue lkb's (in-progres requests) + * that were flagged at the start of recovery in + * lockqueue_lkb_mark(). + */ + + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) { + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND; + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD; + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT; + + if (lkb->lkb_nodeid == -1) { + /* + * Send lookup to new resdir node. + */ + lkb->lkb_lockqueue_time = jiffies; + send_cluster_request(lkb, + lkb->lkb_lockqueue_state); + } + + else if (lkb->lkb_nodeid != 0) { + /* + * There's a new RSB master (that's not us.) + */ + lkb->lkb_lockqueue_time = jiffies; + send_cluster_request(lkb, + lkb->lkb_lockqueue_state); + } + + else { + /* + * We are the new RSB master for this lkb + * request. + */ + state = lkb->lkb_lockqueue_state; + lkb->lkb_lockqueue_state = 0; + /* list_del equals remove_from_lockqueue() */ + list_del(&lkb->lkb_lockqueue); + process_remastered_lkb(ls, lkb, state); + } + + count++; + } + } + up(&_lockqueue_lock); + + log_all(ls, "resent %d requests", count); + return error; +} + +/* + * Process any LKBs on the Lock queue, this + * just looks at the entries to see if they have been + * on the queue too long and fails the requests if so. 
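+ *
+ * "Too long" means longer than dlm_config.lock_timeout; such requests
+ * are cancelled with -ETIMEDOUT. Lockspaces flagged LSFL_NOTIMERS, or
+ * not currently in LSFL_LS_RUN, are skipped.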
+ */ + +static void process_lockqueue(void) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_ls *ls; + int count = 0; + + down(&_lockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { + ls = lkb->lkb_resource->res_ls; + + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags)) + continue; + + /* Don't time out locks that are in transition */ + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) + continue; + + if (check_timeout(lkb->lkb_lockqueue_time, + dlm_config.lock_timeout)) { + count++; + list_del(&lkb->lkb_lockqueue); + up(&_lockqueue_lock); + cancel_lockop(lkb, -ETIMEDOUT); + down(&_lockqueue_lock); + } + } + up(&_lockqueue_lock); + + if (count) + wake_astd(); + + mod_timer(&_lockqueue_timer, + jiffies + ((dlm_config.lock_timeout >> 1) * HZ)); +} + +/* Look for deadlocks */ +static void process_deadlockqueue(void) +{ + struct dlm_lkb *lkb, *safe; + + down(&_deadlockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) { + struct dlm_lkb *kill_lkb; + + /* Only look at "due" locks */ + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime)) + break; + + /* Don't look at locks that are in transition */ + if (!test_bit(LSFL_LS_RUN, + &lkb->lkb_resource->res_ls->ls_flags)) + continue; + + up(&_deadlockqueue_lock); + + /* Lock has hit due time, check for conversion deadlock */ + kill_lkb = conversion_deadlock_check(lkb); + if (kill_lkb) + cancel_conversion(kill_lkb, -EDEADLOCK); + + down(&_deadlockqueue_lock); + } + up(&_deadlockqueue_lock); +} + +static __inline__ int no_asts(void) +{ + int ret; + + down(&ast_queue_lock); + ret = list_empty(&ast_queue); + up(&ast_queue_lock); + return ret; +} + +static void lockqueue_timer_fn(unsigned long arg) +{ + set_bit(WAKE_TIMER, &astd_wakeflags); + wake_up(&astd_waitchan); +} + +/* + * DLM daemon which delivers asts. + */ + +static int dlm_astd(void *data) +{ + /* + * Set a timer to check the lockqueue for dead locks (and deadlocks). 
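+ * The timer refires every lock_timeout/2 seconds and only sets
+ * WAKE_TIMER; the actual scanning is done from this thread.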
+ */ + INIT_LIST_HEAD(&_lockqueue); + init_MUTEX(&_lockqueue_lock); + INIT_LIST_HEAD(&_deadlockqueue); + init_MUTEX(&_deadlockqueue_lock); + init_timer(&_lockqueue_timer); + _lockqueue_timer.function = lockqueue_timer_fn; + _lockqueue_timer.data = 0; + mod_timer(&_lockqueue_timer, + jiffies + ((dlm_config.lock_timeout >> 1) * HZ)); + + while (!kthread_should_stop()) { + wchan_cond_sleep_intr(astd_waitchan, !test_bit(WAKE_ASTS, &astd_wakeflags)); + + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) + process_asts(); + + if (test_and_clear_bit(WAKE_TIMER, &astd_wakeflags)) { + process_lockqueue(); + if (dlm_config.deadlocktime) + process_deadlockqueue(); + } + } + + if (timer_pending(&_lockqueue_timer)) + del_timer(&_lockqueue_timer); + + return 0; +} + +void wake_astd(void) +{ + if (!no_asts()) { + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up(&astd_waitchan); + } +} + +int astd_start(void) +{ + struct task_struct *p; + int error = 0; + + INIT_LIST_HEAD(&ast_queue); + init_MUTEX(&ast_queue_lock); + init_waitqueue_head(&astd_waitchan); + + p = kthread_run(dlm_astd, NULL, 0, "dlm_astd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + astd_task = p; + return error; +} + +void astd_stop(void) +{ + kthread_stop(astd_task); + wake_up(&astd_waitchan); +} diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/ast.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,28 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __AST_DOT_H__ +#define __AST_DOT_H__ + +void lockqueue_lkb_mark(struct dlm_ls *ls); +int resend_cluster_requests(struct dlm_ls *ls); +void add_to_lockqueue(struct dlm_lkb *lkb); +void remove_from_lockqueue(struct dlm_lkb *lkb); +void add_to_deadlockqueue(struct dlm_lkb *lkb); +void remove_from_deadlockqueue(struct dlm_lkb *lkb); +void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode); +void wake_astd(void); +int astd_start(void); +void astd_stop(void); + +#endif /* __AST_DOT_H__ */ diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/config.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,137 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_LOCK_TIMEOUT 30 +#define DEFAULT_BUFFER_SIZE 4096 +#define DEFAULT_RSBTBL_SIZE 256 +#define DEFAULT_LKBTBL_SIZE 1024 +#define DEFAULT_DIRTBL_SIZE 512 +#define DEFAULT_CONN_INCREMENT 32 +#define DEFAULT_DEADLOCKTIME 10 +#define DEFAULT_RECOVER_TIMER 5 + +struct config_info dlm_config = { + .tcp_port = DEFAULT_TCP_PORT, + .lock_timeout = DEFAULT_LOCK_TIMEOUT, + .buffer_size = DEFAULT_BUFFER_SIZE, + .rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .dirtbl_size = DEFAULT_DIRTBL_SIZE, + .conn_increment = DEFAULT_CONN_INCREMENT, + .deadlocktime = DEFAULT_DEADLOCKTIME, + .recover_timer = DEFAULT_RECOVER_TIMER +}; + + +static struct config_proc_info { + char *name; + int *value; +} config_proc[] = { + { + .name = "tcp_port", + .value = &dlm_config.tcp_port, + }, + { + .name = "lock_timeout", + .value = &dlm_config.lock_timeout, + }, + { + .name = "buffer_size", + .value = &dlm_config.buffer_size, + }, + { + .name = "rsbtbl_size", + .value = &dlm_config.rsbtbl_size, + }, + { + .name = "lkbtbl_size", + .value = &dlm_config.lkbtbl_size, + }, + { + .name = "dirtbl_size", + .value = &dlm_config.dirtbl_size, + }, + { + .name = "conn_increment", + .value = &dlm_config.conn_increment, + }, + { + .name = "deadlocktime", + .value = &dlm_config.deadlocktime, + }, + { + .name = "recover_timer", + .value = &dlm_config.recover_timer, + } +}; +static struct proc_dir_entry *dlm_dir; + +static int dlm_config_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct config_proc_info *cinfo = data; + return snprintf(page, count, "%d\n", *cinfo->value); +} + +static int dlm_config_write_proc(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct config_proc_info *cinfo = data; + int value; + char *end; + + value = simple_strtoul(buffer, &end, 10); + if (*end) + *cinfo->value = value; + return count; +} + +int dlm_config_init(void) +{ + int i; + struct proc_dir_entry *pde; + + dlm_dir = proc_mkdir("cluster/config/dlm", 0); + if (!dlm_dir) + return -1; + + dlm_dir->owner = THIS_MODULE; + + for (i=0; idata = &config_proc[i]; + pde->write_proc = dlm_config_write_proc; + pde->read_proc = dlm_config_read_proc; + } + } + return 0; +} + +void dlm_config_exit(void) +{ + int i; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "device.h" + +extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int); +static struct file_operations _dlm_fops; +static const char *name_prefix="dlm"; +static struct list_head user_ls_list; +static struct semaphore user_ls_lock; + +/* Flags in li_flags */ +#define LI_FLAG_COMPLETE 1 +#define LI_FLAG_FIRSTLOCK 2 + +#define LOCKINFO_MAGIC 0x53595324 + +struct lock_info { + uint32_t li_magic; + uint8_t li_cmd; + struct dlm_lksb li_lksb; + wait_queue_head_t li_waitq; + unsigned long li_flags; + void __user *li_castparam; + void __user *li_castaddr; + void __user *li_bastparam; + void __user *li_bastaddr; + void __user *li_pend_bastparam; + void __user *li_pend_bastaddr; + void __user *li_user_lvbptr; + struct list_head li_ownerqueue; + struct file_info *li_file; + struct dlm_lksb __user 
*li_user_lksb; + struct semaphore li_firstlock; + struct dlm_queryinfo *li_queryinfo; + struct dlm_queryinfo __user *li_user_queryinfo; +}; + +/* A queued AST no less */ +struct ast_info { + struct dlm_lock_result result; + struct dlm_queryinfo *queryinfo; + struct dlm_queryinfo __user *user_queryinfo; + struct list_head list; + void __user *user_lvbptr; + uint32_t ast_reason; /* AST_COMP or AST_BAST from dlm_internal.h */ +}; + +/* One of these per userland lockspace */ +struct user_ls { + void *ls_lockspace; + atomic_t ls_refcnt; + long ls_flags; /* bit 1 means LS has been deleted */ + + /* Passed into misc_register() */ + struct miscdevice ls_miscinfo; + struct list_head ls_list; +}; + +/* misc_device info for the control device */ +static struct miscdevice ctl_device; + +/* + * Stuff we hang off the file struct. + * The first two are to cope with unlocking all the + * locks help by a process when it dies. + */ +struct file_info { + struct list_head fi_lkb_list; /* List of active lkbs */ + spinlock_t fi_lkb_lock; + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */ + spinlock_t fi_ast_lock; + wait_queue_head_t fi_wait; + struct user_ls *fi_ls; + atomic_t fi_refcnt; /* Number of users */ + unsigned long fi_flags; /* Bit 1 means the device is open */ +}; + + +/* get and put ops for file_info. + Actually I don't really like "get" and "put", but everyone + else seems to use them and I can't think of anything + nicer at the moment */ +static void get_file_info(struct file_info *f) +{ + atomic_inc(&f->fi_refcnt); +} + +static void put_file_info(struct file_info *f) +{ + if (atomic_dec_and_test(&f->fi_refcnt)) + kfree(f); +} + +static void release_lockinfo(struct lock_info *li) +{ + put_file_info(li->li_file); + if (li->li_lksb.sb_lvbptr && li->li_cmd != DLM_USER_QUERY) + kfree(li->li_lksb.sb_lvbptr); + kfree(li); +} + +static struct user_ls *__find_lockspace(int minor) +{ + struct user_ls *lsinfo; + + list_for_each_entry(lsinfo, &user_ls_list, ls_list) { + + if (lsinfo->ls_miscinfo.minor == minor) + return lsinfo; + } + return NULL; +} + +/* Find a lockspace struct given the device minor number */ +static struct user_ls *find_lockspace(int minor) +{ + struct user_ls *lsinfo; + + down(&user_ls_lock); + lsinfo = __find_lockspace(minor); + up(&user_ls_lock); + + return lsinfo; +} + +static void add_lockspace_to_list(struct user_ls *lsinfo) +{ + down(&user_ls_lock); + list_add(&lsinfo->ls_list, &user_ls_list); + up(&user_ls_lock); +} + +/* Register a lockspace with the DLM and create a misc + device for userland to access it */ +static int register_lockspace(char *name, struct user_ls **ls) +{ + struct user_ls *newls; + int status; + int namelen; + + namelen = strlen(name)+strlen(name_prefix)+2; + + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL); + if (!newls) + return -ENOMEM; + memset(newls, 0, sizeof(struct user_ls)); + + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL); + if (!newls->ls_miscinfo.name) { + kfree(newls); + return -ENOMEM; + } + status = dlm_new_lockspace(name, strlen(name), + &newls->ls_lockspace, 0); + + if (status != 0) { + kfree(newls->ls_miscinfo.name); + kfree(newls); + return status; + } + + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name); + + newls->ls_miscinfo.fops = &_dlm_fops; + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR; + + status = misc_register(&newls->ls_miscinfo); + if (status) { + log_print("failed to register misc device for %s", name); + dlm_release_lockspace(newls->ls_lockspace, 0); + 
kfree(newls->ls_miscinfo.name); + kfree(newls); + return status; + } + + + add_lockspace_to_list(newls); + *ls = newls; + return 0; +} + +/* Called with the user_ls_lock semaphore held */ +static int unregister_lockspace(struct user_ls *lsinfo, int force) +{ + int status; + + status = dlm_release_lockspace(lsinfo->ls_lockspace, force); + if (status) + return status; + + status = misc_deregister(&lsinfo->ls_miscinfo); + if (status) + return status; + + list_del(&lsinfo->ls_list); + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */ + lsinfo->ls_lockspace = NULL; + if (atomic_dec_and_test(&lsinfo->ls_refcnt)) { + kfree(lsinfo->ls_miscinfo.name); + kfree(lsinfo); + } + + return 0; +} + +/* Add it to userland's AST queue */ +static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam, uint32_t reason) +{ + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL); + if (!ast) + return; + + ast->result.astparam = astparam; + ast->result.astaddr = astaddr; + ast->result.user_lksb = li->li_user_lksb; + ast->result.cmd = li->li_cmd; + ast->user_lvbptr = li->li_user_lvbptr; + ast->ast_reason = reason; + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb)); + + /* These two will both be NULL for anything other than queries */ + ast->queryinfo = li->li_queryinfo; + ast->user_queryinfo = li->li_user_queryinfo; + + spin_lock(&li->li_file->fi_ast_lock); + list_add_tail(&ast->list, &li->li_file->fi_ast_list); + spin_unlock(&li->li_file->fi_ast_lock); + wake_up_interruptible(&li->li_file->fi_wait); +} + +static void bast_routine(void *param, int mode) +{ + struct lock_info *li = param; + + if (li && li->li_bastaddr) { + add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, AST_BAST); + } +} + +/* + * This is the kernel's AST routine. + * All lock, unlock & query operations complete here. + * The only syncronous ops are those done during device close. + */ +static void ast_routine(void *param) +{ + struct lock_info *li = param; + + /* Param may be NULL if a persistent lock is unlocked by someone else */ + if (!li) + return; + + /* If this is a succesful conversion then activate the blocking ast + * args from the conversion request */ + if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && + li->li_lksb.sb_status == 0) { + + li->li_bastparam = li->li_pend_bastparam; + li->li_bastaddr = li->li_pend_bastaddr; + li->li_pend_bastaddr = NULL; + } + + /* If it's an async request then post data to the user's AST queue. */ + if (li->li_castaddr) { + + /* Only queue AST if the device is still open */ + if (test_bit(1, &li->li_file->fi_flags)) + add_to_astqueue(li, li->li_castaddr, li->li_castparam, AST_COMP); + + /* If it's a new lock operation that failed, then + * remove it from the owner queue and free the + * lock_info. The DLM will not free the LKB until this + * AST has completed. 
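+ * The li_firstlock semaphore below makes us wait until
+ * do_user_lock() has finished with li before it is freed.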
+ */ + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && + li->li_lksb.sb_status != 0) { + struct dlm_lkb *lkb; + + /* Wait till dlm_lock() has finished */ + down(&li->li_firstlock); + up(&li->li_firstlock); + + /* If the LKB has been freed then we need to tidy up too */ + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid); + if (!lkb) { + spin_lock(&li->li_file->fi_lkb_lock); + list_del(&li->li_ownerqueue); + spin_unlock(&li->li_file->fi_lkb_lock); + + release_lockinfo(li); + } + return; + } + /* Free unlocks & queries */ + if (li->li_lksb.sb_status == -DLM_EUNLOCK || + li->li_cmd == DLM_USER_QUERY) { + release_lockinfo(li); + } + } + else { + /* Synchronous request, just wake up the caller */ + set_bit(LI_FLAG_COMPLETE, &li->li_flags); + wake_up_interruptible(&li->li_waitq); + } +} + +/* + * Wait for the lock op to complete and return the status. + */ +static int wait_for_ast(struct lock_info *li) +{ + /* Wait for the AST routine to complete */ + set_task_state(current, TASK_INTERRUPTIBLE); + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags)) + schedule(); + + set_task_state(current, TASK_RUNNING); + + return li->li_lksb.sb_status; +} + + +/* Open on control device */ +static int dlm_ctl_open(struct inode *inode, struct file *file) +{ + return 0; +} + +/* Close on control device */ +static int dlm_ctl_close(struct inode *inode, struct file *file) +{ + return 0; +} + +/* Open on lockspace device */ +static int dlm_open(struct inode *inode, struct file *file) +{ + struct file_info *f; + struct user_ls *lsinfo; + + lsinfo = find_lockspace(iminor(inode)); + if (!lsinfo) + return -ENOENT; + + f = kmalloc(sizeof(struct file_info), GFP_KERNEL); + if (!f) + return -ENOMEM; + + atomic_inc(&lsinfo->ls_refcnt); + INIT_LIST_HEAD(&f->fi_lkb_list); + INIT_LIST_HEAD(&f->fi_ast_list); + spin_lock_init(&f->fi_ast_lock); + spin_lock_init(&f->fi_lkb_lock); + init_waitqueue_head(&f->fi_wait); + f->fi_ls = lsinfo; + atomic_set(&f->fi_refcnt, 1); + set_bit(1, &f->fi_flags); + + file->private_data = f; + + return 0; +} + +/* Check the user's version matches ours */ +static int check_version(struct dlm_lock_params *params) +{ + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR || + (params->version[0] == DLM_DEVICE_VERSION_MAJOR && + params->version[1] > DLM_DEVICE_VERSION_MINOR)) { + + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)", + params->version[0], + params->version[1], + params->version[2], + DLM_DEVICE_VERSION_MAJOR, + DLM_DEVICE_VERSION_MINOR, + DLM_DEVICE_VERSION_PATCH); + return -EINVAL; + } + return 0; +} + +/* Close on lockspace device */ +static int dlm_close(struct inode *inode, struct file *file) +{ + struct file_info *f = file->private_data; + struct lock_info li; + struct lock_info *old_li, *safe; + sigset_t tmpsig; + sigset_t allsigs; + struct user_ls *lsinfo; + DECLARE_WAITQUEUE(wq, current); + + lsinfo = find_lockspace(iminor(inode)); + if (!lsinfo) + return -ENOENT; + + /* Mark this closed so that ASTs will not be delivered any more */ + clear_bit(1, &f->fi_flags); + + /* Block signals while we are doing this */ + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + /* We use our own lock_info struct here, so that any + * outstanding "real" ASTs will be delivered with the + * corresponding "real" params, thus freeing the lock_info + * that belongs the lock. 
This catches the corner case where + * a lock is BUSY when we try to unlock it here + */ + memset(&li, 0, sizeof(li)); + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); + init_waitqueue_head(&li.li_waitq); + add_wait_queue(&li.li_waitq, &wq); + + /* + * Free any outstanding locks, they are on the + * list in LIFO order so there should be no problems + * about unlocking parents before children. + * Although we don't remove the lkbs from the list here + * (what would be the point?), foreach_safe is needed + * because the lkbs are freed during dlm_unlock operations + */ + list_for_each_entry_safe(old_li, safe, &f->fi_lkb_list, li_ownerqueue) { + int status; + int lock_status; + int flags = 0; + struct dlm_lkb *lkb; + + lkb = dlm_get_lkb(f->fi_ls->ls_lockspace, old_li->li_lksb.sb_lkid); + + /* Don't unlock persistent locks */ + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) { + list_del(&old_li->li_ownerqueue); + + /* Update master copy */ + if (lkb->lkb_resource->res_nodeid) { + li.li_lksb.sb_lkid = lkb->lkb_id; + status = dlm_lock(f->fi_ls->ls_lockspace, + lkb->lkb_grmode, &li.li_lksb, + DLM_LKF_CONVERT|DLM_LKF_ORPHAN, + NULL, 0, 0, ast_routine, &li, + NULL, NULL); + if (status == 0) + wait_for_ast(&li); + } + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN; + + /* But tidy our references in it */ + kfree(old_li); + lkb->lkb_astparam = (long)NULL; + put_file_info(f); + + continue; + } + + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); + + /* If it's not granted then cancel the request. + * If the lock was WAITING then it will be dropped, + * if it was converting then it will be reverted to GRANTED, + * then we will unlock it. + */ + lock_status = lkb->lkb_status; + + if (lock_status != GDLM_LKSTS_GRANTED) + flags = DLM_LKF_CANCEL; + + if (lkb->lkb_grmode >= DLM_LOCK_PW) + flags |= DLM_LKF_IVVALBLK; + + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li); + + /* Must wait for it to complete as the next lock could be its + * parent */ + if (status == 0) + wait_for_ast(&li); + + /* If it was waiting for a conversion, it will + now be granted so we can unlock it properly */ + if (lock_status == GDLM_LKSTS_CONVERT) { + flags &= ~DLM_LKF_CANCEL; + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li); + + if (status == 0) + wait_for_ast(&li); + } + /* Unlock suceeded, free the lock_info struct. */ + if (status == 0) { + kfree(old_li); + put_file_info(f); + } + } + + remove_wait_queue(&li.li_waitq, &wq); + + /* If this is the last reference, and the lockspace has been deleted + then free the struct */ + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) { + kfree(lsinfo->ls_miscinfo.name); + kfree(lsinfo); + } + + /* Restore signals */ + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + recalc_sigpending(); + + return 0; +} + +/* + * ioctls to create/remove lockspaces, and check how many + * outstanding ASTs there are against a particular LS. + */ +static int dlm_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + struct file_info *fi = file->private_data; + int status = -EINVAL; + int count; + struct list_head *tmp_list; + + switch (command) { + + /* Are there any ASTs for us to read? 
+ * Warning, this returns the number of messages (ASTs) + * in the queue, NOT the number of bytes to read + */ + case FIONREAD: + count = 0; + spin_lock(&fi->fi_ast_lock); + list_for_each(tmp_list, &fi->fi_ast_list) + count++; + spin_unlock(&fi->fi_ast_lock); + status = put_user(count, (int *)u); + break; + + default: + return -ENOTTY; + } + + return status; +} + +/* + * ioctls to create/remove lockspaces. + */ +static int dlm_ctl_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + int status = -EINVAL; + char ls_name[MAX_LS_NAME_LEN]; + struct user_ls *lsinfo; + int force = 0; + + switch (command) { + case DLM_CREATE_LOCKSPACE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0) + return -EFAULT; + status = register_lockspace(ls_name, &lsinfo); + + /* If it succeeded then return the minor number */ + if (status == 0) + status = lsinfo->ls_miscinfo.minor; + break; + + case DLM_FORCE_RELEASE_LOCKSPACE: + force = 2; + + case DLM_RELEASE_LOCKSPACE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + down(&user_ls_lock); + lsinfo = __find_lockspace(u); + if (!lsinfo) { + up(&user_ls_lock); + return -EINVAL; + } + + status = unregister_lockspace(lsinfo, force); + up(&user_ls_lock); + break; + + default: + return -ENOTTY; + } + + return status; +} + +/* Deal with the messy stuff of copying a web of structs + from kernel space to userspace */ +static int copy_query_result(struct ast_info *ast) +{ + int status = -EFAULT; + struct dlm_queryinfo qi; + + /* Get the pointers to userspace structs */ + if (copy_from_user(&qi, ast->user_queryinfo, + sizeof(struct dlm_queryinfo))) + goto copy_out; + + if (put_user(ast->queryinfo->gqi_lockcount, + &ast->user_queryinfo->gqi_lockcount)) + goto copy_out; + + if (qi.gqi_resinfo) { + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo, + sizeof(struct dlm_resinfo))) + goto copy_out; + } + + if (qi.gqi_lockinfo) { + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo, + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount)) + goto copy_out; + } + + status = 0; + + if (ast->queryinfo->gqi_lockinfo) + kfree(ast->queryinfo->gqi_lockinfo); + + if (ast->queryinfo->gqi_resinfo) + kfree(ast->queryinfo->gqi_resinfo); + + kfree(ast->queryinfo); + + copy_out: + return status; +} + +/* Read call, might block if no ASTs are waiting. + * It will only ever return one message at a time, regardless + * of how many are pending. + */ +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) +{ + struct file_info *fi = file->private_data; + struct ast_info *ast; + int ret; + DECLARE_WAITQUEUE(wait, current); + + if (count < sizeof(struct dlm_lock_result)) + return -EINVAL; + + spin_lock(&fi->fi_ast_lock); + if (list_empty(&fi->fi_ast_list)) { + + /* No waiting ASTs. + * Return EOF if the lockspace been deleted. 
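+ * Otherwise block (or return -EAGAIN for O_NONBLOCK opens)
+ * until an AST is queued or a signal arrives.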
+ */ + if (test_bit(1, &fi->fi_ls->ls_flags)) + return 0; + + if (file->f_flags & O_NONBLOCK) { + spin_unlock(&fi->fi_ast_lock); + return -EAGAIN; + } + + add_wait_queue(&fi->fi_wait, &wait); + + repeat: + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&fi->fi_ast_list) && + !signal_pending(current)) { + + spin_unlock(&fi->fi_ast_lock); + schedule(); + spin_lock(&fi->fi_ast_lock); + goto repeat; + } + + current->state = TASK_RUNNING; + remove_wait_queue(&fi->fi_wait, &wait); + + if (signal_pending(current)) { + spin_unlock(&fi->fi_ast_lock); + return -ERESTARTSYS; + } + } + + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list); + list_del(&ast->list); + spin_unlock(&fi->fi_ast_lock); + + ret = sizeof(struct dlm_lock_result); + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result))) + ret = -EFAULT; + + if (ast->ast_reason == AST_COMP && + ast->result.cmd == DLM_USER_LOCK && ast->user_lvbptr) { + if (copy_to_user(ast->user_lvbptr, ast->result.lksb.sb_lvbptr, DLM_LVB_LEN)) + ret = -EFAULT; + } + + /* If it was a query then copy the result block back here */ + if (ast->queryinfo) { + int status = copy_query_result(ast); + if (status) + ret = status; + } + + kfree(ast); + return ret; +} + +static unsigned int dlm_poll(struct file *file, poll_table *wait) +{ + struct file_info *fi = file->private_data; + + poll_wait(file, &fi->fi_wait, wait); + + spin_lock(&fi->fi_ast_lock); + if (!list_empty(&fi->fi_ast_list)) { + spin_unlock(&fi->fi_ast_lock); + return POLLIN | POLLRDNORM; + } + + spin_unlock(&fi->fi_ast_lock); + return 0; +} + +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams) +{ + struct lock_info *li; + int status; + + if (!kparams->castaddr) + return -EINVAL; + + if (!kparams->lksb) + return -EINVAL; + + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); + if (!li) + return -ENOMEM; + + get_file_info(fi); + li->li_user_lksb = kparams->lksb; + li->li_bastparam = kparams->bastparam; + li->li_bastaddr = kparams->bastaddr; + li->li_castparam = kparams->castparam; + li->li_castaddr = kparams->castaddr; + li->li_file = fi; + li->li_flags = 0; + li->li_cmd = kparams->cmd; + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); + + if (copy_from_user(&li->li_lksb, kparams->lksb, + sizeof(struct dlm_lksb))) { + kfree(li); + return -EFAULT; + } + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr; + + /* Allocate query structs */ + status = -ENOMEM; + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL); + if (!li->li_queryinfo) + goto out1; + + /* Mainly to get gqi_lock buffer size */ + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr, + sizeof(struct dlm_queryinfo))) { + status = -EFAULT; + goto out1; + } + + /* Overwrite userspace pointers we just copied with kernel space ones */ + if (li->li_queryinfo->gqi_resinfo) { + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL); + if (!li->li_queryinfo->gqi_resinfo) + goto out1; + } + if (li->li_queryinfo->gqi_lockinfo) { + li->li_queryinfo->gqi_lockinfo = + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize, + GFP_KERNEL); + if (!li->li_queryinfo->gqi_lockinfo) + goto out2; + } + + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo; + + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb, + kparams->flags, /* query */ + li->li_queryinfo, + ast_routine, li); + + out2: + kfree(li->li_queryinfo); + + out1: + kfree(li); + return status; +} + +static struct lock_info *allocate_lockinfo(struct file_info *fi, 
struct dlm_lock_params *kparams) +{ + struct lock_info *li; + + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); + if (li) { + li->li_magic = LOCKINFO_MAGIC; + li->li_file = fi; + li->li_cmd = kparams->cmd; + li->li_queryinfo = NULL; + li->li_flags = 0; + li->li_pend_bastparam = NULL; + li->li_pend_bastaddr = NULL; + li->li_lksb.sb_lvbptr = NULL; + li->li_bastaddr = kparams->bastaddr; + li->li_bastparam = kparams->bastparam; + + get_file_info(fi); + } + return li; +} + +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams, + const char *buffer) +{ + struct lock_info *li; + int status; + char name[DLM_RESNAME_MAXLEN]; + void *lvbptr; + + /* + * Validate things that we need to have correct. + */ + if (!kparams->castaddr) + return -EINVAL; + + if (!kparams->lksb) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, kparams->lksb, sizeof(struct dlm_lksb))) + return -EFAULT; + + /* Persistent child locks are not available yet */ + if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent) + return -EINVAL; + + /* For conversions, the lock will already have a lock_info + block squirelled away in astparam */ + if (kparams->flags & DLM_LKF_CONVERT) { + struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid); + if (!lkb) { + return -EINVAL; + } + + li = (struct lock_info *)lkb->lkb_astparam; + + /* li may be NULL if the lock was PERSISTENT and the process went + away, so we need to allocate a new one */ + if (!li) { + li = allocate_lockinfo(fi, kparams); + if (li) { + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + } + else { + return -ENOMEM; + } + } + + if (li->li_magic != LOCKINFO_MAGIC) + return -EINVAL; + + /* For conversions don't overwrite the current blocking AST + info so that: + a) if a blocking AST fires before the conversion is queued + it runs the current handler + b) if the conversion is cancelled, the original blocking AST + declaration is active + The pend_ info is made active when the conversion + completes. + */ + li->li_pend_bastaddr = kparams->bastaddr; + li->li_pend_bastparam = kparams->bastparam; + } + else { + li = allocate_lockinfo(fi, kparams); + if (!li) + return -ENOMEM; + + /* Get the lock name */ + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name), + kparams->namelen)) { + return -EFAULT; + } + + /* semaphore to allow us to complete our work before + the AST routine runs. In fact we only need (and use) this + when the initial lock fails */ + init_MUTEX_LOCKED(&li->li_firstlock); + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); + } + + li->li_user_lksb = kparams->lksb; + li->li_castaddr = kparams->castaddr; + li->li_castparam = kparams->castparam; + + /* Copy the user's LKSB into kernel space, + needed for conversions & value block operations. 
+ Save our kernel-space lvbptr first */ + lvbptr = li->li_lksb.sb_lvbptr; + if (copy_from_user(&li->li_lksb, kparams->lksb, sizeof(struct dlm_lksb))) { + status = -EFAULT; + goto out_err; + } + /* Store new userland LVBptr and restore kernel one */ + li->li_user_lvbptr = li->li_lksb.sb_lvbptr; + li->li_lksb.sb_lvbptr = lvbptr; + + /* Copy in the value block */ + if (kparams->flags & DLM_LKF_VALBLK) { + if (!li->li_lksb.sb_lvbptr) { + li->li_lksb.sb_lvbptr = kmalloc(DLM_LVB_LEN, GFP_KERNEL); + if (!li->li_lksb.sb_lvbptr) { + status = -ENOMEM; + goto out_err; + } + } + + if (copy_from_user(li->li_lksb.sb_lvbptr, kparams->lksb->sb_lvbptr, + DLM_LVB_LEN)) { + status = -EFAULT; + goto out_err; + } + } + else { + li->li_user_lvbptr = NULL; + } + + /* Lock it ... */ + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb, + kparams->flags, name, kparams->namelen, + kparams->parent, + ast_routine, + li, + (li->li_pend_bastaddr || li->li_bastaddr) ? + bast_routine : NULL, + kparams->range.ra_end ? &kparams->range : NULL); + + /* If it succeeded (this far) with a new lock then keep track of + it on the file's lkb list */ + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) { + + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + + up(&li->li_firstlock); + + /* Copy the lkid back to userspace in case they want to cancel. + This address has already been tested so /should/ be OK, if not: + tough - we've taken the lock! */ + copy_to_user(&kparams->lksb->sb_lkid, + &li->li_lksb.sb_lkid, + sizeof(li->li_lksb.sb_lkid)); + } + + return status; + + out_err: + if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) { + + release_lockinfo(li); + } + return status; + +} + +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams) +{ + struct lock_info *li; + struct dlm_lkb *lkb; + int status; + int convert_cancel = 0; + + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid); + if (!lkb) { + return -EINVAL; + } + + /* Cancelling a conversion doesn't remove the lock...*/ + if (kparams->flags & DLM_LKF_CANCEL && + lkb->lkb_status == GDLM_LKSTS_CONVERT) { + convert_cancel = 1; + } + + li = (struct lock_info *)lkb->lkb_astparam; + if (!li) { + li = allocate_lockinfo(fi, kparams); + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + } + if (!li) + return -ENOMEM; + + if (li->li_magic != LOCKINFO_MAGIC) + return -EINVAL; + + li->li_user_lksb = kparams->lksb; + li->li_castparam = kparams->castparam; + li->li_cmd = kparams->cmd; + + /* dlm_unlock() passes a 0 for castaddr which means don't overwrite + the existing li_castaddr as that's the completion routine for + unlocks. dlm_unlock_wait() specifies a new AST routine to be + executed when the unlock completes. 
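+ Either way the completion is queued back to the caller
+ through ast_routine() like any other AST.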
*/ + if (kparams->castaddr) + li->li_castaddr = kparams->castaddr; + + /* Have to do it here cos the lkb may not exist after + * dlm_unlock() */ + if (!convert_cancel) { + spin_lock(&fi->fi_lkb_lock); + list_del(&li->li_ownerqueue); + spin_unlock(&fi->fi_lkb_lock); + } + + /* Use existing lksb & astparams */ + status = dlm_unlock(fi->fi_ls->ls_lockspace, + kparams->lkid, + kparams->flags, &li->li_lksb, li); + if (status && !convert_cancel) { + /* It failed, put it back on the list */ + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + } + + return status; +} + +/* Write call, submit a locking request */ +static ssize_t dlm_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct file_info *fi = file->private_data; + struct dlm_lock_params kparams; + sigset_t tmpsig; + sigset_t allsigs; + int status; + + if (count < sizeof(kparams)-1) /* -1 because lock name is optional */ + return -EINVAL; + + /* Has the lockspace been deleted */ + if (test_bit(1, &fi->fi_ls->ls_flags)) + return -ENOENT; + + /* Get the command info */ + if (copy_from_user(&kparams, buffer, sizeof(kparams))) + return -EFAULT; + + if (check_version(&kparams)) + return -EINVAL; + + /* Block signals while we are doing this */ + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + switch (kparams.cmd) + { + case DLM_USER_LOCK: + status = do_user_lock(fi, &kparams, buffer); + break; + + case DLM_USER_UNLOCK: + status = do_user_unlock(fi, &kparams); + break; + + case DLM_USER_QUERY: + status = do_user_query(fi, &kparams); + break; + + default: + status = -EINVAL; + break; + } + /* Restore signals */ + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + recalc_sigpending(); + + if (status == 0) + return count; + else + return status; +} + +/* Called when the cluster is shutdown uncleanly, all lockspaces + have been summarily removed */ +void dlm_device_free_devices() +{ + struct user_ls *tmp; + struct user_ls *lsinfo; + + down(&user_ls_lock); + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) { + misc_deregister(&lsinfo->ls_miscinfo); + + /* Tidy up, but don't delete the lsinfo struct until + all the users have closed their devices */ + list_del(&lsinfo->ls_list); + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */ + lsinfo->ls_lockspace = NULL; + } + up(&user_ls_lock); +} + +static struct file_operations _dlm_fops = { + .open = dlm_open, + .release = dlm_close, + .ioctl = dlm_ioctl, + .read = dlm_read, + .write = dlm_write, + .poll = dlm_poll, + .owner = THIS_MODULE, +}; + +static struct file_operations _dlm_ctl_fops = { + .open = dlm_ctl_open, + .release = dlm_ctl_close, + .ioctl = dlm_ctl_ioctl, + .owner = THIS_MODULE, +}; + +/* + * Create control device + */ +int dlm_device_init(void) +{ + int r; + + INIT_LIST_HEAD(&user_ls_list); + init_MUTEX(&user_ls_lock); + + ctl_device.name = "dlm-control"; + ctl_device.fops = &_dlm_ctl_fops; + ctl_device.minor = MISC_DYNAMIC_MINOR; + + r = misc_register(&ctl_device); + if (r) { + log_print("misc_register failed for DLM control device"); + return r; + } + + return 0; +} + +void dlm_device_exit(void) +{ + misc_deregister(&ctl_device); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/device.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,19 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DEVICE_DOT_H__ +#define __DEVICE_DOT_H__ + +extern void dlm_device_free_devices(void); + +#endif /* __DEVICE_DOT_H__ */ diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/dir.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,471 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "nodes.h" +#include "lockspace.h" +#include "lowcomms.h" +#include "reccomms.h" +#include "rsb.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "util.h" + +struct resmov { + uint32_t rm_nodeid; + uint16_t rm_length; + uint16_t rm_pad; +}; + +void print_name(char *b, int len) +{ + int i; + for (i = 0; i < len; i++) + printk("%c", b[i]); + printk("\n"); +} + +static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de) +{ + spin_lock(&ls->ls_recover_list_lock); + list_add(&de->list, &ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); +} + +static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) +{ + int found = FALSE; + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry(de, &ls->ls_recover_list, list) { + if (de->length == len) { + list_del(&de->list); + de->master_nodeid = 0; + memset(de->name, 0, len); + found = TRUE; + break; + } + } + spin_unlock(&ls->ls_recover_list_lock); + + if (!found) + de = allocate_direntry(ls, len); + return de; +} + +void clear_free_de(struct dlm_ls *ls) +{ + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + while (!list_empty(&ls->ls_recover_list)) { + de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, + list); + list_del(&de->list); + free_direntry(de); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +/* + * We use the upper 16 bits of the hash value to select the directory node. + * Low bits are used for distribution of rsb's among hash buckets on each node. + * + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of + * num_nodes to the hash value. This value in the desired range is used as an + * offset into the sorted list of nodeid's to give the particular nodeid of the + * directory node. 
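+ *
+ * For example, with 3 nodes in the lockspace a name hashing to 0x00070000
+ * gives (0x0007 % 3) = 1, so the second nodeid in the sorted node list is
+ * the directory node for that resource.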
+ */ + +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length) +{ + struct list_head *tmp; + struct dlm_csb *csb = NULL; + uint32_t hash, node, n = 0, nodeid; + + if (ls->ls_num_nodes == 1) { + nodeid = our_nodeid(); + goto out; + } + + hash = dlm_hash(name, length); + node = (hash >> 16) % ls->ls_num_nodes; + + if (ls->ls_node_array) { + nodeid = ls->ls_node_array[node]; + goto out; + } + + list_for_each(tmp, &ls->ls_nodes) { + if (n++ != node) + continue; + csb = list_entry(tmp, struct dlm_csb, list); + break; + } + + DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u\n", + ls->ls_num_nodes, n, node);); + nodeid = csb->node->nodeid; + out: + return nodeid; +} + +uint32_t get_directory_nodeid(struct dlm_rsb *rsb) +{ + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name, + rsb->res_length); +} + +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) +{ + uint32_t val; + + val = dlm_hash(name, len); + val &= (ls->ls_dirtbl_size - 1); + + return val; +} + +static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de) +{ + uint32_t bucket; + + bucket = dir_hash(ls, de->name, de->length); + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); +} + +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name, + int namelen, uint32_t bucket) +{ + struct dlm_direntry *de; + + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) { + if (de->length == namelen && !memcmp(name, de->name, namelen)) + goto out; + } + de = NULL; + out: + return de; +} + +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen) +{ + struct dlm_direntry *de; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + write_lock(&ls->ls_dirtbl[bucket].lock); + + de = search_bucket(ls, name, namelen, bucket); + + if (!de) { + log_all(ls, "remove fr %u none", nodeid); + print_name(name, namelen); + goto out; + } + + if (de->master_nodeid != nodeid) { + log_all(ls, "remove fr %u ID %u", nodeid, de->master_nodeid); + print_name(name, namelen); + goto out; + } + + list_del(&de->list); + free_direntry(de); + out: + write_unlock(&ls->ls_dirtbl[bucket].lock); +} + +void dlm_dir_clear(struct dlm_ls *ls) +{ + struct list_head *head; + struct dlm_direntry *de; + int i; + + for (i = 0; i < ls->ls_dirtbl_size; i++) { + write_lock(&ls->ls_dirtbl[i].lock); + head = &ls->ls_dirtbl[i].list; + while (!list_empty(head)) { + de = list_entry(head->next, struct dlm_direntry, list); + list_del(&de->list); + put_free_de(ls, de); + } + write_unlock(&ls->ls_dirtbl[i].lock); + } +} + +static void resmov_in(struct resmov *rm, char *buf) +{ + struct resmov tmp; + + memcpy(&tmp, buf, sizeof(struct resmov)); + + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid); + rm->rm_length = be16_to_cpu(tmp.rm_length); +} + +int dlm_dir_rebuild_local(struct dlm_ls *ls) +{ + struct dlm_csb *csb; + struct dlm_direntry *de; + struct dlm_rcom *rc; + struct resmov mov, last_mov; + char *b, *last_name; + int error = -ENOMEM, count = 0; + + log_all(ls, "rebuild resource directory"); + + dlm_dir_clear(ls); + + rc = allocate_rcom_buffer(ls); + if (!rc) + goto out; + + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); + if (!last_name) + goto free_rc; + + list_for_each_entry(csb, &ls->ls_nodes, list) { + last_mov.rm_length = 0; + for (;;) { + error = dlm_recovery_stopped(ls); + if (error) + goto free_last; + + memcpy(rc->rc_buf, last_name, last_mov.rm_length); + rc->rc_datalen = last_mov.rm_length; + + error = rcom_send_message(ls, csb->node->nodeid, + 
RECCOMM_RECOVERNAMES, rc, 1); + if (error) + goto free_last; + + schedule(); + + /* + * pick each res out of buffer + */ + + b = rc->rc_buf; + + for (;;) { + resmov_in(&mov, b); + b += sizeof(struct resmov); + + /* Length of 0 with a non-zero nodeid marks the + * end of the list */ + if (!mov.rm_length && mov.rm_nodeid) + goto done; + + /* This is just the end of the block */ + if (!mov.rm_length) + break; + + DLM_ASSERT(mov.rm_nodeid == csb->node->nodeid,); + + error = -ENOMEM; + de = get_free_de(ls, mov.rm_length); + if (!de) + goto free_last; + + de->master_nodeid = mov.rm_nodeid; + de->length = mov.rm_length; + memcpy(de->name, b, mov.rm_length); + b += mov.rm_length; + + add_entry_to_hash(ls, de); + count++; + + last_mov = mov; + memset(last_name, 0, DLM_RESNAME_MAXLEN); + memcpy(last_name, de->name, de->length); + } + } + done: + ; + } + + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags); + error = 0; + + log_all(ls, "rebuilt %d resources", count); + + free_last: + kfree(last_name); + + free_rc: + free_rcom_buffer(rc); + + out: + clear_free_de(ls); + return error; +} + +/* + * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as + * many resource names as can fit in the buffer. + */ + +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, uint32_t nodeid) +{ + struct list_head *list; + struct dlm_rsb *start_rsb = NULL, *rsb; + int offset = 0, start_namelen, error; + char *start_name; + struct resmov tmp; + uint32_t dir_nodeid; + + /* + * Find the rsb where we left off (or start again) + */ + + start_namelen = inlen; + start_name = inbuf; + + if (start_namelen > 1) { + error = find_rsb(ls, NULL, start_name, start_namelen, 0, + &start_rsb); + DLM_ASSERT(!error && start_rsb, printk("error %d\n", error);); + release_rsb(start_rsb); + } + + /* + * Send rsb names for rsb's we're master of and whose directory node + * matches the requesting node. + */ + + down_read(&ls->ls_root_lock); + if (start_rsb) + list = start_rsb->res_rootlist.next; + else + list = ls->ls_rootres.next; + + for (offset = 0; list != &ls->ls_rootres; list = list->next) { + rsb = list_entry(list, struct dlm_rsb, res_rootlist); + if (rsb->res_nodeid) + continue; + + dir_nodeid = get_directory_nodeid(rsb); + if (dir_nodeid != nodeid) + continue; + + if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) { + /* Write end-of-block record */ + memset(&tmp, 0, sizeof(struct resmov)); + memcpy(outbuf + offset, &tmp, sizeof(struct resmov)); + offset += sizeof(struct resmov); + goto out; + } + + memset(&tmp, 0, sizeof(struct resmov)); + tmp.rm_nodeid = cpu_to_be32(our_nodeid()); + tmp.rm_length = cpu_to_be16(rsb->res_length); + + memcpy(outbuf + offset, &tmp, sizeof(struct resmov)); + offset += sizeof(struct resmov); + + memcpy(outbuf + offset, rsb->res_name, rsb->res_length); + offset += rsb->res_length; + } + + /* + * If we've reached the end of the list (and there's room) write a + * terminating record. 
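+ *
+ * On the wire each entry is a struct resmov (big-endian nodeid and name
+ * length) followed by the name bytes, which is exactly what
+ * dlm_dir_rebuild_local() parses above: a zero length ends a block, and
+ * a zero length with a non-zero nodeid ends the whole listing.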
+ */ + + if ((list == &ls->ls_rootres) && + (offset + sizeof(struct resmov) <= outlen)) { + + memset(&tmp, 0, sizeof(struct resmov)); + /* This only needs to be non-zero */ + tmp.rm_nodeid = cpu_to_be32(1); + /* and this must be zero */ + tmp.rm_length = 0; + memcpy(outbuf + offset, &tmp, sizeof(struct resmov)); + offset += sizeof(struct resmov); + } + + out: + up_read(&ls->ls_root_lock); + return offset; +} + +static int get_entry(struct dlm_ls *ls, uint32_t nodeid, char *name, + int namelen, uint32_t *r_nodeid) +{ + struct dlm_direntry *de, *tmp; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + write_lock(&ls->ls_dirtbl[bucket].lock); + de = search_bucket(ls, name, namelen, bucket); + if (de) { + *r_nodeid = de->master_nodeid; + write_unlock(&ls->ls_dirtbl[bucket].lock); + if (*r_nodeid == nodeid) + return -EEXIST; + return 0; + } + + write_unlock(&ls->ls_dirtbl[bucket].lock); + + de = allocate_direntry(ls, namelen); + if (!de) + return -ENOMEM; + + de->master_nodeid = nodeid; + de->length = namelen; + memcpy(de->name, name, namelen); + + write_lock(&ls->ls_dirtbl[bucket].lock); + tmp = search_bucket(ls, name, namelen, bucket); + if (tmp) { + free_direntry(de); + de = tmp; + } else { + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); + } + *r_nodeid = de->master_nodeid; + write_unlock(&ls->ls_dirtbl[bucket].lock); + return 0; +} + +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen, + uint32_t *r_nodeid) +{ + return get_entry(ls, nodeid, name, namelen, r_nodeid); +} + +/* + * The node with lowest id queries all nodes to determine when all are done. + * All other nodes query the low nodeid for this. + */ + +int dlm_dir_rebuild_wait(struct dlm_ls *ls) +{ + int error; + + if (ls->ls_low_nodeid == our_nodeid()) { + error = dlm_wait_status_all(ls, RESDIR_VALID); + if (!error) + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags); + } else + error = dlm_wait_status_low(ls, RESDIR_ALL_VALID); + + return error; +} diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/dir.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +void print_name(char *b, int len); +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length); +uint32_t get_directory_nodeid(struct dlm_rsb *rsb); + +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen, + uint32_t *r_nodeid); +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, + int namelen); +int dlm_dir_rebuild_local(struct dlm_ls *ls); +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, uint32_t nodeid); +int dlm_dir_rebuild_wait(struct dlm_ls * ls); +void dlm_dir_clear(struct dlm_ls *ls); +void dlm_dir_dump(struct dlm_ls *ls); +void clear_free_de(struct dlm_ls *ls); + +#endif /* __DIR_DOT_H__ */ diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,612 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_INTERNAL_DOT_H__ +#define __DLM_INTERNAL_DOT_H__ + +/* + * This is the main header file to be included in each DLM source file. + */ + +#define DLM_RELEASE_NAME "" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef TRUE +#define TRUE (1) +#endif + +#ifndef FALSE +#define FALSE (0) +#endif + +#if (BITS_PER_LONG == 64) +#define PRIu64 "lu" +#define PRId64 "ld" +#define PRIo64 "lo" +#define PRIx64 "lx" +#define PRIX64 "lX" +#define SCNu64 "lu" +#define SCNd64 "ld" +#define SCNo64 "lo" +#define SCNx64 "lx" +#define SCNX64 "lX" +#else +#define PRIu64 "Lu" +#define PRId64 "Ld" +#define PRIo64 "Lo" +#define PRIx64 "Lx" +#define PRIX64 "LX" +#define SCNu64 "Lu" +#define SCNd64 "Ld" +#define SCNo64 "Lo" +#define SCNx64 "Lx" +#define SCNX64 "LX" +#endif + +#define wchan_cond_sleep_intr(chan, sleep_cond) \ +do \ +{ \ + DECLARE_WAITQUEUE(__wait_chan, current); \ + current->state = TASK_INTERRUPTIBLE; \ + add_wait_queue(&chan, &__wait_chan); \ + if ((sleep_cond)) \ + schedule(); \ + remove_wait_queue(&chan, &__wait_chan); \ + current->state = TASK_RUNNING; \ +} \ +while (0) + +static inline int check_timeout(unsigned long stamp, unsigned int seconds) +{ + return time_after(jiffies, stamp + seconds * HZ); +} + + +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args) + +#define log_all(ls, fmt, args...) 
\ + do { \ + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \ + dlm_debug_log(ls, fmt, ##args); \ + } while (0) + +#define log_error log_all + +#if defined(DLM_DEBUG2) +int nibbler_printf(const char *fmt, ...); +#define log_debug2(fmt, args...) nibbler_printf(fmt"\n", ##args) +#else +#define log_debug2(fmt, args...) +#endif + +#define DLM_DEBUG +#if defined(DLM_DEBUG) +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args) +#else +#define log_debug(ls, fmt, args...) +#endif + +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL) +#undef log_debug +#define log_debug log_all +#endif + + +#define DLM_ASSERT(x, do) \ +{ \ + if (!(x)) \ + { \ + dlm_locks_dump(); \ + dlm_debug_dump(); \ + printk("\nDLM: Assertion failed on line %d of file %s\n" \ + "DLM: assertion: \"%s\"\n" \ + "DLM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + BUG(); \ + panic("DLM: Record message above and reboot.\n"); \ + } \ +} + + +struct dlm_ls; +struct dlm_lkb; +struct dlm_rsb; +struct dlm_csb; +struct dlm_node; +struct dlm_lkbtable; +struct dlm_rsbtable; +struct dlm_dirtable; +struct dlm_direntry; +struct dlm_recover; +struct dlm_header; +struct dlm_request; +struct dlm_reply; +struct dlm_rcom; +struct dlm_query_request; +struct dlm_query_reply; + + +struct dlm_direntry { + struct list_head list; + uint32_t master_nodeid; + uint16_t length; + char name[1]; +}; + +struct dlm_dirtable { + struct list_head list; + rwlock_t lock; +}; + +struct dlm_rsbtable { + struct list_head list; + rwlock_t lock; +}; + +struct dlm_lkbtable { + struct list_head list; + rwlock_t lock; + uint16_t counter; +}; + +/* + * Cluster node (per node in cluster) + */ + +struct dlm_node { + struct list_head list; + uint32_t nodeid; + atomic_t refcount; /* num csb's referencing */ +}; + +/* + * Cluster System Block (per node in a ls) + */ + +struct dlm_csb { + struct list_head list; /* per-lockspace node list */ + struct dlm_node * node; /* global node structure */ + int gone_event; /* event id when node removed */ +}; + +/* + * Used to save and manage recovery state for a lockspace. 
+ */ + +struct dlm_recover { + struct list_head list; + uint32_t * nodeids; + int node_count; + int event_id; +}; + +/* + * Elements in the range array + */ + +#define GR_RANGE_START (0) +#define GR_RANGE_END (1) +#define RQ_RANGE_START (2) +#define RQ_RANGE_END (3) + +/* + * Lockspace structure + */ + +#define LSFL_WORK (0) +#define LSFL_LS_RUN (1) +#define LSFL_LS_STOP (2) +#define LSFL_LS_START (3) +#define LSFL_LS_FINISH (4) +#define LSFL_RECCOMM_WAIT (5) +#define LSFL_RECCOMM_READY (6) +#define LSFL_NOTIMERS (7) +#define LSFL_FINISH_RECOVERY (8) +#define LSFL_RESDIR_VALID (9) +#define LSFL_ALL_RESDIR_VALID (10) +#define LSFL_NODES_VALID (11) +#define LSFL_ALL_NODES_VALID (12) +#define LSFL_REQUEST_WARN (13) +#define LSFL_RECOVERD_EXIT (14) + +#define LSST_NONE (0) +#define LSST_INIT (1) +#define LSST_INIT_DONE (2) +#define LSST_CLEAR (3) +#define LSST_WAIT_START (4) +#define LSST_RECONFIG_DONE (5) + +struct dlm_ls { + struct list_head ls_list; /* list of lockspaces */ + uint32_t ls_local_id; /* local unique lockspace ID */ + uint32_t ls_global_id; /* global unique lockspace ID */ + int ls_allocation; /* Memory allocation policy */ + int ls_count; /* reference count */ + unsigned long ls_flags; /* LSFL_ */ + + struct dlm_rsbtable * ls_rsbtbl; + uint32_t ls_rsbtbl_size; + + struct dlm_lkbtable * ls_lkbtbl; + uint32_t ls_lkbtbl_size; + + struct dlm_dirtable * ls_dirtbl; + uint32_t ls_dirtbl_size; + + struct list_head ls_nodes; /* current nodes in ls */ + struct list_head ls_nodes_gone; /* dead node list, recovery */ + uint32_t ls_num_nodes; /* number of nodes in ls */ + uint32_t ls_low_nodeid; + uint32_t * ls_node_array; + + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a + parent lock racing with a + new child lock */ + + struct list_head ls_deadlockq; /* List of locks in conversion + ordered by duetime. 
for + deadlock detection */ + + /* recovery related */ + + struct task_struct * ls_recoverd_task; + struct semaphore ls_recoverd_lock; + struct list_head ls_recover; /* dlm_recover structs */ + spinlock_t ls_recover_lock; + int ls_last_stop; + int ls_last_start; + int ls_last_finish; + int ls_state; /* recovery states */ + + struct rw_semaphore ls_in_recovery; /* block local requests */ + struct list_head ls_requestqueue;/* queue remote requests */ + struct semaphore ls_requestqueue_lock; + + struct dlm_rcom * ls_rcom; /* recovery comms */ + uint32_t ls_rcom_msgid; + struct semaphore ls_rcom_lock; + + struct list_head ls_recover_list; + spinlock_t ls_recover_list_lock; + int ls_recover_list_count; + wait_queue_head_t ls_wait_general; + + struct list_head ls_rootres; /* root resources */ + struct rw_semaphore ls_root_lock; /* protect rootres list */ + + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees + we're deserialising */ + int ls_namelen; + char ls_name[1]; +}; + +/* + * Resource block + */ + +#define RESFL_NEW_MASTER (0) +#define RESFL_RECOVER_LIST (1) +#define RESFL_MASTER (2) + +struct dlm_rsb { + struct list_head res_hashchain; + uint32_t res_bucket; + + struct dlm_ls * res_ls; /* The owning lockspace */ + + struct list_head res_rootlist; /* List of root rsb's */ + + struct list_head res_subreslist; /* List of all sub-resources + for this root rsb */ + + uint8_t res_depth; /* Depth in resource tree */ + unsigned long res_flags; /* Flags, RESFL_ */ + + struct list_head res_grantqueue; + struct list_head res_convertqueue; + struct list_head res_waitqueue; + + uint32_t res_nodeid; /* nodeid of master node */ + + struct dlm_rsb * res_root; /* root rsb if a subresource */ + struct dlm_rsb * res_parent; /* parent rsb (if any) */ + + atomic_t res_ref; /* Number of lkb's */ + uint16_t res_remasterid; /* ID used during remaster */ + + struct list_head res_recover_list; /* General list for use + during recovery */ + int res_recover_msgid; + int res_newlkid_expect; + + struct rw_semaphore res_lock; + + char * res_lvbptr; /* Lock value block */ + + uint8_t res_length; + char res_name[1]; /* bytes */ +}; + +/* + * Lock block. To avoid confusion, where flags mirror the public flags, they + * should have the same value. + * + * In general, DLM_LKF flags from dlm.h apply only to lkb_lockqueue_flags + * and GDLM_LKFLG flags from dlm_internal.h apply only to lkb_flags. + * The rr_flags field in the request struct is a copy of lkb_lockqueue_flags. + * There is one dangerous exception: GDLM_LKFLG_RANGE is set in rr_flags + * when sending a remote range lock request. This value is then copied into + * the remote lkb_lockqueue_flags field. This means GDLM_LKFLG_RANGE must + * not have the same value as any external DLM_LKF flag. 
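+ *
+ * (As the definitions below show, the internal-only flags start at
+ * 0x00010000 while the mirrored external values stop at 0x00004000, so
+ * the two ranges cannot collide.)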
+ */ + +#define GDLM_LKSTS_NEW (0) +#define GDLM_LKSTS_WAITING (1) +#define GDLM_LKSTS_GRANTED (2) +#define GDLM_LKSTS_CONVERT (3) + +/* mirror external flags */ +#define GDLM_LKFLG_VALBLK (0x00000008) +#define GDLM_LKFLG_PERSISTENT (0x00000080) +#define GDLM_LKFLG_NODLCKWT (0x00000100) +#define GDLM_LKFLG_EXPEDITE (0x00000400) +#define GDLM_LKFLG_ORPHAN (0x00004000) +/* external flags now go up to: (0x00004000) : DLM_LKF_ORPHAN */ + +/* internal-only flags */ +#define GDLM_LKFLG_RANGE (0x00010000) +#define GDLM_LKFLG_MSTCPY (0x00020000) +#define GDLM_LKFLG_DELETED (0x00040000) +#define GDLM_LKFLG_LQCONVERT (0x00080000) +#define GDLM_LKFLG_LQRESEND (0x00100000) +#define GDLM_LKFLG_DEMOTED (0x00200000) +#define GDLM_LKFLG_RESENT (0x00400000) +#define GDLM_LKFLG_NOREBUILD (0x00800000) +#define GDLM_LKFLG_UNLOCKDONE (0x01000000) + +#define AST_COMP (1) +#define AST_BAST (2) +#define AST_DEL (4) + +struct dlm_lkb { + uint32_t lkb_flags; + uint16_t lkb_status; /* grant, wait, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ + uint32_t lkb_retstatus; /* status to return in lksb */ + uint32_t lkb_id; /* our lock ID */ + struct dlm_lksb * lkb_lksb; /* status block of caller */ + struct list_head lkb_idtbl_list; /* lockidtbl */ + struct list_head lkb_statequeue; /* rsb's g/c/w queue */ + struct dlm_rsb * lkb_resource; + struct dlm_lkb * lkb_parent; /* parent lock if any */ + atomic_t lkb_childcnt; /* number of children */ + + struct list_head lkb_lockqueue; /* queue of locks waiting + for remote reply */ + int lkb_lockqueue_state; /* reason on lockqueue */ + uint32_t lkb_lockqueue_flags; /* as passed into + lock/unlock */ + int lkb_ownpid; /* pid of lock owner */ + unsigned long lkb_lockqueue_time; /* time lkb went on the + lockqueue */ + unsigned long lkb_duetime; /* for deadlock detection */ + + uint32_t lkb_remid; /* id on remote partner */ + uint32_t lkb_nodeid; /* id of remote partner */ + void * lkb_astaddr; + void * lkb_bastaddr; + long lkb_astparam; + struct list_head lkb_astqueue; /* locks with asts to deliver */ + uint16_t lkb_astflags; /* COMP, BAST, DEL */ + uint8_t lkb_bastmode; /* requested mode */ + uint8_t lkb_highbast; /* highest mode bast sent for */ + + struct dlm_request * lkb_request; + + struct list_head lkb_deadlockq; /* ls_deadlockq list */ + + char * lkb_lvbptr; /* points to lksb lvb on local + lock, allocated lvb on + on remote lock */ + uint64_t * lkb_range; /* Points to an array of 64 bit + numbers that represent the + requested and granted ranges + of the lock. NULL implies + 0-ffffffffffffffff */ +}; + +/* + * Header part of the mid-level comms system. All packets start with + * this header so we can identify them. The comms packet can + * contain many of these structs but the are split into individual + * work units before being passed to the lockqueue routines. + * below this are the structs that this is a header for + */ + +struct dlm_header { + uint8_t rh_cmd; /* What we are */ + uint8_t rh_flags; /* maybe just a pad */ + uint16_t rh_length; /* Length of struct (so we can + send many in 1 message) */ + uint32_t rh_lkid; /* Lock ID tag: ie the local + (requesting) lock ID */ + uint32_t rh_lockspace; /* Lockspace ID */ +} __attribute__((packed)); + +/* + * This is the struct used in a remote lock/unlock/convert request + * The mid-level comms API should turn this into native byte order. + * Most "normal" lock operations will use these two structs for + * communications. 
Recovery operations use their own structs + * but still with the gd_req_header on the front. + */ + +struct dlm_request { + struct dlm_header rr_header; + uint32_t rr_remlkid; /* Remote lock ID */ + uint32_t rr_remparid; /* Parent's remote lock ID */ + uint32_t rr_flags; /* Flags from lock/convert req*/ + uint64_t rr_range_start; /* Yes, these are in the right + place... */ + uint64_t rr_range_end; + uint32_t rr_status; /* Status to return if this is + an AST request */ + uint32_t rr_pid; /* Owner PID of lock */ + uint8_t rr_rqmode; /* Requested lock mode */ + uint8_t rr_asts; /* Whether the LKB has ASTs */ + char rr_lvb[DLM_LVB_LEN]; + char rr_name[1]; /* As long as needs be. Only + used for directory lookups. + The length of this can be + worked out from the packet + length */ +} __attribute__((packed)); + +/* + * This is the struct returned by a remote lock/unlock/convert request + * The mid-level comms API should turn this into native byte order. + */ + +struct dlm_reply { + struct dlm_header rl_header; + uint32_t rl_lockstate; /* Whether request was + queued/granted/waiting */ + uint32_t rl_nodeid; /* nodeid of lock master */ + uint32_t rl_status; /* Status to return to caller */ + uint32_t rl_lkid; /* Remote lkid */ + char rl_lvb[DLM_LVB_LEN]; +} __attribute__((packed)); + +/* + * Recovery comms message + */ + +struct dlm_rcom { + struct dlm_header rc_header; /* 32 byte aligned */ + uint32_t rc_msgid; + uint16_t rc_datalen; + uint8_t rc_expanded; + uint8_t rc_subcmd; /* secondary command */ + char rc_buf[1]; /* first byte of data goes here + and extends beyond here for + another datalen - 1 bytes. + rh_length is set to sizeof + dlm_rcom + datalen - 1 */ +} __attribute__((packed)); + + +/* A remote query: GDLM_REMCMD_QUERY */ + +struct dlm_query_request { + struct dlm_header rq_header; + uint32_t rq_mstlkid; /* LockID on master node */ + uint32_t rq_query; /* query from the user */ + uint32_t rq_maxlocks; /* max number of locks we can + cope with */ +} __attribute__((packed)); + +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */ +/* There may be subsequent blocks of + lock info in GDLM_REMCMD_QUERYCONT messages which just have + a normal header. The last of these will have rh_flags set to + GDLM_REMFLAG_ENDQUERY + */ + +struct dlm_query_reply { + struct dlm_header rq_header; + uint32_t rq_numlocks; /* Number of locks in reply */ + uint32_t rq_startlock; /* Which lock this block starts + at (for multi-block replies) */ + uint32_t rq_status; + + /* Resource information */ + uint32_t rq_grantcount; /* No. of nodes on grantqueue */ + uint32_t rq_convcount; /* No. of nodes on convertq */ + uint32_t rq_waitcount; /* No. 
of nodes on waitqueue */ + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB + contents, if + applicable */ +} __attribute__((packed)); + +/* + * Lockqueue wait lock states + */ + +#define GDLM_LQSTATE_WAIT_RSB 1 +#define GDLM_LQSTATE_WAIT_CONVERT 2 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3 +#define GDLM_LQSTATE_WAIT_UNLOCK 4 + +/* Commands sent across the comms link */ +#define GDLM_REMCMD_LOOKUP 1 +#define GDLM_REMCMD_LOCKREQUEST 2 +#define GDLM_REMCMD_UNLOCKREQUEST 3 +#define GDLM_REMCMD_CONVREQUEST 4 +#define GDLM_REMCMD_LOCKREPLY 5 +#define GDLM_REMCMD_LOCKGRANT 6 +#define GDLM_REMCMD_SENDBAST 7 +#define GDLM_REMCMD_SENDCAST 8 +#define GDLM_REMCMD_REM_RESDATA 9 +#define GDLM_REMCMD_RECOVERMESSAGE 20 +#define GDLM_REMCMD_RECOVERREPLY 21 +#define GDLM_REMCMD_QUERY 30 +#define GDLM_REMCMD_QUERYREPLY 31 + +/* Set in rh_flags when this is the last block of + query information. Note this could also be the first + block */ +#define GDLM_REMFLAG_ENDQUERY 1 + +#ifdef CONFIG_DLM_STATS +struct dlm_statinfo +{ + unsigned int cast; + unsigned int bast; + unsigned int lockops; + unsigned int unlockops; + unsigned int convertops; + unsigned long lockqueue_time[5]; + unsigned long lockqueue_locks[5]; +}; +extern struct dlm_statinfo dlm_stats; +#endif + +#ifndef BUG_ON +#define BUG_ON(x) +#endif + +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...); +void dlm_debug_dump(void); +void dlm_locks_dump(void); + +#endif /* __DLM_INTERNAL_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lkb.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,183 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lkb.c + * + * Allocate and free locks on the lock ID table. + * + * This is slightly naff but I don't really like the + * VMS lockidtbl stuff as it uses a realloced array + * to hold the locks in. I think this is slightly better + * in some ways. + * + * Any better suggestions gratefully received. Patrick + * + */ + +#include "dlm_internal.h" +#include "lockqueue.h" +#include "lkb.h" +#include "config.h" +#include "rsb.h" +#include "memory.h" +#include "lockspace.h" +#include "util.h" + +/* + * Internal find lock by ID. Must be called with the lockidtbl spinlock held. + */ + +static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid) +{ + uint16_t bucket = lkid & 0xFFFF; + struct dlm_lkb *lkb; + + if (bucket >= ls->ls_lkbtbl_size) + goto out; + + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){ + if (lkb->lkb_id == lkid) + return lkb; + } + out: + return NULL; +} + +/* + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a + * random number between 0 and lockidtbl_size-1. This random number specifies + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially + * assigned per-bucket id. 
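+ * (Illustrative example: an lkid of 0x0007000c decodes to bucket 0x000c
+ * and per-bucket sequence 0x0007, matching how create_lkb() builds ids
+ * below: lkid = bucket | (counter++ << 16).)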
+ * + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked + * against the lkid of all lkb's in the bucket to avoid duplication. + * + */ + +struct dlm_lkb *create_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + uint32_t lkid; + uint16_t bucket; + + lkb = allocate_lkb(ls); + if (!lkb) + goto out; + + retry: + get_random_bytes(&bucket, sizeof(bucket)); + bucket &= (ls->ls_lkbtbl_size - 1); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16); + + if (__find_lock_by_id(ls, lkid)) { + write_unlock(&ls->ls_lkbtbl[bucket].lock); + goto retry; + } + + lkb->lkb_id = lkid; + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + out: + return lkb; +} + +/* + * Free LKB and remove it from the lockidtbl. + * NB - this always frees the lkb whereas release_rsb doesn't free an + * rsb unless its reference count is zero. + */ + +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + uint16_t bucket = lkb->lkb_id & 0xFFFF; + + if (lkb->lkb_status) { + log_error(ls, "release lkb with status %u", lkb->lkb_status); + print_lkb(lkb); + return; + } + + if (lkb->lkb_parent) + atomic_dec(&lkb->lkb_parent->lkb_childcnt); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + list_del(&lkb->lkb_idtbl_list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + + /* if this is not a master copy then lvbptr points into the user's + * lksb, so don't free it */ + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY) + free_lvb(lkb->lkb_lvbptr); + + if (lkb->lkb_range) + free_range(lkb->lkb_range); + + free_lkb(lkb); +} + +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid) +{ + struct dlm_lkb *lkb; + uint16_t bucket = lkid & 0xFFFF; + + read_lock(&ls->ls_lkbtbl[bucket].lock); + lkb = __find_lock_by_id(ls, lkid); + read_unlock(&ls->ls_lkbtbl[bucket].lock); + + return lkb; +} + +struct dlm_lkb *dlm_get_lkb(void *lockspace, uint32_t lkid) +{ + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace); + struct dlm_lkb *lkb = find_lock_by_id(ls, lkid); + put_lockspace(ls); + return lkb; +} + +/* + * Initialise the range parts of an LKB. + */ + +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end) +{ + int ret = -ENOMEM; + + /* + * if this wasn't already a range lock, make it one + */ + if (!lkb->lkb_range) { + lkb->lkb_range = allocate_range(lspace); + if (!lkb->lkb_range) + goto out; + + /* + * This is needed for conversions that contain ranges where the + * original lock didn't but it's harmless for new locks too. + */ + lkb->lkb_range[GR_RANGE_START] = 0LL; + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL; + } + + lkb->lkb_range[RQ_RANGE_START] = start; + lkb->lkb_range[RQ_RANGE_END] = end; + + ret = 0; + + out: + return ret; +} diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lkb.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,23 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LKB_DOT_H__ +#define __LKB_DOT_H__ + +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid); +struct dlm_lkb *create_lkb(struct dlm_ls *ls); +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb); +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid); +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end); + +#endif /* __LKB_DOT_H__ */ diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/locking.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1378 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * locking.c + * + * This is where the main work of the DLM goes on + * + */ + +#include "dlm_internal.h" +#include "lockqueue.h" +#include "locking.h" +#include "lockspace.h" +#include "lkb.h" +#include "nodes.h" +#include "dir.h" +#include "ast.h" +#include "memory.h" +#include "rsb.h" +#include "util.h" +#include "lowcomms.h" + +extern struct list_head lslist; + +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + +/* + * Lock compatibilty matrix - thanks Steve + * UN = Unlocked state. Not really a state, used as a flag + * PD = Padding. Used to make the matrix a nice power of two in size + * Other states are the same as the VMS DLM. + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) + */ + +#define modes_compat(gr, rq) \ + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] + +const int __dlm_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * Compatibility matrix for conversions with QUECVT set. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + */ + +const int __quecvt_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * This defines the direction of transfer of LVB data. + * Granted mode is the row; requested mode is the column. 
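+ * (For example, per the table below, a PR lock converting up to EX has
+ * the resource LVB returned to the caller, while an EX lock converting
+ * down to NL writes the caller's LVB out to the resource.)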
+ * Usage: matrix[grmode+1][rqmode+1] + * 1 = LVB is returned to the caller + * 0 = LVB is written to the resource + * -1 = nothing happens to the LVB + */ + +const int __lvb_operations[8][8] = { + /* UN NL CR CW PR PW EX PD*/ + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ +}; + +static void grant_lock(struct dlm_lkb *lkb, int send_remote); +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb); +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb); +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb, + uint32_t flags, void *ast, void *astarg, void *bast, + struct dlm_range *range); +static int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, + uint32_t flags, char *name, int namelen); + + +inline int dlm_modes_compat(int mode1, int mode2) +{ + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; +} + +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) +{ + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue); + + if (lkb->lkb_id == first->lkb_id) + return 1; + + return 0; +} + +/* + * Return 1 if the locks' ranges overlap + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff + */ + +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2) +{ + if (!lkb1->lkb_range || !lkb2->lkb_range) + return 1; + + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] || + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END]) + return 0; + + return 1; +} + +/* + * "A conversion deadlock arises with a pair of lock requests in the converting + * queue for one resource. The granted mode of each lock blocks the requested + * mode of the other lock." + */ + +static struct dlm_lkb *conversion_deadlock_detect(struct dlm_rsb *rsb, + struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { + if (this == lkb) + continue; + + if (!ranges_overlap(lkb, this)) + continue; + + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) + return this; + } + + return NULL; +} + +/* + * Check if the given lkb conflicts with another lkb on the queue. + */ + +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, head, lkb_statequeue) { + if (this == lkb) + continue; + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb)) + return TRUE; + } + return FALSE; +} + +/* + * Return 1 if the lock can be granted, 0 otherwise. + * Also detect and resolve conversion deadlocks. + * + * lkb is the lock to be granted + * + * now is 1 if the function is being called in the context of the + * immediate request, it is 0 if called later, after the lock has been + * queued. + * + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis + */ + +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) +{ + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); + + /* + * 6-10: Version 5.4 introduced an option to address the phenomenon of + * a new request for a NL mode lock being blocked. + * + * 6-11: If the optional EXPEDITE flag is used with the new NL mode + * request, then it would be granted. 
In essence, the use of this flag + * tells the Lock Manager to expedite theis request by not considering + * what may be in the CONVERTING or WAITING queues... As of this + * writing, the EXPEDITE flag can be used only with new requests for NL + * mode locks. This flag is not valid for conversion requests. + * + * A shortcut. Earlier checks return an error if EXPEDITE is used in a + * conversion or used with a non-NL requested mode. We also know an + * EXPEDITE request is always granted immediately, so now must always + * be 1. The full condition to grant an expedite request: (now && + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can + * therefore be shortened to just checking the flag. + */ + + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE) + return TRUE; + + /* + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be + * added to the remaining conditions. + */ + + if (queue_conflict(&r->res_grantqueue, lkb)) + goto out; + + /* + * 6-3: By default, a conversion request is immediately granted if the + * requested mode is compatible with the modes of all other granted + * locks + */ + + if (queue_conflict(&r->res_convertqueue, lkb)) + goto out; + + /* + * 6-5: But the default algorithm for deciding whether to grant or + * queue conversion requests does not by itself guarantee that such + * requests are serviced on a "first come first serve" basis. This, in + * turn, can lead to a phenomenon known as "indefinate postponement". + * + * 6-7: This issue is dealt with by using the optional QUECVT flag with + * the system service employed to request a lock conversion. This flag + * forces certain conversion requests to be queued, even if they are + * compatible with the granted modes of other locks on the same + * resource. Thus, the use of this flag results in conversion requests + * being ordered on a "first come first servce" basis. + */ + + if (now && conv && !(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)) + return TRUE; + + /* + * When using range locks the NOORDER flag is set to avoid the standard + * vms rules on grant order. + */ + + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOORDER) + return TRUE; + + /* + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be + * granted until all other conversion requests ahead of it are granted + * and/or canceled. + */ + + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) + return TRUE; + + /* + * 6-4: By default, a new request is immediately granted only if all + * three of the following conditions are satisfied when the request is + * issued: + * - The queue of ungranted conversion requests for the resource is + * empty. + * - The queue of ungranted new requests for the resource is empty. + * - The mode of the new request is compatible with the most + * restrictive mode of all granted locks on the resource. + */ + + if (now && !conv && list_empty(&r->res_convertqueue) && + list_empty(&r->res_waitqueue)) + return TRUE; + + /* + * 6-4: Once a lock request is in the queue of ungranted new requests, + * it cannot be granted until the queue of ungranted conversion + * requests is empty, all ungranted new requests ahead of it are + * granted and/or canceled, and it is compatible with the granted mode + * of the most restrictive lock granted on the resource. + */ + + if (!now && !conv && list_empty(&r->res_convertqueue) && + first_in_list(lkb, &r->res_waitqueue)) + return TRUE; + + out: + /* + * The following, enabled by CONVDEADLK, departs from VMS. 
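+ *
+ * Illustrative example: two locks on one resource both hold PR and both
+ * request conversion to EX; each granted PR blocks the other's requested
+ * EX, so neither conversion can ever complete. With CONVDEADLK set the
+ * requesting lock is demoted to NL (and flagged DEMOTED) so that the
+ * other conversion can go ahead.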
+ */ + + if (now && conv && (lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK) && + conversion_deadlock_detect(r, lkb)) { + lkb->lkb_grmode = DLM_LOCK_NL; + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED; + } + + return FALSE; +} + +int dlm_lock(void *lockspace, + uint32_t mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent, + void (*ast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode), + struct dlm_range *range) +{ + struct dlm_ls *lspace; + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL; + int ret = -EINVAL; + + lspace = find_lockspace_by_local_id(lockspace); + if (!lspace) + return ret; + + if (mode < 0 || mode > DLM_LOCK_EX) + goto out; + + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) + goto out; + + if (flags & DLM_LKF_CANCEL) + goto out; + + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && (mode != DLM_LOCK_NL)) + goto out; + + if (!ast || !lksb) + goto out; + + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) + goto out; + + /* + * Take conversion path. + */ + + if (flags & DLM_LKF_CONVERT) { + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg, + bast, range); + goto out; + } + +#ifdef CONFIG_DLM_STATS + dlm_stats.lockops++; +#endif + /* + * Take new lock path. + */ + + if (parent) { + down_read(&lspace->ls_unlock_sem); + + parent_lkb = find_lock_by_id(lspace, parent); + + if (!parent_lkb || + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED || + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY || + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) { + up_read(&lspace->ls_unlock_sem); + goto out; + } + + atomic_inc(&parent_lkb->lkb_childcnt); + up_read(&lspace->ls_unlock_sem); + } + + down_read(&lspace->ls_in_recovery); + + ret = -ENOMEM; + + lkb = create_lkb(lspace); + if (!lkb) + goto fail_dec; + lkb->lkb_astaddr = ast; + lkb->lkb_astparam = (long) astarg; + lkb->lkb_bastaddr = bast; + lkb->lkb_rqmode = mode; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_nodeid = -1; + lkb->lkb_lksb = lksb; + lkb->lkb_parent = parent_lkb; + lkb->lkb_lockqueue_flags = flags; + lkb->lkb_lvbptr = lksb->sb_lvbptr; + + if (!in_interrupt() && current) + lkb->lkb_ownpid = (int) current->pid; + else + lkb->lkb_ownpid = 0; + + if (range) { + if (range->ra_start > range->ra_end) { + ret = -EINVAL; + goto fail_free; + } + + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end)) + goto fail_free; + } + + /* Convert relevant flags to internal numbers */ + if (flags & DLM_LKF_VALBLK) + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; + if (flags & DLM_LKF_PERSISTENT) + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT; + if (flags & DLM_LKF_NODLCKWT) + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT; + + lksb->sb_lkid = lkb->lkb_id; + + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen); + if (ret) + goto fail_free; + + up_read(&lspace->ls_in_recovery); + + wake_astd(); + + put_lockspace(lspace); + return 0; + + fail_free: + release_lkb(lspace, lkb); + goto fail_unlock; + + fail_dec: + if (parent_lkb) + atomic_dec(&parent_lkb->lkb_childcnt); + + fail_unlock: + up_read(&lspace->ls_in_recovery); + + out: + put_lockspace(lspace); + 
return ret; +} + +int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, uint32_t flags, + char *name, int namelen) +{ + struct dlm_rsb *rsb, *parent_rsb = NULL; + struct dlm_lkb *parent_lkb = lkb->lkb_parent; + uint32_t nodeid; + int error, dir_error = 0; + + if (parent_lkb) + parent_rsb = parent_lkb->lkb_resource; + + error = find_rsb(ls, parent_rsb, name, namelen, CREATE, &rsb); + if (error) + return error; + lkb->lkb_resource = rsb; + down_write(&rsb->res_lock); + + log_debug(ls, "(%d) rq %u %x \"%s\"", lkb->lkb_ownpid, lkb->lkb_rqmode, + lkb->lkb_id, rsb->res_name); + /* + * Next stage, do we need to find the master or can + * we get on with the real locking work ? + */ + + retry: + if (rsb->res_nodeid == -1) { + if (get_directory_nodeid(rsb) != our_nodeid()) { + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); + up_write(&rsb->res_lock); + return 0; + } + + error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name, + rsb->res_length, &nodeid); + if (error) { + DLM_ASSERT(error == -EEXIST,); + msleep(500); + dir_error = error; + goto retry; + } + + if (nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = 0; + } else { + clear_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = nodeid; + } + + if (dir_error) { + log_all(ls, "dir lookup retry %x %u", lkb->lkb_id, + nodeid); + } + } + + lkb->lkb_nodeid = rsb->res_nodeid; + up_write(&rsb->res_lock); + + error = dlm_lock_stage2(ls, lkb, rsb, flags); + + return error; +} + +/* + * Locking routine called after we have an RSB, either a copy of a remote one + * or a local one, or perhaps a shiny new one all of our very own + */ + +int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb, + uint32_t flags) +{ + int error = 0; + + DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb);); + + if (rsb->res_nodeid) { + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING); + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT); + } else { + dlm_lock_stage3(lkb); + } + + return error; +} + +/* + * Called on an RSB's master node to do stage2 locking for a remote lock + * request. Returns a proper lkb with rsb ready for lock processing. + * This is analagous to sections of dlm_lock() and dlm_lock_stage1(). + */ + +struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls, + struct dlm_request *freq) +{ + struct dlm_rsb *rsb = NULL, *parent_rsb = NULL; + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL; + int error, namelen; + + if (freq->rr_remparid) { + parent_lkb = find_lock_by_id(ls, freq->rr_remparid); + if (!parent_lkb) + goto fail; + + atomic_inc(&parent_lkb->lkb_childcnt); + parent_rsb = parent_lkb->lkb_resource; + } + + /* + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and + * node actually holding the (non-MSTCPY) lkb. AST address are just + * flags in the master copy. 
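+ * (That is, lkb_astaddr/lkb_bastaddr below hold only the AST_COMP and
+ * AST_BAST bits taken from rr_asts, not callable pointers; the real
+ * callbacks live on the requesting node.)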
+ */ + + lkb = create_lkb(ls); + if (!lkb) + goto fail_dec; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_rqmode = freq->rr_rqmode; + lkb->lkb_parent = parent_lkb; + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP); + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST); + lkb->lkb_nodeid = remote_nodeid; + lkb->lkb_remid = freq->rr_header.rh_lkid; + lkb->lkb_flags = GDLM_LKFLG_MSTCPY; + lkb->lkb_lockqueue_flags = freq->rr_flags; + + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) { + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb); + if (!lkb->lkb_lvbptr) + goto fail_free; + } + + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) { + error = lkb_set_range(ls, lkb, freq->rr_range_start, + freq->rr_range_end); + if (error) + goto fail_free; + } + + /* + * Get the RSB which this lock is for. Create a new RSB if this is a + * new lock on a new resource. We must be the master of any new rsb. + */ + + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; + + error = find_rsb(ls, parent_rsb, freq->rr_name, namelen, MASTER, &rsb); + if (error) + goto fail_free; + + if (!rsb) { + log_debug(ls, "send einval to %u", remote_nodeid); + /* print_name(freq->rr_name, namelen); */ + lkb->lkb_retstatus = -EINVAL; + goto out; + } + + lkb->lkb_resource = rsb; + + log_debug(ls, "(%d) rq %u from %u %x \"%s\"", + lkb->lkb_ownpid, lkb->lkb_rqmode, remote_nodeid, + lkb->lkb_id, rsb->res_name); + + out: + return lkb; + + fail_free: + /* release_lkb handles parent */ + release_lkb(ls, lkb); + parent_lkb = NULL; + + fail_dec: + if (parent_lkb) + atomic_dec(&parent_lkb->lkb_childcnt); + fail: + return NULL; +} + +/* + * The final bit of lock request processing on the master node. Here the lock + * is granted and the completion ast is queued, or the lock is put on the + * waitqueue and blocking asts are sent. + */ + +void dlm_lock_stage3(struct dlm_lkb *lkb) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + + /* + * This is a locally mastered lock on a resource that already exists, + * see if it can be granted or if it must wait. When this function is + * called for a remote lock request (process_cluster_request, + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the + * requesting node at the end of process_cluster_request, not at the + * end of grant_lock. + */ + + down_write(&rsb->res_lock); + + if (can_be_granted(rsb, lkb, TRUE)) { + grant_lock(lkb, 0); + goto out; + } + + /* + * This request is not a conversion, so the lkb didn't exist other than + * for this request and should be freed after EAGAIN is returned in the + * ast. + */ + + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) { + lkb->lkb_retstatus = -EAGAIN; + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST) + send_blocking_asts_all(rsb, lkb); + queue_ast(lkb, AST_COMP | AST_DEL, 0); + goto out; + } + + /* + * The requested lkb must wait. Because the rsb of the requested lkb + * is mastered here, send blocking asts for the lkb's blocking the + * request. 
+ */ + + log_debug2("w %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + lkb->lkb_retstatus = 0; + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING); + + send_blocking_asts(rsb, lkb); + + out: + up_write(&rsb->res_lock); +} + +int dlm_unlock(void *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg) +{ + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace); + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + int ret = -EINVAL; + + if (!ls) { + log_print("dlm_unlock: lkid %x lockspace not found", lkid); + return ret; + } + + lkb = find_lock_by_id(ls, lkid); + if (!lkb) { + log_debug(ls, "unlock %x no id", lkid); + goto out; + } + + /* Can't dequeue a master copy (a remote node's mastered lock) */ + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + log_debug(ls, "(%d) unlock %x lkb_flags %x", + lkb->lkb_ownpid, lkid, lkb->lkb_flags); + goto out; + } + + /* Already waiting for a remote lock operation */ + if (lkb->lkb_lockqueue_state) { + log_debug(ls, "(%d) unlock %x lq%d", + lkb->lkb_ownpid, lkid, lkb->lkb_lockqueue_state); + ret = -EBUSY; + goto out; + } + +#ifdef CONFIG_DLM_STATS + dlm_stats.unlockops++; +#endif + /* Can only cancel WAITING or CONVERTing locks. + * This is just a quick check - it is also checked in unlock_stage2() + * (which may be on the master) under the semaphore. + */ + if ((flags & DLM_LKF_CANCEL) && + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) { + log_debug(ls, "(%d) unlock %x %x %d", + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status); + goto out; + } + + /* "Normal" unlocks must operate on a granted lock */ + if (!(flags & DLM_LKF_CANCEL) && + (lkb->lkb_status != GDLM_LKSTS_GRANTED)) { + log_debug(ls, "(%d) unlock %x %x %d", + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status); + goto out; + } + + if (lkb->lkb_flags & GDLM_LKFLG_DELETED) { + log_debug(ls, "(%d) unlock deleted %x %x %d", + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status); + goto out; + } + + down_write(&ls->ls_unlock_sem); + /* Can't dequeue a lock with sublocks */ + if (atomic_read(&lkb->lkb_childcnt)) { + up_write(&ls->ls_unlock_sem); + ret = -ENOTEMPTY; + goto out; + } + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */ + if (!(flags & DLM_LKF_CANCEL)) + lkb->lkb_flags |= GDLM_LKFLG_DELETED; + up_write(&ls->ls_unlock_sem); + + down_read(&ls->ls_in_recovery); + rsb = find_rsb_to_unlock(ls, lkb); + + log_debug(ls, "(%d) un %x %x %d %d \"%s\"", + lkb->lkb_ownpid, + lkb->lkb_id, + lkb->lkb_flags, + lkb->lkb_nodeid, + rsb->res_nodeid, + rsb->res_name); + + /* Save any new params */ + if (lksb) + lkb->lkb_lksb = lksb; + lkb->lkb_astparam = (long) astarg; + lkb->lkb_lockqueue_flags = flags; + + if (lkb->lkb_nodeid) + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK); + else + ret = dlm_unlock_stage2(lkb, rsb, flags); + up_read(&ls->ls_in_recovery); + + wake_astd(); + + out: + put_lockspace(ls); + return ret; +} + +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags) +{ + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY; + int old_status; + + down_write(&rsb->res_lock); + + /* Can only cancel WAITING or CONVERTing locks */ + if ((flags & DLM_LKF_CANCEL) && + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) { + lkb->lkb_retstatus = -EINVAL; + queue_ast(lkb, AST_COMP, 0); + goto out; + } + + log_debug2("u %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + old_status = 
lkb_dequeue(lkb); + + /* + * Cancelling a conversion + */ + + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) { + /* VMS semantics say we should send blocking ASTs again here */ + send_blocking_asts(rsb, lkb); + + /* Remove from deadlock detection */ + if (lkb->lkb_duetime) + remove_from_deadlockqueue(lkb); + + /* Stick it back on the granted queue */ + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_rqmode = lkb->lkb_grmode; + + /* Was it blocking any other locks? */ + if (first_in_list(lkb, &rsb->res_convertqueue)) + grant_pending_locks(rsb); + + lkb->lkb_retstatus = -DLM_ECANCEL; + queue_ast(lkb, AST_COMP, 0); + goto out; + } + + /* + * If was granted grant any converting or waiting locks + * and save or clear lvb + */ + + if (old_status == GDLM_LKSTS_GRANTED) { + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) { + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr) + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, + DLM_LVB_LEN); + if (flags & DLM_LKF_IVVALBLK) + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN); + } + + grant_pending_locks(rsb); + } else + DLM_ASSERT(0, print_lkb(lkb); print_rsb(rsb);); + + lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK; + + if (!remote) { + queue_ast(lkb, AST_COMP | AST_DEL, 0); + } else { + up_write(&rsb->res_lock); + release_lkb(rsb->res_ls, lkb); + release_rsb(rsb); + goto out2; + } + + out: + up_write(&rsb->res_lock); + out2: + wake_astd(); + return 0; +} + +/* + * Lock conversion + */ + +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb, + uint32_t flags, void *ast, void *astarg, void *bast, + struct dlm_range *range) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + int ret = -EINVAL; + + lkb = find_lock_by_id(ls, lksb->sb_lkid); + if (!lkb) { + goto out; + } + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) { + ret = -EBUSY; + goto out; + } + + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + goto out; + } + + if ((flags & DLM_LKF_QUECVT) && + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) { + goto out; + } + + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) { + goto out; + } + +#ifdef CONFIG_DLM_STATS + dlm_stats.convertops++; +#endif + /* Set up the ranges as appropriate */ + if (range) { + if (range->ra_start > range->ra_end) + goto out; + + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) { + ret = -ENOMEM; + goto out; + } + } + + rsb = lkb->lkb_resource; + down_read(&ls->ls_in_recovery); + + log_debug(ls, "(%d) cv %u %x \"%s\"", lkb->lkb_ownpid, mode, + lkb->lkb_id, rsb->res_name); + + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK; + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED; + + if (flags & DLM_LKF_NODLCKWT) + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT; + lkb->lkb_astaddr = ast; + lkb->lkb_astparam = (long) astarg; + lkb->lkb_bastaddr = bast; + lkb->lkb_rqmode = mode; + lkb->lkb_lockqueue_flags = flags; + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0; + lkb->lkb_lvbptr = lksb->sb_lvbptr; + + if (rsb->res_nodeid) { + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT); + } else { + ret = dlm_convert_stage2(lkb, FALSE); + } + + up_read(&ls->ls_in_recovery); + + wake_astd(); + + out: + return ret; +} + +/* + * For local conversion requests on locally mastered locks this is called + * directly from dlm_lock/convert_lock. This function is also called for + * remote conversion requests of MSTCPY locks (from process_cluster_request). 
+ */ + +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + int ret = 0; + + down_write(&rsb->res_lock); + + if (can_be_granted(rsb, lkb, TRUE)) { + grant_lock(lkb, 0); + grant_pending_locks(rsb); + goto out; + } + + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) { + ret = lkb->lkb_retstatus = -EAGAIN; + if (do_ast) + queue_ast(lkb, AST_COMP, 0); + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST) + send_blocking_asts_all(rsb, lkb); + goto out; + } + + log_debug2("c %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + lkb->lkb_retstatus = 0; + lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); + + /* + * The granted mode may have been reduced to NL by conversion deadlock + * avoidance in can_be_granted(). If so, try to grant other locks. + */ + + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) + grant_pending_locks(rsb); + + send_blocking_asts(rsb, lkb); + + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT)) + add_to_deadlockqueue(lkb); + + out: + up_write(&rsb->res_lock); + return ret; +} + +/* + * Remove lkb from any queue it's on, add it to the granted queue, and queue a + * completion ast. rsb res_lock must be held in write when this is called. + */ + +static void grant_lock(struct dlm_lkb *lkb, int send_remote) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + + if (lkb->lkb_duetime) + remove_from_deadlockqueue(lkb); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + int b; + DLM_ASSERT(lkb->lkb_lvbptr,); + + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); + + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + if (b) + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN); + else + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + } + + if (lkb->lkb_range) { + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START]; + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END]; + } + + log_debug2("g %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + if (lkb->lkb_grmode != lkb->lkb_rqmode) { + lkb->lkb_grmode = lkb->lkb_rqmode; + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + } + lkb->lkb_rqmode = DLM_LOCK_IV; + lkb->lkb_highbast = 0; + lkb->lkb_retstatus = 0; + queue_ast(lkb, AST_COMP, 0); + + /* + * A remote conversion request has been granted, either immediately + * upon being requested or after waiting a bit. In the former case, + * reply_and_grant() is called. In the later case send_remote is 1 and + * remote_grant() is called. + * + * The "send_remote" flag is set only for locks which are granted "out + * of band" - ie by another lock being converted or unlocked. + * + * The second case occurs when this lkb is granted right away as part + * of processing the initial request. In that case, we send a single + * message in reply_and_grant which combines the request reply with the + * grant message. 
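+ *
+ * (In code terms, as below: when send_remote is set remote_grant() is
+ * used; otherwise a still-attached lkb_request means reply_and_grant()
+ * folds the grant into the pending request reply.)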
+ */ + + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) { + if (send_remote) + remote_grant(lkb); + else if (lkb->lkb_request) + reply_and_grant(lkb); + } + +} + +static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *gr; + + list_for_each_entry(gr, head, lkb_statequeue) { + if (gr->lkb_bastaddr && + gr->lkb_highbast < lkb->lkb_rqmode && + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) { + queue_ast(gr, AST_BAST, lkb->lkb_rqmode); + gr->lkb_highbast = lkb->lkb_rqmode; + } + } +} + +/* + * Notify granted locks if they are blocking a newly forced-to-wait lock. + */ + +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb) +{ + send_bast_queue(&rsb->res_grantqueue, lkb); + /* check if the following improves performance */ + /* send_bast_queue(&rsb->res_convertqueue, lkb); */ +} + +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb) +{ + send_bast_queue(&rsb->res_grantqueue, lkb); + send_bast_queue(&rsb->res_convertqueue, lkb); +} + +/* + * Called when a lock has been dequeued. Look for any locks to grant that are + * waiting for conversion or waiting to be granted. + * The rsb res_lock must be held in write when this function is called. + */ + +int grant_pending_locks(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *s; + int8_t high = DLM_LOCK_IV; + + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { + if (can_be_granted(r, lkb, FALSE)) + grant_lock(lkb, 1); + else + high = MAX(lkb->lkb_rqmode, high); + } + + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { + if (lkb->lkb_lockqueue_state) + continue; + + if (can_be_granted(r, lkb, FALSE)) + grant_lock(lkb, 1); + else + high = MAX(lkb->lkb_rqmode, high); + } + + /* + * If there are locks left on the wait/convert queue then send blocking + * ASTs to granted locks that are blocking + * + * FIXME: This might generate some spurious blocking ASTs for range + * locks. + */ + + if (high > DLM_LOCK_IV) { + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, + lkb_statequeue) { + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) && + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) { + queue_ast(lkb, AST_BAST, high); + lkb->lkb_highbast = high; + } + } + } + + return 0; +} + +/* + * Called to cancel a locking operation that failed due to some internal + * reason. + * + * Waiting locks will be removed, converting locks will be reverted to their + * granted status, unlocks will be left where they are. + * + * A completion AST will be delivered to the caller. + */ + +int cancel_lockop(struct dlm_lkb *lkb, int status) +{ + int state = lkb->lkb_lockqueue_state; + uint16_t astflags = AST_COMP; + + lkb->lkb_lockqueue_state = 0; + + switch (state) { + case GDLM_LQSTATE_WAIT_RSB: + astflags |= AST_DEL; + break; + + case GDLM_LQSTATE_WAIT_CONDGRANT: + res_lkb_dequeue(lkb); + astflags |= AST_DEL; + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED); + + /* Remove from deadlock detection */ + if (lkb->lkb_duetime) { + remove_from_deadlockqueue(lkb); + } + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + /* We can leave this. I think.... */ + break; + } + + lkb->lkb_retstatus = status; + queue_ast(lkb, astflags, 0); + + return 0; +} + +/* + * Check for conversion deadlock. 
If a deadlock was found + * return lkb to kill, else return NULL + */ + +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + struct list_head *entry; + + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,); + + /* Work our way up to the head of the queue looking for locks that + * conflict with us */ + + down_read(&rsb->res_lock); + + entry = lkb->lkb_statequeue.prev; + while (entry != &rsb->res_convertqueue) { + struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue); + + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) { + up_read(&rsb->res_lock); + return lkb; + } + entry = entry->prev; + } + up_read(&rsb->res_lock); + + return 0; +} + +/* + * Conversion operation was cancelled by us (not the user). + * ret contains the return code to pass onto the user + */ + +void cancel_conversion(struct dlm_lkb *lkb, int ret) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + + /* Stick it back on the granted queue */ + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_rqmode = lkb->lkb_grmode; + + remove_from_deadlockqueue(lkb); + + lkb->lkb_retstatus = ret; + queue_ast(lkb, AST_COMP, 0); + wake_astd(); +} + +/* + * As new master of the rsb for this lkb, we need to handle these requests + * removed from the lockqueue and originating from local processes: + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT, + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT. + */ + +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state) +{ + struct dlm_rsb *rsb; + + switch (state) { + case GDLM_LQSTATE_WAIT_RSB: + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb, + lkb->lkb_lockqueue_flags, + lkb->lkb_resource->res_name, + lkb->lkb_resource->res_length); + break; + + case GDLM_LQSTATE_WAIT_CONDGRANT: + res_lkb_dequeue(lkb); + dlm_lock_stage3(lkb); + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + rsb = find_rsb_to_unlock(ls, lkb); + dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags); + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + dlm_convert_stage2(lkb, TRUE); + break; + + default: + DLM_ASSERT(0,); + } +} + +static void dump_queue(struct list_head *head, char *qname) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + printk("%s %08x gr %d rq %d flg %x sts %u node %u remid %x " + "lq %d,%x\n", + qname, + lkb->lkb_id, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_lockqueue_state, + lkb->lkb_lockqueue_flags); + } +} + +static void dump_rsb(struct dlm_rsb *rsb) +{ + printk("name \"%s\" flags %lx nodeid %d ref %u\n", + rsb->res_name, rsb->res_flags, rsb->res_nodeid, + atomic_read(&rsb->res_ref)); + + if (!list_empty(&rsb->res_grantqueue)) + dump_queue(&rsb->res_grantqueue, "G"); + + if (!list_empty(&rsb->res_convertqueue)) + dump_queue(&rsb->res_convertqueue, "C"); + + if (!list_empty(&rsb->res_waitqueue)) + dump_queue(&rsb->res_waitqueue, "W"); +} + +void dlm_locks_dump(void) +{ + struct dlm_ls *ls; + struct dlm_rsb *rsb; + struct list_head *head; + int i; + + lowcomms_stop_accept(); + + list_for_each_entry(ls, &lslist, ls_list) { + down_write(&ls->ls_in_recovery); + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + head = &ls->ls_rsbtbl[i].list; + list_for_each_entry(rsb, head, res_hashchain) + dump_rsb(rsb); + } + } +} + diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730 +++ 
linux-patched/cluster/dlm/locking.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKING_DOT_H__ +#define __LOCKING_DOT_H__ + +int dlm_modes_compat(int mode1, int mode2); +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state); +void dlm_lock_stage3(struct dlm_lkb *lkb); +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast); +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags); +int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags); +struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen); +int free_rsb_if_unused(struct dlm_rsb *rsb); +struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace, + struct dlm_request *freq); +int cancel_lockop(struct dlm_lkb *lkb, int status); +int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags); +int grant_pending_locks(struct dlm_rsb *rsb); +void cancel_conversion(struct dlm_lkb *lkb, int ret); +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb); + +#endif /* __LOCKING_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockqueue.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1159 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lockqueue.c + * + * This controls the lock queue, which is where locks + * come when they need to wait for a remote operation + * to complete. + * + * This could also be thought of as the "high-level" comms + * layer. 
+ * + */ + +#include "dlm_internal.h" +#include "lockqueue.h" +#include "dir.h" +#include "locking.h" +#include "lkb.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "reccomms.h" +#include "nodes.h" +#include "lockspace.h" +#include "ast.h" +#include "memory.h" +#include "rsb.h" +#include "queries.h" +#include "util.h" + +static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply); +static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req); + +/* + * format of an entry on the request queue + */ +struct rq_entry { + struct list_head rqe_list; + uint32_t rqe_nodeid; + char rqe_request[1]; +}; + +/* + * Add a new request (if appropriate) to the request queue and send the remote + * request out. - runs in the context of the locking caller + * + * Recovery of a remote_stage request if the remote end fails while the lkb + * is still on the lockqueue: + * + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in + * lockqueue_lkb_mark() at the start of recovery. + * + * o Some lkb's will be rebuilt on new master rsb's during recovery. + * (depends on the type of request, see below). + * + * o At the end of recovery, resend_cluster_requests() looks at these + * LQRESEND lkb's and either: + * + * i) resends the request to the new master for the rsb where the + * request is processed as usual. The lkb remains on the lockqueue until + * the new master replies and we run process_lockqueue_reply(). + * + * ii) if we've become the rsb master, remove the lkb from the lockqueue + * and processes the request locally via process_remastered_lkb(). + * + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue + * and the request should be resent if dest node is failed. + * + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag + * makes send_lkb_queue() skip it). Resend this request to the new master. + * + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue). + * Resend this request to the new master. + * + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue. + * It will be rebuilt on the new master rsb's granted queue. Resend this + * request to the new master. + */ + +int remote_stage(struct dlm_lkb *lkb, int state) +{ + int error; + + lkb->lkb_lockqueue_state = state; + add_to_lockqueue(lkb); + + error = send_cluster_request(lkb, state); + if (error < 0) { + log_error(lkb->lkb_resource->res_ls, "remote_stage error %d %x", + error, lkb->lkb_id); + /* Leave on lockqueue, it will be resent to correct node during + * recovery. */ + } + return 0; +} + +/* + * Requests received while the lockspace is in recovery get added to the + * request queue and processed when recovery is complete. 
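add_to_requestqueue() below keeps a private copy of each deferred message by allocating the rq_entry and the message storage in one block (the old one-element-array idiom, rqe_request[1]). A small userspace sketch of the same pattern, written here with a C99 flexible array member and invented names:

        /* Userspace illustration only: a queue entry that carries its message
         * payload in storage allocated at the tail of the entry itself. */
        #include <stdlib.h>
        #include <string.h>

        struct saved_request {
                struct saved_request *next;
                unsigned int nodeid;
                size_t len;
                unsigned char data[];           /* C99 flexible array member */
        };

        static struct saved_request *save_request(unsigned int nodeid,
                                                  const void *msg, size_t len)
        {
                struct saved_request *e = malloc(sizeof(*e) + len);
                if (!e)
                        return NULL;
                e->next = NULL;
                e->nodeid = nodeid;
                e->len = len;
                memcpy(e->data, msg, len);      /* private copy of the message */
                return e;
        }

        int main(void)
        {
                const char msg[] = "lock request payload";
                struct saved_request *e = save_request(3, msg, sizeof(msg));
                free(e);
                return 0;
        }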
+ */ + +void add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +{ + struct rq_entry *entry; + int length = hd->rh_length; + + if (test_bit(LSFL_REQUEST_WARN, &ls->ls_flags)) + log_error(ls, "request during recovery from %u", nodeid); + + if (in_nodes_gone(ls, nodeid)) + return; + + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); + if (!entry) { + // TODO something better + printk("dlm: add_to_requestqueue: out of memory\n"); + return; + } + + log_debug(ls, "add_to_requestq cmd %d fr %d", hd->rh_cmd, nodeid); + entry->rqe_nodeid = nodeid; + memcpy(entry->rqe_request, hd, length); + + down(&ls->ls_requestqueue_lock); + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue); + up(&ls->ls_requestqueue_lock); +} + +int process_requestqueue(struct dlm_ls *ls) +{ + int error = 0, count = 0; + struct rq_entry *entry; + struct dlm_header *hd; + + log_all(ls, "process held requests"); + + down(&ls->ls_requestqueue_lock); + + for (;;) { + if (list_empty(&ls->ls_requestqueue)) { + up(&ls->ls_requestqueue_lock); + error = 0; + break; + } + + entry = list_entry(ls->ls_requestqueue.next, struct rq_entry, + rqe_list); + up(&ls->ls_requestqueue_lock); + hd = (struct dlm_header *) entry->rqe_request; + + log_debug(ls, "process_requestq cmd %d fr %u", hd->rh_cmd, + entry->rqe_nodeid); + + error = process_cluster_request(entry->rqe_nodeid, hd, TRUE); + if (error == -EINTR) { + /* entry is left on requestqueue */ + log_debug(ls, "process_requestqueue abort eintr"); + break; + } + + down(&ls->ls_requestqueue_lock); + list_del(&entry->rqe_list); + kfree(entry); + count++; + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + log_debug(ls, "process_requestqueue abort ls_run"); + up(&ls->ls_requestqueue_lock); + error = -EINTR; + break; + } + } + + log_all(ls, "processed %d requests", count); + return error; +} + +void wait_requestqueue(struct dlm_ls *ls) +{ + for (;;) { + down(&ls->ls_requestqueue_lock); + if (list_empty(&ls->ls_requestqueue)) + break; + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) + break; + up(&ls->ls_requestqueue_lock); + schedule(); + } + up(&ls->ls_requestqueue_lock); +} + +/* + * Resdir requests (lookup or remove) and replies from before recovery are + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now + * gone are also invalid. + */ + +void purge_requestqueue(struct dlm_ls *ls) +{ + int count = 0; + struct rq_entry *entry, *safe; + struct dlm_header *hd; + struct dlm_lkb *lkb; + + log_all(ls, "purge requests"); + + down(&ls->ls_requestqueue_lock); + + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) { + hd = (struct dlm_header *) entry->rqe_request; + + if (hd->rh_cmd == GDLM_REMCMD_REM_RESDATA || + hd->rh_cmd == GDLM_REMCMD_LOOKUP || + in_nodes_gone(ls, entry->rqe_nodeid)) { + + list_del(&entry->rqe_list); + kfree(entry); + count++; + + } else if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY) { + + /* + * Replies to resdir lookups are invalid and must be + * purged. The lookup requests are marked in + * lockqueue_lkb_mark and will be resent in + * resend_cluster_requests. The only way to check if + * this is a lookup reply is to look at the + * lockqueue_state of the lkb. + */ + + lkb = find_lock_by_id(ls, hd->rh_lkid); + DLM_ASSERT(lkb,); + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) { + list_del(&entry->rqe_list); + kfree(entry); + count++; + } + } + } + up(&ls->ls_requestqueue_lock); + + log_all(ls, "purged %d requests", count); +} + +/* + * Check if there's a reply for the given lkid in the requestqueue. 
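Note that the drain loop in process_requestqueue() above releases ls_requestqueue_lock around each call to process_cluster_request() and only unlinks the entry afterwards, so the handler never runs under the queue lock. A rough single-threaded userspace sketch of that peek/process/unlink shape, with a pthread mutex standing in for the semaphore and all names invented:

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct entry { struct entry *next; int value; };

        static struct entry *queue_head;
        static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

        static void handle(struct entry *e)     /* stand-in for the real handler */
        {
                printf("processing %d\n", e->value);
        }

        /* Drain the queue, dropping the lock around the (possibly slow) handler;
         * the entry is unlinked only after it has been processed. */
        static void drain(void)
        {
                for (;;) {
                        pthread_mutex_lock(&queue_lock);
                        struct entry *e = queue_head;
                        if (!e) {
                                pthread_mutex_unlock(&queue_lock);
                                break;
                        }
                        pthread_mutex_unlock(&queue_lock);

                        handle(e);              /* queue lock not held here */

                        pthread_mutex_lock(&queue_lock);
                        queue_head = e->next;   /* single consumer in this sketch */
                        pthread_mutex_unlock(&queue_lock);
                        free(e);
                }
        }

        int main(void)
        {
                for (int i = 3; i > 0; i--) {
                        struct entry *e = malloc(sizeof(*e));
                        if (!e)
                                return 1;
                        e->value = i;
                        e->next = queue_head;
                        queue_head = e;
                }
                drain();
                return 0;
        }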
+ */ + +int reply_in_requestqueue(struct dlm_ls *ls, int lkid) +{ + int rv = FALSE; + struct rq_entry *entry; + struct dlm_header *hd; + + down(&ls->ls_requestqueue_lock); + + list_for_each_entry(entry, &ls->ls_requestqueue, rqe_list) { + hd = (struct dlm_header *) entry->rqe_request; + if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY && hd->rh_lkid == lkid){ + log_debug(ls, "reply_in_requestq cmd %d fr %d id %x", + hd->rh_cmd, entry->rqe_nodeid, lkid); + rv = TRUE; + break; + } + } + up(&ls->ls_requestqueue_lock); + + return rv; +} + +void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src) +{ + if (!*lvbptr) + *lvbptr = allocate_lvb(ls); + if (*lvbptr) + memcpy(*lvbptr, src, DLM_LVB_LEN); +} + +/* + * Process a lockqueue LKB after it has had it's remote processing complete and + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread + * on the machine that requested the lock. + */ + +static void process_lockqueue_reply(struct dlm_lkb *lkb, + struct dlm_reply *reply, + uint32_t nodeid) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + struct dlm_ls *ls = rsb->res_ls; + int oldstate, state = lkb->lkb_lockqueue_state; + + if (state) + remove_from_lockqueue(lkb); + + switch (state) { + case GDLM_LQSTATE_WAIT_RSB: + + if (reply->rl_status) { + DLM_ASSERT(reply->rl_status == -EEXIST,); + if (rsb->res_nodeid == -1) { + msleep(500); + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); + break; + } + } else { + if (reply->rl_nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = 0; + } else { + clear_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = reply->rl_nodeid; + } + } + + log_debug(ls, "(%d) lu rep %x fr %u %u", lkb->lkb_ownpid, + lkb->lkb_id, nodeid, + rsb->res_nodeid); + + lkb->lkb_nodeid = rsb->res_nodeid; + dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags); + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + case GDLM_LQSTATE_WAIT_CONDGRANT: + + /* + * the destination wasn't the master + * this implies the request was a CONDGRANT + */ + + if (reply->rl_status == -EINVAL) { + int master_nodeid; + + DLM_ASSERT(state == GDLM_LQSTATE_WAIT_CONDGRANT, ); + + log_debug(ls, "(%d) req reply einval %x fr %d r %d %s", + lkb->lkb_ownpid, lkb->lkb_id, nodeid, + rsb->res_nodeid, rsb->res_name); + + lkb_dequeue(lkb); + + if (rsb->res_nodeid == lkb->lkb_nodeid || rsb->res_nodeid == -1){ + /* + * We need to re-lookup the master and resend our + * request to it. + */ + + lkb->lkb_nodeid = -1; + rsb->res_nodeid = -1; + + if (get_directory_nodeid(rsb) != our_nodeid()) + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); + else { + int error = dlm_dir_lookup(ls, our_nodeid(), + rsb->res_name, + rsb->res_length, + &master_nodeid); + if (error == -EEXIST) { + /* don't expect this will happen */ + log_all(ls, "EEXIST %x", lkb->lkb_id); + print_lkb(lkb); + print_rsb(rsb); + } + + if (master_nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + master_nodeid = 0; + } else + clear_bit(RESFL_MASTER,&rsb->res_flags); + + rsb->res_nodeid = master_nodeid; + lkb->lkb_nodeid = master_nodeid; + + dlm_lock_stage2(ls, lkb, rsb, + lkb->lkb_lockqueue_flags); + } + } else { + /* + * Another request on this rsb has since found + * the master, we'll use that one although it too + * may be invalid requiring us to retry again. 
+ */ + + lkb->lkb_nodeid = rsb->res_nodeid; + dlm_lock_stage2(ls, lkb, rsb, + lkb->lkb_lockqueue_flags); + } + + break; + } + + + /* + * After a remote lock/conversion/grant request we put the lock + * on the right queue and send an AST if appropriate. Any lock + * shuffling (eg newly granted locks because this one was + * converted downwards) will be dealt with in seperate messages + * (which may be in the same network message) + */ + + if (!lkb->lkb_remid) + lkb->lkb_remid = reply->rl_lkid; + + /* + * The remote request failed (we assume because of NOQUEUE). + * If this is a new request (non-conv) the lkb was created just + * for it so the lkb should be freed. If this was a + * conversion, the lkb already existed so we should put it back + * on the grant queue. + */ + + if (reply->rl_status != 0) { + DLM_ASSERT(reply->rl_status == -EAGAIN,); + + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) { + res_lkb_dequeue(lkb); + lkb->lkb_retstatus = reply->rl_status; + queue_ast(lkb, AST_COMP | AST_DEL, 0); + } else { + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_retstatus = reply->rl_status; + queue_ast(lkb, AST_COMP, 0); + } + break; + } + + /* + * The remote request was successful in granting the request or + * queuing it to be granted later. Add the lkb to the + * appropriate rsb queue. + */ + + switch (reply->rl_lockstate) { + case GDLM_LKSTS_GRANTED: + + /* Compact version of grant_lock(). */ + + down_write(&rsb->res_lock); + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + memcpy(lkb->lkb_lvbptr, reply->rl_lvb, + DLM_LVB_LEN); + + lkb->lkb_grmode = lkb->lkb_rqmode; + lkb->lkb_rqmode = DLM_LOCK_IV; + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + + if (lkb->lkb_range) { + lkb->lkb_range[GR_RANGE_START] = + lkb->lkb_range[RQ_RANGE_START]; + lkb->lkb_range[GR_RANGE_END] = + lkb->lkb_range[RQ_RANGE_END]; + } + up_write(&rsb->res_lock); + + lkb->lkb_retstatus = 0; + queue_ast(lkb, AST_COMP, 0); + break; + + case GDLM_LKSTS_WAITING: + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING); + else + log_error(ls, "wait reply for granted %x %u", + lkb->lkb_id, lkb->lkb_nodeid); + break; + + case GDLM_LKSTS_CONVERT: + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); + else + log_error(ls, "convert reply for granted %x %u", + lkb->lkb_id, lkb->lkb_nodeid); + break; + + default: + log_error(ls, "process_lockqueue_reply state %d", + reply->rl_lockstate); + } + + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + + /* + * Unlocks should never fail. Update local lock info. This + * always sends completion AST with status in lksb + */ + + DLM_ASSERT(reply->rl_status == 0,); + oldstate = res_lkb_dequeue(lkb); + + /* Differentiate between unlocks and conversion cancellations */ + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL) { + if (oldstate == GDLM_LKSTS_CONVERT) { + res_lkb_enqueue(lkb->lkb_resource, lkb, + GDLM_LKSTS_GRANTED); + lkb->lkb_retstatus = -DLM_ECANCEL; + queue_ast(lkb, AST_COMP, 0); + } else + log_error(ls, "cancel state %d", oldstate); + } else { + DLM_ASSERT(oldstate == GDLM_LKSTS_GRANTED, + print_lkb(lkb);); + + lkb->lkb_retstatus = -DLM_EUNLOCK; + queue_ast(lkb, AST_COMP | AST_DEL, 0); + } + break; + + default: + log_error(ls, "process_lockqueue_reply id %x state %d", + lkb->lkb_id, state); + } +} + +/* + * Tell a remote node to grant a lock. This happens when we are the master + * copy for a lock that is actually held on a remote node. The remote end is + * also responsible for sending the completion AST. 
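remote_grant() below, like the other senders in this file, builds its message directly in space handed out by lowcomms_get_buffer() and then passes the filled header to midcomms_send_buffer(). A loose userspace sketch of that reserve-then-flush idea against a flat buffer; the buffer size, struct fields and function names here are made up and are not the real lowcomms API:

        #include <stdio.h>
        #include <string.h>

        #define OUTBUF_SIZE 4096

        static unsigned char outbuf[OUTBUF_SIZE];
        static size_t outbuf_used;

        /* Reserve len bytes in the outgoing buffer; the caller fills them in place. */
        static void *reserve(size_t len)
        {
                if (outbuf_used + len > OUTBUF_SIZE)
                        return NULL;            /* real code would flush or block */
                void *p = outbuf + outbuf_used;
                outbuf_used += len;
                return p;
        }

        /* "Flush" here just means the reserved bytes are handed to the transport. */
        static void flush(void)
        {
                printf("flushing %zu bytes\n", outbuf_used);
                outbuf_used = 0;
        }

        struct msg_header { unsigned char cmd; unsigned int lkid; };  /* invented */

        int main(void)
        {
                struct msg_header *h = reserve(sizeof(*h));
                if (!h)
                        return 1;
                h->cmd = 1;                     /* message built in place */
                h->lkid = 0x42;
                flush();
                return 0;
        }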
+ */ + +void remote_grant(struct dlm_lkb *lkb) +{ + struct writequeue_entry *e; + struct dlm_request *req; + + // TODO Error handling + e = lowcomms_get_buffer(lkb->lkb_nodeid, + sizeof(struct dlm_request), + lkb->lkb_resource->res_ls->ls_allocation, + (char **) &req); + if (!e) + return; + + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT; + req->rr_header.rh_length = sizeof(struct dlm_request); + req->rr_header.rh_flags = 0; + req->rr_header.rh_lkid = lkb->lkb_id; + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id; + req->rr_remlkid = lkb->lkb_remid; + req->rr_flags = 0; + + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) { + /* This is a confusing non-standard use of rr_flags which is + * usually used to pass lockqueue_flags. */ + req->rr_flags |= GDLM_LKFLG_DEMOTED; + } + + add_request_lvb(lkb, req); + midcomms_send_buffer(&req->rr_header, e); +} + +void reply_and_grant(struct dlm_lkb *lkb) +{ + struct dlm_request *req = lkb->lkb_request; + struct dlm_reply *reply; + struct writequeue_entry *e; + + // TODO Error handling + e = lowcomms_get_buffer(lkb->lkb_nodeid, + sizeof(struct dlm_reply), + lkb->lkb_resource->res_ls->ls_allocation, + (char **) &reply); + if (!e) + return; + + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY; + reply->rl_header.rh_flags = 0; + reply->rl_header.rh_length = sizeof(struct dlm_reply); + reply->rl_header.rh_lkid = req->rr_header.rh_lkid; + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace; + + reply->rl_status = lkb->lkb_retstatus; + reply->rl_lockstate = lkb->lkb_status; + reply->rl_lkid = lkb->lkb_id; + + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),); + + lkb->lkb_request = NULL; + + add_reply_lvb(lkb, reply); + midcomms_send_buffer(&reply->rl_header, e); +} + +/* + * Request removal of a dead entry in the resource directory + */ + +void remote_remove_direntry(struct dlm_ls *ls, int nodeid, char *name, + int namelen) +{ + struct writequeue_entry *e; + struct dlm_request *req; + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + struct dlm_rcom *rc = allocate_rcom_buffer(ls); + + memcpy(rc->rc_buf, name, namelen); + rc->rc_datalen = namelen; + + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0); + + free_rcom_buffer(rc); + return; + } + // TODO Error handling + e = lowcomms_get_buffer(nodeid, + sizeof(struct dlm_request) + namelen - 1, + ls->ls_allocation, (char **) &req); + if (!e) + return; + + memset(req, 0, sizeof(struct dlm_request) + namelen - 1); + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA; + req->rr_header.rh_length = + sizeof(struct dlm_request) + namelen - 1; + req->rr_header.rh_flags = 0; + req->rr_header.rh_lkid = 0; + req->rr_header.rh_lockspace = ls->ls_global_id; + req->rr_remlkid = 0; + memcpy(req->rr_name, name, namelen); + + midcomms_send_buffer(&req->rr_header, e); +} + +/* + * Send remote cluster request to directory or master node before the request + * is put on the lock queue. Runs in the context of the locking caller. 
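send_cluster_request() below routes WAIT_RSB lookups to the resource directory node (get_directory_nodeid()) and everything else to the known master. The directory node is a function of the resource name; as a toy illustration only (the hash and the membership array below are invented, not the scheme this patch uses), such a mapping can look like:

        #include <stdio.h>
        #include <stdint.h>
        #include <string.h>

        /* Toy name hash (djb2); the real DLM uses its own hash, not shown here. */
        static uint32_t name_hash(const char *name, size_t len)
        {
                uint32_t h = 5381;
                for (size_t i = 0; i < len; i++)
                        h = h * 33 + (unsigned char)name[i];
                return h;
        }

        /* Map a resource name onto one of the current member nodes. */
        static uint32_t directory_nodeid(const char *name, size_t len,
                                         const uint32_t *nodes, int num_nodes)
        {
                return nodes[name_hash(name, len) % num_nodes];
        }

        int main(void)
        {
                uint32_t nodes[] = { 1, 2, 5 };         /* current membership */
                const char *res = "my-resource";
                printf("directory node for \"%s\": %u\n", res,
                       (unsigned)directory_nodeid(res, strlen(res), nodes, 3));
                return 0;
        }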
+ */ + +int send_cluster_request(struct dlm_lkb *lkb, int state) +{ + uint32_t target_nodeid; + struct dlm_rsb *rsb = lkb->lkb_resource; + struct dlm_ls *ls = rsb->res_ls; + struct dlm_request *req; + struct writequeue_entry *e; + + if (state == GDLM_LQSTATE_WAIT_RSB) + target_nodeid = get_directory_nodeid(rsb); + else + target_nodeid = lkb->lkb_nodeid; + + /* during recovery it's valid for target_nodeid to equal our own; + resend_cluster_requests does this to get requests back on track */ + + DLM_ASSERT(target_nodeid && target_nodeid != -1, + print_lkb(lkb); + print_rsb(rsb); + printk("target_nodeid %u\n", target_nodeid);); + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + /* this may happen when called by resend_cluster_request */ + log_error(ls, "send_cluster_request to %u state %d recovery", + target_nodeid, state); + } + + e = lowcomms_get_buffer(target_nodeid, + sizeof(struct dlm_request) + + rsb->res_length - 1, ls->ls_allocation, + (char **) &req); + if (!e) + return -ENOBUFS; + memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1); + + /* Common stuff, some are just defaults */ + + if (lkb->lkb_bastaddr) + req->rr_asts = AST_BAST; + if (lkb->lkb_astaddr) + req->rr_asts |= AST_COMP; + if (lkb->lkb_parent) + req->rr_remparid = lkb->lkb_parent->lkb_remid; + + req->rr_flags = lkb->lkb_lockqueue_flags; + req->rr_rqmode = lkb->lkb_rqmode; + req->rr_remlkid = lkb->lkb_remid; + req->rr_pid = lkb->lkb_ownpid; + req->rr_header.rh_length = + sizeof(struct dlm_request) + rsb->res_length - 1; + req->rr_header.rh_flags = 0; + req->rr_header.rh_lkid = lkb->lkb_id; + req->rr_header.rh_lockspace = ls->ls_global_id; + + switch (state) { + + case GDLM_LQSTATE_WAIT_RSB: + + DLM_ASSERT(!lkb->lkb_parent, + print_lkb(lkb); + print_rsb(rsb);); + + log_debug(ls, "(%d) send lu %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP; + memcpy(req->rr_name, rsb->res_name, rsb->res_length); + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb);); + + log_debug(ls, "(%d) send cv %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST; + if (lkb->lkb_range) { + req->rr_flags |= GDLM_LKFLG_RANGE; + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START]; + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END]; + } + break; + + case GDLM_LQSTATE_WAIT_CONDGRANT: + + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb);); + + log_debug(ls, "(%d) send rq %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST; + memcpy(req->rr_name, rsb->res_name, rsb->res_length); + if (lkb->lkb_range) { + req->rr_flags |= GDLM_LKFLG_RANGE; + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START]; + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END]; + } + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + + log_debug(ls, "(%d) send un %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST; + break; + + default: + DLM_ASSERT(0, printk("Unknown cluster request\n");); + } + + add_request_lvb(lkb, req); + midcomms_send_buffer(&req->rr_header, e); + + return 0; +} + +/* + * We got a request from another cluster node, process it and return an info + * structure with the lock state/LVB etc as required. Executes in the DLM's + * recvd thread. 
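process_cluster_request() below reads the common header, resolves the lockspace from rh_lockspace and then switches on rh_cmd. A stripped-down userspace sketch of that decode-and-dispatch shape; the field names, command values and handlers are placeholders rather than the real wire format:

        #include <stdio.h>
        #include <stdint.h>

        /* Placeholder header; the real struct dlm_header has more fields. */
        struct hdr {
                uint8_t  cmd;
                uint16_t length;
                uint32_t lkid;
                uint32_t lockspace;
        };

        enum { CMD_LOOKUP = 1, CMD_LOCKREQUEST, CMD_UNLOCKREQUEST, CMD_LOCKREPLY };

        static int dispatch(int nodeid, const struct hdr *h)
        {
                switch (h->cmd) {
                case CMD_LOOKUP:
                        printf("lookup from node %d\n", nodeid);
                        break;
                case CMD_LOCKREQUEST:
                        printf("lock request %x from node %d\n", (unsigned)h->lkid, nodeid);
                        break;
                case CMD_UNLOCKREQUEST:
                        printf("unlock request %x from node %d\n", (unsigned)h->lkid, nodeid);
                        break;
                case CMD_LOCKREPLY:
                        printf("reply for our lkid %x\n", (unsigned)h->lkid);
                        break;
                default:
                        fprintf(stderr, "unknown command %d\n", h->cmd);
                        return -1;
                }
                return 0;
        }

        int main(void)
        {
                struct hdr h = { CMD_LOCKREQUEST, sizeof(h), 0x101, 7 };
                return dispatch(2, &h);
        }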
+ */ + +int process_cluster_request(int nodeid, struct dlm_header *req, int recovery) +{ + struct dlm_ls *lspace; + struct dlm_lkb *lkb = NULL; + struct dlm_rsb *rsb; + int send_reply = 0, status = 0, namelen; + struct dlm_request *freq = (struct dlm_request *) req; + struct dlm_reply *rp = (struct dlm_reply *) req; + struct dlm_reply reply; + + lspace = find_lockspace_by_global_id(req->rh_lockspace); + + if (!lspace) { + log_print("process_cluster_request invalid lockspace %x " + "from %d req %u", req->rh_lockspace, nodeid, + req->rh_cmd); + return -EINVAL; + } + + /* wait for recoverd to drain requestqueue */ + if (!recovery) + wait_requestqueue(lspace); + + /* + * If we're in recovery then queue the request for later. Otherwise, + * we still need to get the "in_recovery" lock to make sure the + * recovery itself doesn't start until we are done. + */ + retry: + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) { + if (!recovery) + add_to_requestqueue(lspace, nodeid, req); + status = -EINTR; + goto out; + } + if (!down_read_trylock(&lspace->ls_in_recovery)) { + schedule(); + goto retry; + } + + + /* + * Process the request. + */ + + switch (req->rh_cmd) { + + case GDLM_REMCMD_LOOKUP: + { + uint32_t dir_nodeid, r_nodeid; + int status; + + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; + + dir_nodeid = name_to_directory_nodeid(lspace, + freq->rr_name, + namelen); + if (dir_nodeid != our_nodeid()) + log_debug(lspace, "ignoring directory lookup"); + + status = dlm_dir_lookup(lspace, nodeid, freq->rr_name, + namelen, &r_nodeid); + reply.rl_status = status; + reply.rl_lockstate = 0; + reply.rl_nodeid = r_nodeid; + } + send_reply = 1; + break; + + case GDLM_REMCMD_REM_RESDATA: + + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; + dlm_dir_remove(lspace, nodeid, freq->rr_name, namelen); + break; + + case GDLM_REMCMD_LOCKREQUEST: + + lkb = remote_stage2(nodeid, lspace, freq); + if (lkb) { + lkb->lkb_request = freq; + lkb->lkb_ownpid = freq->rr_pid; + if (lkb->lkb_retstatus != -EINVAL) + dlm_lock_stage3(lkb); + + /* + * If the request was granted in lock_stage3, then a + * reply message was already sent in combination with + * the grant message and lkb_request is NULL. 
+ */ + + if (lkb->lkb_request) { + lkb->lkb_request = NULL; + send_reply = 1; + reply.rl_status = lkb->lkb_retstatus; + reply.rl_lockstate = lkb->lkb_status; + reply.rl_lkid = lkb->lkb_id; + + /* + * If the request could not be granted and the + * user won't wait, then free up the LKB + */ + + if (lkb->lkb_retstatus == -EAGAIN) { + rsb = lkb->lkb_resource; + release_lkb(lspace, lkb); + release_rsb(rsb); + lkb = NULL; + } + else if (lkb->lkb_retstatus == -EINVAL) { + release_lkb(lspace, lkb); + lkb = NULL; + } + } + } else { + reply.rl_status = -ENOMEM; + send_reply = 1; + } + break; + + case GDLM_REMCMD_CONVREQUEST: + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + rsb = lkb->lkb_resource; + + DLM_ASSERT(rsb, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(!rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + /* Update orphan lock status */ + if (freq->rr_flags & DLM_LKF_ORPHAN) { + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN; + } + + lkb->lkb_rqmode = freq->rr_rqmode; + lkb->lkb_lockqueue_flags = freq->rr_flags; + lkb->lkb_request = freq; + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED; + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK || + freq->rr_flags & DLM_LKF_VALBLK) { + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, + freq->rr_lvb); + } + + if (freq->rr_flags & GDLM_LKFLG_RANGE) { + if (lkb_set_range(lspace, lkb, freq->rr_range_start, + freq->rr_range_end)) { + reply.rl_status = -ENOMEM; + send_reply = 1; + goto out; + } + } + + log_debug(lspace, "(%d) cv %u from %u %x \"%s\"", + lkb->lkb_ownpid, lkb->lkb_rqmode, nodeid, + lkb->lkb_id, rsb->res_name); + + dlm_convert_stage2(lkb, FALSE); + + /* + * If the conv request was granted in stage2, then a reply + * message was already sent in combination with the grant + * message. + */ + + if (lkb->lkb_request) { + lkb->lkb_request = NULL; + send_reply = 1; + reply.rl_status = lkb->lkb_retstatus; + reply.rl_lockstate = lkb->lkb_status; + reply.rl_lkid = lkb->lkb_id; + } + break; + + case GDLM_REMCMD_LOCKREPLY: + + lkb = find_lock_by_id(lspace, req->rh_lkid); + + DLM_ASSERT(lkb, + print_reply(rp); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY), + print_lkb(lkb); + print_reply(rp); + printk("nodeid %u\n", nodeid);); + + process_lockqueue_reply(lkb, rp, nodeid); + break; + + case GDLM_REMCMD_LOCKGRANT: + + /* + * Remote lock has been granted asynchronously. Do a compact + * version of what grant_lock() does. 
+ */ + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + rsb = lkb->lkb_resource; + + DLM_ASSERT(rsb, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY), + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + if (lkb->lkb_lockqueue_state) { + log_debug(rsb->res_ls, "grant lock on lockqueue %d", + lkb->lkb_lockqueue_state); + + /* Don't grant locks that are waiting for an unlock */ + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) + return 0; + + print_lkb(lkb); + print_request(freq); + remove_from_lockqueue(lkb); + if (!lkb->lkb_remid) + lkb->lkb_remid = req->rh_lkid; + } + + down_write(&rsb->res_lock); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, freq->rr_lvb); + + lkb->lkb_grmode = lkb->lkb_rqmode; + lkb->lkb_rqmode = DLM_LOCK_IV; + + if (lkb->lkb_range) { + lkb->lkb_range[GR_RANGE_START] = + lkb->lkb_range[RQ_RANGE_START]; + lkb->lkb_range[GR_RANGE_END] = + lkb->lkb_range[RQ_RANGE_END]; + } + + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + up_write(&rsb->res_lock); + + if (freq->rr_flags & GDLM_LKFLG_DEMOTED) + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED; + + lkb->lkb_retstatus = 0; + queue_ast(lkb, AST_COMP, 0); + break; + + case GDLM_REMCMD_SENDBAST: + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + if (lkb->lkb_status == GDLM_LKSTS_GRANTED) + queue_ast(lkb, AST_BAST, freq->rr_rqmode); + break; + + case GDLM_REMCMD_SENDCAST: + + /* This is only used for some error completion ASTs */ + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + /* Return the lock to granted status */ + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_retstatus = freq->rr_status; + queue_ast(lkb, AST_COMP, 0); + break; + + case GDLM_REMCMD_UNLOCKREQUEST: + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_nodeid == nodeid, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + rsb = find_rsb_to_unlock(lspace, lkb); + + log_debug(lspace, "(%d) un from %u %x \"%s\"", lkb->lkb_ownpid, + nodeid, lkb->lkb_id, rsb->res_name); + + reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags); + send_reply = 1; + break; + + case GDLM_REMCMD_QUERY: + remote_query(nodeid, lspace, req); + break; + + case GDLM_REMCMD_QUERYREPLY: + remote_query_reply(nodeid, lspace, req); + break; + + default: + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd); + } + + up_read(&lspace->ls_in_recovery); + + out: + if (send_reply) { + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY; + reply.rl_header.rh_flags = 0; + reply.rl_header.rh_length = sizeof(reply); + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid; + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace; + + status = midcomms_send_message(nodeid, &reply.rl_header, + GFP_KERNEL); + } + + wake_astd(); + put_lockspace(lspace); + return status; +} + 
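The add_reply_lvb()/add_request_lvb() helpers that follow piggyback the lock value block on messages whenever GDLM_LKFLG_VALBLK is set; elsewhere in the patch the LVB is copied to the holder when a lock is granted and written back to the resource when a PW or EX lock is released. A simplified userspace sketch of that discipline (the real grant path consults an lvb-operations table keyed by old and new mode; the 32-byte size and the names here are assumptions):

        #include <stdio.h>
        #include <string.h>

        #define LVB_LEN 32      /* assumed fixed lock value block size */

        enum { NL, CR, CW, PR, PW, EX };

        struct resource { char lvb[LVB_LEN]; };
        struct lock     { int grmode; char lvb[LVB_LEN]; };

        /* On grant, the holder receives a copy of the current value block. */
        static void lvb_on_grant(struct resource *r, struct lock *lk)
        {
                memcpy(lk->lvb, r->lvb, LVB_LEN);
        }

        /* On release of a PW/EX lock, the holder's value block is written back. */
        static void lvb_on_release(struct resource *r, const struct lock *lk)
        {
                if (lk->grmode >= PW)
                        memcpy(r->lvb, lk->lvb, LVB_LEN);
        }

        int main(void)
        {
                struct resource r = { .lvb = "initial" };
                struct lock lk = { .grmode = EX };

                lvb_on_grant(&r, &lk);
                snprintf(lk.lvb, LVB_LEN, "updated by EX holder");
                lvb_on_release(&r, &lk);
                printf("resource lvb: %s\n", r.lvb);
                return 0;
        }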
+static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply) +{ + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN); +} + +static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req) +{ + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN); +} diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockqueue.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,29 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKQUEUE_DOT_H__ +#define __LOCKQUEUE_DOT_H__ + +void remote_grant(struct dlm_lkb * lkb); +void reply_and_grant(struct dlm_lkb * lkb); +int remote_stage(struct dlm_lkb * lkb, int state); +int process_cluster_request(int csid, struct dlm_header *req, int recovery); +int send_cluster_request(struct dlm_lkb * lkb, int state); +void purge_requestqueue(struct dlm_ls * ls); +int process_requestqueue(struct dlm_ls * ls); +int reply_in_requestqueue(struct dlm_ls * ls, int lkid); +void remote_remove_direntry(struct dlm_ls * ls, int nodeid, char *name, + int namelen); +void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src); + +#endif /* __LOCKQUEUE_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockspace.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,715 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include + +#include "dlm_internal.h" +#include "recoverd.h" +#include "ast.h" +#include "lkb.h" +#include "nodes.h" +#include "dir.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "lockspace.h" +#include "device.h" + +#define GDST_NONE (0) +#define GDST_RUNNING (1) + +static int dlmstate; +static int dlmcount; +static struct semaphore dlmstate_lock; +struct list_head lslist; +spinlock_t lslist_lock; +struct kcl_service_ops ls_ops; + +static int new_lockspace(char *name, int namelen, void **lockspace, int flags); + + +void dlm_lockspace_init(void) +{ + dlmstate = GDST_NONE; + dlmcount = 0; + init_MUTEX(&dlmstate_lock); + INIT_LIST_HEAD(&lslist); + spin_lock_init(&lslist_lock); +} + +struct dlm_ls *find_lockspace_by_name(char *name, int namelen) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_namelen == namelen && + memcmp(ls->ls_name, name, namelen) == 0) + goto out; + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *find_lockspace_by_global_id(uint32_t id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_global_id == id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *find_lockspace_by_local_id(void *id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_local_id == (uint32_t)(long)id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +/* must be called with lslist_lock held */ +void hold_lockspace(struct dlm_ls *ls) +{ + ls->ls_count++; +} + +void put_lockspace(struct dlm_ls *ls) +{ + spin_lock(&lslist_lock); + ls->ls_count--; + spin_unlock(&lslist_lock); +} + +static void remove_lockspace(struct dlm_ls *ls) +{ + for (;;) { + spin_lock(&lslist_lock); + if (ls->ls_count == 0) { + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + return; + } + spin_unlock(&lslist_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } +} + +/* + * Called from dlm_init. These are the general threads which are not + * lockspace-specific and work for all dlm lockspaces. + */ + +static int threads_start(void) +{ + int error; + + /* Thread which process lock requests for all ls's */ + error = astd_start(); + if (error) { + log_print("cannot start ast thread %d", error); + goto fail; + } + + /* Thread for sending/receiving messages for all ls's */ + error = lowcomms_start(); + if (error) { + log_print("cannot start lowcomms %d", error); + goto astd_fail; + } + + return 0; + + astd_fail: + astd_stop(); + + fail: + return error; +} + +static void threads_stop(void) +{ + lowcomms_stop(); + astd_stop(); +} + +static int init_internal(void) +{ + int error = 0; + + if (dlmstate == GDST_RUNNING) + dlmcount++; + else { + error = threads_start(); + if (error) + goto out; + + dlmstate = GDST_RUNNING; + dlmcount = 1; + } + + out: + return error; +} + +/* + * Called after dlm module is loaded and before any lockspaces are created. + * Starts and initializes global threads and structures. These global entities + * are shared by and independent of all lockspaces. 
+ * + * There should be a dlm-specific user command which a person can run which + * calls this function. If a user hasn't run that command and something + * creates a new lockspace, this is called first. + * + * This also starts the default lockspace. + */ + +int dlm_init(void) +{ + int error; + + down(&dlmstate_lock); + error = init_internal(); + up(&dlmstate_lock); + + return error; +} + +int dlm_release(void) +{ + int error = 0; + + down(&dlmstate_lock); + + if (dlmstate == GDST_NONE) + goto out; + + if (dlmcount) + dlmcount--; + + if (dlmcount) + goto out; + + spin_lock(&lslist_lock); + if (!list_empty(&lslist)) { + spin_unlock(&lslist_lock); + log_print("cannot stop threads, lockspaces still exist"); + goto out; + } + spin_unlock(&lslist_lock); + + threads_stop(); + dlmstate = GDST_NONE; + + out: + up(&dlmstate_lock); + + return error; +} + +struct dlm_ls *allocate_ls(int namelen) +{ + struct dlm_ls *ls; + + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); + if (ls) + memset(ls, 0, sizeof(struct dlm_ls) + namelen); + + return ls; +} + +static int new_lockspace(char *name, int namelen, void **lockspace, int flags) +{ + struct dlm_ls *ls; + int i, size, error = -ENOMEM; + uint32_t local_id = 0; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + if (namelen > MAX_SERVICE_NAME_LEN) + return -EINVAL; + + ls = find_lockspace_by_name(name, namelen); + if (ls) { + *lockspace = (void *)(long) ls->ls_local_id; + return -EEXIST; + } + + /* + * Initialize ls fields + */ + + ls = allocate_ls(namelen); + if (!ls) + goto out; + + memcpy(ls->ls_name, name, namelen); + ls->ls_namelen = namelen; + + ls->ls_allocation = GFP_KERNEL; + ls->ls_count = 0; + ls->ls_flags = 0; + + size = dlm_config.rsbtbl_size; + ls->ls_rsbtbl_size = size; + + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); + rwlock_init(&ls->ls_rsbtbl[i].lock); + } + + size = dlm_config.lkbtbl_size; + ls->ls_lkbtbl_size = size; + + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); + if (!ls->ls_lkbtbl) + goto out_rsbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); + rwlock_init(&ls->ls_lkbtbl[i].lock); + ls->ls_lkbtbl[i].counter = 1; + } + + size = dlm_config.dirtbl_size; + ls->ls_dirtbl_size = size; + + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); + if (!ls->ls_dirtbl) + goto out_lkbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); + rwlock_init(&ls->ls_dirtbl[i].lock); + } + + INIT_LIST_HEAD(&ls->ls_nodes); + INIT_LIST_HEAD(&ls->ls_nodes_gone); + ls->ls_num_nodes = 0; + ls->ls_node_array = NULL; + ls->ls_recoverd_task = NULL; + init_MUTEX(&ls->ls_recoverd_lock); + INIT_LIST_HEAD(&ls->ls_recover); + spin_lock_init(&ls->ls_recover_lock); + INIT_LIST_HEAD(&ls->ls_recover_list); + ls->ls_recover_list_count = 0; + spin_lock_init(&ls->ls_recover_list_lock); + init_waitqueue_head(&ls->ls_wait_general); + INIT_LIST_HEAD(&ls->ls_rootres); + INIT_LIST_HEAD(&ls->ls_requestqueue); + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list); + ls->ls_last_stop = 0; + ls->ls_last_start = 0; + ls->ls_last_finish = 0; + ls->ls_rcom_msgid = 0; + init_MUTEX(&ls->ls_requestqueue_lock); + init_MUTEX(&ls->ls_rcom_lock); + init_rwsem(&ls->ls_unlock_sem); + init_rwsem(&ls->ls_root_lock); + init_rwsem(&ls->ls_in_recovery); + + down_write(&ls->ls_in_recovery); + + if (flags & DLM_LSF_NOTIMERS) + 
set_bit(LSFL_NOTIMERS, &ls->ls_flags); + + + /* + * Connect this lockspace with the cluster manager + */ + + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM, + &ls_ops, TRUE, (void *) ls, &local_id); + if (error) + goto out_recoverd; + + ls->ls_state = LSST_INIT; + ls->ls_local_id = local_id; + + spin_lock(&lslist_lock); + list_add(&ls->ls_list, &lslist); + spin_unlock(&lslist_lock); + + error = kcl_join_service(local_id); + if (error) { + log_error(ls, "service manager join error %d", error); + goto out_reg; + } + + /* The ls isn't actually running until it receives a start() from CMAN. + Neither does it have a global ls id until started. */ + + /* Return the local ID as the lockspace handle. I've left this + cast to a void* as it allows us to replace it with pretty much + anything at a future date without breaking clients. But returning + the address of the lockspace is a bad idea as it could get + forcibly removed, leaving client with a dangling pointer */ + + *lockspace = (void *)(long) local_id; + return 0; + + out_reg: + kcl_unregister_service(ls->ls_local_id); + out_recoverd: + dlm_recoverd_stop(ls); + kfree(ls->ls_dirtbl); + out_lkbfree: + kfree(ls->ls_lkbtbl); + out_rsbfree: + kfree(ls->ls_rsbtbl); + out_lsfree: + kfree(ls); + out: + return error; +} + +/* + * Called by a system like GFS which wants independent lock spaces. + */ + +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags) +{ + int error = -ENOSYS; + + down(&dlmstate_lock); + error = init_internal(); + if (error) + goto out; + + error = new_lockspace(name, namelen, lockspace, flags); + out: + up(&dlmstate_lock); + return error; +} + +/* Return 1 if the lockspace still has active remote locks, + * 2 if the lockspace still has active local locks. + */ +static int lockspace_busy(struct dlm_ls *ls) +{ + int i, lkb_found = 0; + struct dlm_lkb *lkb; + + /* NOTE: We check the lockidtbl here rather than the resource table. + This is because there may be LKBs queued as ASTs that have been + unlinked from their RSBs and are pending deletion once the AST has + been delivered */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + read_lock(&ls->ls_lkbtbl[i].lock); + if (!list_empty(&ls->ls_lkbtbl[i].list)) { + lkb_found = 1; + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, + lkb_idtbl_list) { + if (!lkb->lkb_nodeid) { + read_unlock(&ls->ls_lkbtbl[i].lock); + return 2; + } + } + } + read_unlock(&ls->ls_lkbtbl[i].lock); + } + return lkb_found; +} + +static int release_lockspace(struct dlm_ls *ls, int force) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + struct dlm_recover *rv; + struct list_head *head; + int i; + int busy = lockspace_busy(ls); + + /* Don't destroy a busy lockspace */ + if (busy > force) + return -EBUSY; + + if (force < 3) { + kcl_leave_service(ls->ls_local_id); + kcl_unregister_service(ls->ls_local_id); + } + + dlm_recoverd_stop(ls); + + remove_lockspace(ls); + + /* + * Free direntry structs. + */ + + dlm_dir_clear(ls); + kfree(ls->ls_dirtbl); + + /* + * Free all lkb's on lkbtbl[] lists. 
+ */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + head = &ls->ls_lkbtbl[i].list; + while (!list_empty(head)) { + lkb = list_entry(head->next, struct dlm_lkb, + lkb_idtbl_list); + list_del(&lkb->lkb_idtbl_list); + + if (lkb->lkb_lockqueue_state) + remove_from_lockqueue(lkb); + + if (lkb->lkb_astflags & (AST_COMP | AST_BAST)) + list_del(&lkb->lkb_astqueue); + + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY) + free_lvb(lkb->lkb_lvbptr); + + free_lkb(lkb); + } + } + + kfree(ls->ls_lkbtbl); + + /* + * Free all rsb's on rsbtbl[] lists + */ + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + head = &ls->ls_rsbtbl[i].list; + while (!list_empty(head)) { + rsb = list_entry(head->next, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + + if (rsb->res_lvbptr) + free_lvb(rsb->res_lvbptr); + + free_rsb(rsb); + } + } + + kfree(ls->ls_rsbtbl); + + /* + * Free structures on any other lists + */ + + head = &ls->ls_recover; + while (!list_empty(head)) { + rv = list_entry(head->next, struct dlm_recover, list); + list_del(&rv->list); + kfree(rv); + } + + clear_free_de(ls); + + ls_nodes_clear(ls); + ls_nodes_gone_clear(ls); + if (ls->ls_node_array) + kfree(ls->ls_node_array); + + kfree(ls); + dlm_release(); + module_put(THIS_MODULE); + return 0; +} + + +/* + * Called when a system has released all its locks and is not going to use the + * lockspace any longer. We blindly free everything we're managing for this + * lockspace. Remaining nodes will go through the recovery process as if we'd + * died. The lockspace must continue to function as usual, participating in + * recoveries, until kcl_leave_service returns. + * + * Force has 4 possible values: + * 0 - don't destroy locksapce if it has any LKBs + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs + * 2 - destroy lockspace regardless of LKBs + * 3 - destroy lockspace as part of a forced shutdown + */ + +int dlm_release_lockspace(void *lockspace, int force) +{ + struct dlm_ls *ls; + + ls = find_lockspace_by_local_id(lockspace); + if (!ls) + return -EINVAL; + put_lockspace(ls); + return release_lockspace(ls, force); +} + + +/* Called when the cluster is being shut down dirtily */ +void dlm_emergency_shutdown() +{ + struct dlm_ls *ls; + struct dlm_ls *tmp; + + /* Shut lowcomms down to prevent any socket activity */ + lowcomms_stop_accept(); + + /* Delete the devices that belong the the userland + lockspaces to be deleted. */ + dlm_device_free_devices(); + + /* Now try to clean the lockspaces */ + spin_lock(&lslist_lock); + + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) { + spin_unlock(&lslist_lock); + release_lockspace(ls, 3); + spin_lock(&lslist_lock); + } + + spin_unlock(&lslist_lock); +} + +struct dlm_recover *allocate_dlm_recover(void) +{ + struct dlm_recover *rv; + + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL); + if (rv) + memset(rv, 0, sizeof(struct dlm_recover)); + return rv; +} + +/* + * Called by CMAN on a specific ls. "stop" means set flag which while set + * causes all new requests to ls to be queued and not submitted until flag is + * cleared. stop on a ls also needs to cancel any prior starts on the ls. + * The recoverd thread carries out any work called for by this event. 
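dlm_ls_stop() below freezes a lockspace by clearing LSFL_LS_RUN and taking ls_in_recovery for write, while the normal request paths hold it for read. A hedged, single-threaded userspace sketch of that gate using a pthread rwlock; the function names are invented, and the real code retries rather than immediately queueing when its try-lock fails:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t in_recovery = PTHREAD_RWLOCK_INITIALIZER;
        static int ls_running = 1;

        /* Request path: runs under the read side so recovery cannot start mid-way;
         * if the lockspace is stopped, the request is set aside for later. */
        static void process_request(int id)
        {
                if (!ls_running || pthread_rwlock_tryrdlock(&in_recovery) != 0) {
                        printf("request %d queued for after recovery\n", id);
                        return;
                }
                printf("request %d processed\n", id);
                pthread_rwlock_unlock(&in_recovery);
        }

        /* "stop" callback: blocks until readers drain, then holds the write side. */
        static void lockspace_stop(void)
        {
                ls_running = 0;
                pthread_rwlock_wrlock(&in_recovery);
        }

        /* End of recovery: drop the write side so queued/new requests may proceed. */
        static void lockspace_restart(void)
        {
                ls_running = 1;
                pthread_rwlock_unlock(&in_recovery);
        }

        int main(void)
        {
                process_request(1);     /* processed */
                lockspace_stop();
                process_request(2);     /* queued */
                lockspace_restart();
                process_request(3);     /* processed */
                return 0;
        }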
+ */ + +static int dlm_ls_stop(void *servicedata) +{ + struct dlm_ls *ls = (struct dlm_ls *) servicedata; + int new; + + spin_lock(&ls->ls_recover_lock); + ls->ls_last_stop = ls->ls_last_start; + set_bit(LSFL_LS_STOP, &ls->ls_flags); + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + /* + * This in_recovery lock does two things: + * + * 1) Keeps this function from returning until all threads are out + * of locking routines and locking is truely stopped. + * 2) Keeps any new requests from being processed until it's unlocked + * when recovery is complete. + */ + + if (new) + down_write(&ls->ls_in_recovery); + + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags); + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags); + clear_bit(LSFL_NODES_VALID, &ls->ls_flags); + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags); + + dlm_recoverd_kick(ls); + + return 0; +} + +/* + * Called by CMAN on a specific ls. "start" means enable the lockspace to do + * request processing which first requires that the recovery procedure be + * stepped through with all nodes sharing the lockspace (nodeids). The first + * start on the ls after it's created is a special case and requires some extra + * work like figuring out our own local nodeid. We can't do all this in the + * calling CMAN context, so we must pass this work off to the recoverd thread + * which was created in dlm_init(). The recoverd thread carries out any work + * called for by this event. + */ + +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count, + int event_id, int type) +{ + struct dlm_ls *ls = (struct dlm_ls *) servicedata; + struct dlm_recover *rv; + int error = -ENOMEM; + + rv = allocate_dlm_recover(); + if (!rv) + goto out; + + rv->nodeids = nodeids; + rv->node_count = count; + rv->event_id = event_id; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_last_start == event_id) + log_all(ls, "repeated start %d stop %d finish %d", + event_id, ls->ls_last_stop, ls->ls_last_finish); + ls->ls_last_start = event_id; + list_add_tail(&rv->list, &ls->ls_recover); + set_bit(LSFL_LS_START, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + dlm_recoverd_kick(ls); + error = 0; + + out: + return error; +} + +/* + * Called by CMAN on a specific ls. "finish" means that all nodes which + * received a "start" have completed the start and called kcl_start_done. + * The recoverd thread carries out any work called for by this event. + */ + +static void dlm_ls_finish(void *servicedata, int event_id) +{ + struct dlm_ls *ls = (struct dlm_ls *) servicedata; + + spin_lock(&ls->ls_recover_lock); + ls->ls_last_finish = event_id; + set_bit(LSFL_LS_FINISH, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + dlm_recoverd_kick(ls); +} + +struct kcl_service_ops ls_ops = { + .stop = dlm_ls_stop, + .start = dlm_ls_start, + .finish = dlm_ls_finish +}; diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockspace.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,29 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKSPACE_DOT_H__ +#define __LOCKSPACE_DOT_H__ + +void dlm_lockspace_init(void); +int dlm_init(void); +int dlm_release(void); +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags); +int dlm_release_lockspace(void *ls, int force); +void dlm_emergency_shutdown(void); +struct dlm_ls *find_lockspace_by_global_id(uint32_t id); +struct dlm_ls *find_lockspace_by_local_id(void *id); +struct dlm_ls *find_lockspace_by_name(char *name, int namelen); +void hold_lockspace(struct dlm_ls *ls); +void put_lockspace(struct dlm_ls *ls); + +#endif /* __LOCKSPACE_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lowcomms.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1415 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is it's + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * I don't see any problem with the recv thread executing the locking + * code on behalf of remote processes as the locking code is + * short, efficient and never waits. 
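+ *
+ * Concretely, the two threads are dlm_recvd and dlm_sendd below:
+ * dlm_recvd drains the read_sockets list, while dlm_sendd drains the
+ * state_sockets (pending connects) and write_sockets lists; both sleep
+ * on wait queues and are woken whenever work is queued for them.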
+ * + */ + + +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +struct cbuf { + unsigned base; + unsigned len; + unsigned mask; +}; + +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0) +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) +#define CBUF_EMPTY(cb) ((cb)->len == 0) +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \ + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0) +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct rw_semaphore sock_sem; /* Stop connect races */ + struct list_head read_list; /* On this list when ready for reading */ + struct list_head write_list; /* On this list when ready for writing */ + struct list_head state_list; /* On this list when ready to connect */ + unsigned long flags; /* bit 1,2 = We are on the read/write lists */ +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_IS_OTHERCON 4 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + struct list_head listenlist; /* List of allocated listening sockets */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + struct page *rx_page; + struct cbuf cb; + int retries; + atomic_t waiting_requests; +#define MAX_CONNECT_RETRIES 3 + struct connection *othercon; +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +/* "Template" structure for IPv4 and IPv6 used to fill + * in the missing bits when converting between cman (which knows + * nothing about sockaddr structs) and real life where we actually + * have to connect to these addresses. Also one of these structs + * will hold the cached "us" address. + * + * It's an in6 sockaddr just so there's enough space for anything + * we're likely to see here. 
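+ *
+ * local_addr itself is filled in by lowcomms_our_nodeid() from the
+ * first address cman returns for this node, and its sin6_family is
+ * what decides whether IPv4 or IPv6 sockaddrs are built elsewhere.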
+ */ +static struct sockaddr_in6 local_addr; + +/* Manage daemons */ +static struct task_struct *recv_task; +static struct task_struct *send_task; + +static wait_queue_t lowcomms_send_waitq_head; +static wait_queue_head_t lowcomms_send_waitq; +static wait_queue_t lowcomms_recv_waitq_head; +static wait_queue_head_t lowcomms_recv_waitq; + +/* An array of pointers to connections, indexed by NODEID */ +static struct connection **connections; +static struct rw_semaphore connections_lock; +static kmem_cache_t *con_cache; +static int conn_array_size; +static atomic_t accepting; + +/* List of sockets that have reads pending */ +static struct list_head read_sockets; +static spinlock_t read_sockets_lock; + +/* List of sockets which have writes pending */ +static struct list_head write_sockets; +static spinlock_t write_sockets_lock; + +/* List of sockets which have connects pending */ +static struct list_head state_sockets; +static spinlock_t state_sockets_lock; + +/* List of allocated listen sockets */ +static struct list_head listen_sockets; + +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr); +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len); + + +static struct connection *nodeid2con(int nodeid, int allocation) +{ + struct connection *con = NULL; + + down_read(&connections_lock); + if (nodeid >= conn_array_size) { + int new_size = nodeid + dlm_config.conn_increment; + struct connection **new_conns; + + new_conns = kmalloc(sizeof(struct connection *) * + new_size, allocation); + if (!new_conns) + goto finish; + + up_read(&connections_lock); + /* The worst that can happen here (I think), is that + we get two consecutive reallocations */ + down_write(&connections_lock); + + memset(new_conns, 0, sizeof(struct connection *) * new_size); + memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); + conn_array_size = new_size; + kfree(connections); + connections = new_conns; + + up_write(&connections_lock); + down_read(&connections_lock); + } + + con = connections[nodeid]; + if (con == NULL && allocation) { + con = kmem_cache_alloc(con_cache, allocation); + if (!con) + goto finish; + + memset(con, 0, sizeof(*con)); + con->nodeid = nodeid; + init_rwsem(&con->sock_sem); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + + connections[nodeid] = con; + } + + finish: + up_read(&connections_lock); + return con; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + struct connection *con = sock2con(sk); + + atomic_inc(&con->waiting_requests); + if (test_and_set_bit(CF_READ_PENDING, &con->flags)) + return; + + spin_lock_bh(&read_sockets_lock); + list_add_tail(&con->read_list, &read_sockets); + spin_unlock_bh(&read_sockets_lock); + + wake_up_interruptible(&lowcomms_recv_waitq); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + return; + + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + return; + if (!atomic_read(&accepting)) + return; + + spin_lock_bh(&state_sockets_lock); + list_add_tail(&con->state_list, &state_sockets); + 
spin_unlock_bh(&state_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static void lowcomms_state_change(struct sock *sk) +{ +/* struct connection *con = sock2con(sk); */ + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + lowcomms_write_space(sk); + break; + + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_TIME_WAIT: + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_CLOSING: + /* FIXME: I think this causes more trouble than it solves. + lowcomms wil reconnect anyway when there is something to + send. This just attempts reconnection if a node goes down! + */ + /* lowcomms_connect_sock(con); */ + break; + + default: + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state); + break; + } +} + +/* Make a socket active */ +static int add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + + return 0; +} + +/* Add the port number to an IP6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port, + int *addr_len) +{ + saddr->sin6_family = local_addr.sin6_family; + if (local_addr.sin6_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + } + else { + saddr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, int and_other) +{ + down_write(&con->sock_sem); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + if (con->othercon && and_other) { + /* Argh! recursion in kernel code! + Actually, this isn't a list so it + will only re-enter once. + */ + close_connection(con->othercon, TRUE); + } + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + up_write(&con->sock_sem); +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg; + struct iovec iov[2]; + mm_segment_t fs; + unsigned len; + int r; + int call_again_soon = 0; + + down_read(&con->sock_sem); + + if (con->sock == NULL) + goto out; + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE); + } + + /* + * To avoid doing too many short reads, we will reschedule for + * another time if there are less than 20 bytes left in the buffer. + */ + if (!CBUF_MAY_ADD(&con->cb, 20)) + goto out_resched; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = 0; + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. 
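+	 *
+	 * For example, on a 4096 byte page with cb.base = 4000 and
+	 * cb.len = 50, CBUF_DATA() is (4000 + 50) & 4095 = 4050, so
+	 * iov[0] covers bytes 4050..4095 and iov[1] (set up below)
+	 * covers the wrapped region 0..3999.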
+ */ + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb); + iov[1].iov_len = 0; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (CBUF_DATA(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + msg.msg_iovlen = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + fs = get_fs(); + set_fs(get_ds()); + r = ret = sock_recvmsg(con->sock, &msg, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + set_fs(fs); + + if (ret <= 0) + goto out_close; + if (ret == len) + call_again_soon = 1; + CBUF_ADD(&con->cb, ret); + ret = midcomms_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d\n", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + CBUF_EAT(&con->cb, ret); + + if (CBUF_EMPTY(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + out: + if (call_again_soon) + goto out_resched; + up_read(&con->sock_sem); + ret = 0; + goto out_ret; + + out_resched: + lowcomms_data_ready(con->sock->sk, 0); + up_read(&con->sock_sem); + ret = 0; + goto out_ret; + + out_close: + up_read(&con->sock_sem); + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, FALSE); + lowcomms_connect_sock(con); + } + + out_ret: + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_in6 peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + + memset(&peeraddr, 0, sizeof(peeraddr)); + newsock = sock_alloc(); + if (!newsock) + return -ENOMEM; + + down_read(&con->sock_sem); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len); + if (nodeid == 0) { + printk("dlm: connect from non cluster node\n"); + sock_release(newsock); + up_read(&con->sock_sem); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. 
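+	 * (each node's dlm_sendd is connecting out at the same moment as
+	 * its listening socket accepts the peer's incoming connection).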
+ * TEMPORARY FIX: + * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_KERNEL); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + down_write(&newcon->sock_sem); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_alloc(con_cache, GFP_KERNEL); + if (!othercon) { + printk("dlm: failed to allocate incoming socket\n"); + up_write(&newcon->sock_sem); + result = -ENOMEM; + goto accept_err; + } + memset(othercon, 0, sizeof(*othercon)); + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + init_rwsem(&othercon->sock_sem); + set_bit(CF_IS_OTHERCON, &othercon->flags); + newcon->othercon = othercon; + } + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + + } + + up_write(&newcon->sock_sem); + + /* + * Add it to the active queue in case we got data + * beween processing the accept adding the socket + * to the read_sockets list + */ + lowcomms_data_ready(newsock->sk, 0); + up_read(&con->sock_sem); + + return 0; + + accept_err: + up_read(&con->sock_sem); + sock_release(newsock); + + if (result != -EAGAIN) + printk("dlm: error accepting connection from node: %d\n", result); + return result; +} + +/* Connect a new socket to its peer */ +static int connect_to_sock(struct connection *con) +{ + int result = -EHOSTUNREACH; + struct sockaddr_in6 saddr; + int addr_len; + struct socket *sock; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return 0; + } + + down_write(&con->sock_sem); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + // FIXME not sure this should happen, let alone like this. + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0) + goto out_err; + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); + + add_sock(sock, con); + + log_print("connecting to %d", con->nodeid); + result = + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result != 0) + goto out_err; + + out: + up_write(&con->sock_sem); + /* + * Returning an error here means we've given up trying to connect to + * a remote node, otherwise we return 0 and reschedule the connetion + * attempt + */ + return result; + + out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. 
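+	 *
+	 * At the moment EHOSTUNREACH, ENETUNREACH, ENETDOWN, EINVAL and
+	 * EPROTONOSUPPORT are treated as fatal; any other error simply
+	 * requeues the connect via lowcomms_connect_sock() until
+	 * MAX_CONNECT_RETRIES is exceeded.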
+ */ + if (result != -EHOSTUNREACH && result != -ENETUNREACH && + result != -ENETDOWN && result != EINVAL + && result != -EPROTONOSUPPORT) { + lowcomms_connect_sock(con); + result = 0; + } + goto out; +} + +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len) +{ + struct socket *sock = NULL; + mm_segment_t fs; + int result = 0; + int one = 1; + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr; + + /* Create a socket to communicate with */ + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) { + printk("dlm: Can't create listening comms socket\n"); + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result); + } + sock->sk->sk_user_data = con; + con->rx_action = accept_from_sock; + con->sock = sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Set keepalive failed: %d\n", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + + create_out: + return sock; +} + + +/* Listen on all interfaces */ +static int listen_for_all(void) +{ + int result = 0; + int nodeid; + struct socket *sock = NULL; + struct list_head *addr_list; + struct connection *con = nodeid2con(0, GFP_KERNEL); + struct connection *temp; + struct cluster_node_addr *node_addr; + char local_addr[sizeof(struct sockaddr_in6)]; + + /* This will also fill in local_addr */ + nodeid = lowcomms_our_nodeid(); + + addr_list = kcl_get_node_addresses(nodeid); + if (!addr_list) { + printk("dlm: cannot initialise comms layer\n"); + result = -ENOTCONN; + goto create_out; + } + + list_for_each_entry(node_addr, addr_list, list) { + + if (!con) { + con = kmem_cache_alloc(con_cache, GFP_KERNEL); + if (!con) { + printk("dlm: failed to allocate listen socket\n"); + result = -ENOMEM; + goto create_free; + } + memset(con, 0, sizeof(*con)); + init_rwsem(&con->sock_sem); + spin_lock_init(&con->writequeue_lock); + INIT_LIST_HEAD(&con->writequeue); + set_bit(CF_IS_OTHERCON, &con->flags); + } + + memcpy(local_addr, node_addr->addr, node_addr->addr_len); + sock = create_listen_sock(con, local_addr, + node_addr->addr_len); + if (sock) { + add_sock(sock, con); + + /* Keep a list of dynamically allocated listening sockets + so we can free them at shutdown */ + if (test_bit(CF_IS_OTHERCON, &con->flags)) { + list_add_tail(&con->listenlist, &listen_sockets); + } + } + else { + result = -EADDRINUSE; + kmem_cache_free(con_cache, con); + goto create_free; + } + + con = NULL; + } + + create_out: + return result; + + create_free: + /* Free up any dynamically allocated listening sockets */ + list_for_each_entry_safe(con, temp, &listen_sockets, listenlist) { + sock_release(con->sock); + kmem_cache_free(con_cache, con); + } + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection 
*con, + int allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len, + int allocation, char **ppc) +{ + struct connection *con = nodeid2con(nodeid, allocation); + struct writequeue_entry *e; + int offset = 0; + int users = 0; + + if (!con) + return NULL; + + if (!atomic_read(&accepting)) + return NULL; + + spin_lock(&con->writequeue_lock); + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if (((struct list_head *) e == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + if (users == 0) + kmap(e->page); + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + users = e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void lowcomms_commit_buffer(struct writequeue_entry *e) +{ + struct connection *con = e->con; + int users; + + if (!atomic_read(&accepting)) + return; + + spin_lock(&con->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + kunmap(e->page); + spin_unlock(&con->writequeue_lock); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); + } + return; + + out: + spin_unlock(&con->writequeue_lock); + return; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Send a message */ +static int send_to_sock(struct connection *con) +{ + int ret = 0; + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + + down_read(&con->sock_sem); + if (con->sock == NULL) + goto out_connect; + + sendpage = con->sock->ops->sendpage; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if (len) { + ret = sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) + goto out; + if (ret <= 0) + goto send_error; + } + + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + continue; + } + } + spin_unlock(&con->writequeue_lock); + out: + up_read(&con->sock_sem); + return ret; + + send_error: + up_read(&con->sock_sem); + close_connection(con, FALSE); + lowcomms_connect_sock(con); + return ret; + + out_connect: + up_read(&con->sock_sem); + lowcomms_connect_sock(con); + return 0; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct list_head *list; + struct 
list_head *temp; + + spin_lock(&con->writequeue_lock); + list_for_each_safe(list, temp, &con->writequeue) { + struct writequeue_entry *e = + list_entry(list, struct writequeue_entry, list); + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int lowcomms_close(int nodeid) +{ + struct connection *con; + + if (!connections) + goto out; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + close_connection(con, TRUE); + clean_one_writequeue(con); + atomic_set(&con->waiting_requests, 0); + } + return 0; + + out: + return -1; +} + +/* API send message call, may queue the request */ +/* N.B. This is the old interface - use the new one for new calls */ +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation) +{ + struct writequeue_entry *e; + char *b; + + e = lowcomms_get_buffer(nodeid, len, allocation, &b); + if (e) { + memcpy(b, buf, len); + lowcomms_commit_buffer(e); + return 0; + } + return -ENOBUFS; +} + +/* Look for activity on active sockets */ +static void process_sockets(void) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock_bh(&read_sockets_lock); + list_for_each_safe(list, temp, &read_sockets) { + struct connection *con = + list_entry(list, struct connection, read_list); + list_del(&con->read_list); + clear_bit(CF_READ_PENDING, &con->flags); + + spin_unlock_bh(&read_sockets_lock); + + /* This can reach zero if we are processing requests + * as they come in. + */ + if (atomic_read(&con->waiting_requests) == 0) { + spin_lock_bh(&read_sockets_lock); + continue; + } + + do { + con->rx_action(con); + } while (!atomic_dec_and_test(&con->waiting_requests) && + !kthread_should_stop()); + + /* Don't starve out everyone else */ + schedule(); + spin_lock_bh(&read_sockets_lock); + } + spin_unlock_bh(&read_sockets_lock); +} + +/* Try to send any messages that are pending + */ +static void process_output_queue(void) +{ + struct list_head *list; + struct list_head *temp; + int ret; + + spin_lock_bh(&write_sockets_lock); + list_for_each_safe(list, temp, &write_sockets) { + struct connection *con = + list_entry(list, struct connection, write_list); + list_del(&con->write_list); + clear_bit(CF_WRITE_PENDING, &con->flags); + + spin_unlock_bh(&write_sockets_lock); + + ret = send_to_sock(con); + if (ret < 0) { + } + spin_lock_bh(&write_sockets_lock); + } + spin_unlock_bh(&write_sockets_lock); +} + +static void process_state_queue(void) +{ + struct list_head *list; + struct list_head *temp; + int ret; + + spin_lock_bh(&state_sockets_lock); + list_for_each_safe(list, temp, &state_sockets) { + struct connection *con = + list_entry(list, struct connection, state_list); + list_del(&con->state_list); + clear_bit(CF_CONNECT_PENDING, &con->flags); + spin_unlock_bh(&state_sockets_lock); + + ret = connect_to_sock(con); + if (ret < 0) { + } + spin_lock_bh(&state_sockets_lock); + } + spin_unlock_bh(&state_sockets_lock); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + int nodeid; + + for (nodeid = 1; nodeid < conn_array_size; nodeid++) { + struct connection *con = nodeid2con(nodeid, 0); + + if (con) + clean_one_writequeue(con); + } +} + +static int read_list_empty(void) +{ + int status; + + spin_lock_bh(&read_sockets_lock); + status = list_empty(&read_sockets); + spin_unlock_bh(&read_sockets_lock); + + return status; +} + +/* DLM Transport comms receive daemon */ +static int 
dlm_recvd(void *data) +{ + init_waitqueue_head(&lowcomms_recv_waitq); + init_waitqueue_entry(&lowcomms_recv_waitq_head, current); + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (read_list_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + process_sockets(); + } + + return 0; +} + +static int write_and_state_lists_empty(void) +{ + int status; + + spin_lock_bh(&write_sockets_lock); + status = list_empty(&write_sockets); + spin_unlock_bh(&write_sockets_lock); + + spin_lock_bh(&state_sockets_lock); + if (list_empty(&state_sockets) == 0) + status = 0; + spin_unlock_bh(&state_sockets_lock); + + return status; +} + +/* DLM Transport send daemon */ +static int dlm_sendd(void *data) +{ + init_waitqueue_head(&lowcomms_send_waitq); + init_waitqueue_entry(&lowcomms_send_waitq_head, current); + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (write_and_state_lists_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + process_state_queue(); + process_output_queue(); + } + + return 0; +} + +static void daemons_stop(void) +{ + kthread_stop(recv_task); + kthread_stop(send_task); +} + +static int daemons_start(void) +{ + struct task_struct *p; + int error; + + p = kthread_run(dlm_recvd, NULL, 0, "dlm_recvd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_recvd %d", error); + return error; + } + recv_task = p; + + p = kthread_run(dlm_sendd, NULL, 0, "dlm_sendd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_sendd %d", error); + kthread_stop(recv_task); + return error; + } + send_task = p; + + return 0; +} + +/* + * Return the largest buffer size we can cope with. + */ +int lowcomms_max_buffer_size(void) +{ + return PAGE_CACHE_SIZE; +} + +void lowcomms_stop(void) +{ + int i; + struct connection *temp; + struct connection *lcon; + + atomic_set(&accepting, 0); + + /* Set all the activity flags to prevent any + socket activity. + */ + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) + connections[i]->flags = 0x7; + } + daemons_stop(); + clean_writequeues(); + + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) { + close_connection(connections[i], TRUE); + if (connections[i]->othercon) + kmem_cache_free(con_cache, connections[i]->othercon); + kmem_cache_free(con_cache, connections[i]); + } + } + + kfree(connections); + connections = NULL; + + /* Free up any dynamically allocated listening sockets */ + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) { + sock_release(lcon->sock); + kmem_cache_free(con_cache, lcon); + } + + kmem_cache_destroy(con_cache); + kcl_releaseref_cluster(); +} + +/* This is quite likely to sleep... 
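+ * (it allocates the connection array and connection cache, binds and
+ * listens on the cluster address(es), and starts the dlm_recvd and
+ * dlm_sendd kernel threads)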
*/ +int lowcomms_start(void) +{ + int error = 0; + struct connection *temp; + struct connection *lcon; + + INIT_LIST_HEAD(&read_sockets); + INIT_LIST_HEAD(&write_sockets); + INIT_LIST_HEAD(&state_sockets); + INIT_LIST_HEAD(&listen_sockets); + + spin_lock_init(&read_sockets_lock); + spin_lock_init(&write_sockets_lock); + spin_lock_init(&state_sockets_lock); + init_rwsem(&connections_lock); + + error = -ENOTCONN; + if (kcl_addref_cluster()) + goto out; + + /* + * Temporarily initialise the waitq head so that lowcomms_send_message + * doesn't crash if it gets called before the thread is fully + * initialised + */ + init_waitqueue_head(&lowcomms_send_waitq); + + error = -ENOMEM; + connections = kmalloc(sizeof(struct connection *) * + dlm_config.conn_increment, GFP_KERNEL); + if (!connections) + goto out; + + memset(connections, 0, + sizeof(struct connection *) * dlm_config.conn_increment); + + conn_array_size = dlm_config.conn_increment; + + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, NULL, NULL); + if (!con_cache) + goto fail_free_conn; + + + /* Start listening */ + error = listen_for_all(); + if (error) + goto fail_unlisten; + + error = daemons_start(); + if (error) + goto fail_unlisten; + + atomic_set(&accepting, 1); + + return 0; + + fail_unlisten: + close_connection(connections[0], 0); + kmem_cache_free(con_cache, connections[0]); + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) { + sock_release(lcon->sock); + kmem_cache_free(con_cache, lcon); + } + + kmem_cache_destroy(con_cache); + + fail_free_conn: + kcl_releaseref_cluster(); + kfree(connections); + + out: + return error; +} + +/* Don't accept any more outgoing work */ +void lowcomms_stop_accept() +{ + atomic_set(&accepting, 0); +} + +/* Cluster Manager interface functions for looking up + nodeids and IP addresses by each other +*/ + +/* Return the IP address of a node given its NODEID */ +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr) +{ + struct list_head *addrs; + struct cluster_node_addr *node_addr; + struct cluster_node_addr *current_addr = NULL; + struct sockaddr_in6 *saddr; + int interface; + int i; + + addrs = kcl_get_node_addresses(nodeid); + if (!addrs) + return -1; + + interface = kcl_get_current_interface(); + + /* Look for address number */ + i=0; /* i/f numbers start at 1 */ + list_for_each_entry(node_addr, addrs, list) { + if (interface == ++i) { + current_addr = node_addr; + break; + } + } + + /* If that failed then just use the first one */ + if (!current_addr) + current_addr = (struct cluster_node_addr *)addrs->next; + + saddr = (struct sockaddr_in6 *)current_addr->addr; + + /* Extract the IP address */ + if (local_addr.sin6_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr; + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } + else { + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr; + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr)); + } + + return 0; +} + +/* Return the NODEID for a node given its sockaddr */ +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len) +{ + struct kcl_cluster_node node; + struct sockaddr_in6 ipv6_addr; + struct sockaddr_in ipv4_addr; + + if (local_addr.sin6_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *)addr; + memcpy(&ipv4_addr, &local_addr, addr_len); + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, 
sizeof(ipv4_addr.sin_addr)); + + addr = (struct sockaddr *)&ipv4_addr; + } + else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; + memcpy(&ipv6_addr, &local_addr, addr_len); + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr)); + + addr = (struct sockaddr *)&ipv6_addr; + } + + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0) + return node.node_id; + else + return 0; +} + +int lowcomms_our_nodeid(void) +{ + struct kcl_cluster_node node; + struct list_head *addrs; + struct cluster_node_addr *first_addr; + static int our_nodeid = 0; + + if (our_nodeid) + return our_nodeid; + + if (kcl_get_node_by_nodeid(0, &node) == -1) + return 0; + + our_nodeid = node.node_id; + + /* Fill in the "template" structure */ + addrs = kcl_get_node_addresses(our_nodeid); + if (!addrs) + return 0; + + first_addr = (struct cluster_node_addr *) addrs->next; + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len); + + return node.node_id; +} +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lowcomms.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOWCOMMS_DOT_H__ +#define __LOWCOMMS_DOT_H__ + +/* The old interface */ +int lowcomms_send_message(int csid, char *buf, int len, int allocation); + +/* The new interface */ +struct writequeue_entry; +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len, + int allocation, char **ppc); +extern void lowcomms_commit_buffer(struct writequeue_entry *e); + +int lowcomms_start(void); +void lowcomms_stop(void); +void lowcomms_stop_accept(void); +int lowcomms_close(int nodeid); +int lowcomms_max_buffer_size(void); + +int lowcomms_our_nodeid(void); + +#endif /* __LOWCOMMS_DOT_H__ */ diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/main.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,93 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include + +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "ast.h" +#include "lkb.h" +#include "nodes.h" +#include "locking.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "lowcomms.h" + +int dlm_device_init(void); +void dlm_device_exit(void); +void dlm_proc_init(void); +void dlm_proc_exit(void); + + +/* Cluster manager callbacks, we want to know if a node dies + N.B. this is independent of lockspace-specific event callbacks from SM */ + +static void cman_callback(kcl_callback_reason reason, long arg) +{ + /* This is unconditional. so do what we can to tidy up */ + if (reason == LEAVING) { + dlm_emergency_shutdown(); + } +} + +int __init init_dlm(void) +{ + dlm_proc_init(); + dlm_lockspace_init(); + dlm_nodes_init(); + dlm_device_init(); + dlm_memory_init(); + dlm_config_init(); + + kcl_add_callback(cman_callback); + + printk("DLM %s (built %s %s) installed\n", + DLM_RELEASE_NAME, __DATE__, __TIME__); + + return 0; +} + +void __exit exit_dlm(void) +{ + kcl_remove_callback(cman_callback); + + dlm_device_exit(); + dlm_memory_exit(); + dlm_config_exit(); + dlm_proc_exit(); +} + +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_dlm); +module_exit(exit_dlm); + +EXPORT_SYMBOL(dlm_init); +EXPORT_SYMBOL(dlm_release); +EXPORT_SYMBOL(dlm_new_lockspace); +EXPORT_SYMBOL(dlm_release_lockspace); +EXPORT_SYMBOL(dlm_lock); +EXPORT_SYMBOL(dlm_unlock); +EXPORT_SYMBOL(dlm_debug_dump); +EXPORT_SYMBOL(dlm_locks_dump); diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/memory.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,238 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* memory.c + * + * memory allocation routines + * + */ + +#include "dlm_internal.h" +#include "memory.h" +#include "config.h" + +/* as the man says...Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) + +static kmem_cache_t *rsb_cache_small; +static kmem_cache_t *rsb_cache_large; +static kmem_cache_t *lkb_cache; +static kmem_cache_t *lvb_cache; +static kmem_cache_t *resdir_cache_large; +static kmem_cache_t *resdir_cache_small; + +/* The thresholds above which we allocate large RSBs/direntry rather than small + * ones. 
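+ * (Names shorter than LARGE_RSB_NAME / LARGE_RES_NAME come from the
+ * small caches; anything longer, up to DLM_RESNAME_MAXLEN, comes from
+ * the large ones.)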
This must make the resultant structure end on a word boundary */ +#define LARGE_RSB_NAME 28 +#define LARGE_RES_NAME 28 + +int dlm_memory_init() +{ + int ret = -ENOMEM; + + + rsb_cache_small = + kmem_cache_create("dlm_rsb(small)", + (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), + __alignof__(struct dlm_rsb), 0, NULL, NULL); + if (!rsb_cache_small) + goto out; + + rsb_cache_large = + kmem_cache_create("dlm_rsb(large)", + sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN, + __alignof__(struct dlm_rsb), 0, NULL, NULL); + if (!rsb_cache_large) + goto out_free_rsbs; + + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), + __alignof__(struct dlm_lkb), 0, NULL, NULL); + if (!lkb_cache) + goto out_free_rsbl; + + resdir_cache_large = + kmem_cache_create("dlm_resdir(l)", + sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN, + __alignof__(struct dlm_direntry), 0, NULL, NULL); + if (!resdir_cache_large) + goto out_free_lkb; + + resdir_cache_small = + kmem_cache_create("dlm_resdir(s)", + (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), + __alignof__(struct dlm_direntry), 0, NULL, NULL); + if (!resdir_cache_small) + goto out_free_resl; + + /* LVB cache also holds ranges, so should be 64bit aligned */ + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN, + __alignof__(uint64_t), 0, NULL, NULL); + if (!lkb_cache) + goto out_free_ress; + + ret = 0; + goto out; + + out_free_ress: + kmem_cache_destroy(resdir_cache_small); + + out_free_resl: + kmem_cache_destroy(resdir_cache_large); + + out_free_lkb: + kmem_cache_destroy(lkb_cache); + + out_free_rsbl: + kmem_cache_destroy(rsb_cache_large); + + out_free_rsbs: + kmem_cache_destroy(rsb_cache_small); + + out: + return ret; +} + +void dlm_memory_exit() +{ + kmem_cache_destroy(rsb_cache_large); + kmem_cache_destroy(rsb_cache_small); + kmem_cache_destroy(lkb_cache); + kmem_cache_destroy(resdir_cache_small); + kmem_cache_destroy(resdir_cache_large); + kmem_cache_destroy(lvb_cache); +} + +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) +{ + struct dlm_rsb *r; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + + if (namelen >= LARGE_RSB_NAME) + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation); + else + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation); + + if (r) + memset(r, 0, sizeof(struct dlm_rsb) + namelen); + + return r; +} + +void free_rsb(struct dlm_rsb *r) +{ + int length = r->res_length; + +#ifdef POISON + memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length); +#endif + + if (length >= LARGE_RSB_NAME) + kmem_cache_free(rsb_cache_large, r); + else + kmem_cache_free(rsb_cache_small, r); +} + +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *l; + + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation); + if (l) + memset(l, 0, sizeof(struct dlm_lkb)); + + return l; +} + +void free_lkb(struct dlm_lkb *l) +{ +#ifdef POISON + memset(l, 0xAA, sizeof(struct dlm_lkb)); +#endif + kmem_cache_free(lkb_cache, l); +} + +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) +{ + struct dlm_direntry *rd; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + + if (namelen >= LARGE_RES_NAME) + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation); + else + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation); + + if (rd) + memset(rd, 0, sizeof(struct dlm_direntry)); + + return rd; +} + +void free_direntry(struct dlm_direntry *de) +{ + if (de->length >= LARGE_RES_NAME) + kmem_cache_free(resdir_cache_large, de); + else 
+ kmem_cache_free(resdir_cache_small, de); +} + +char *allocate_lvb(struct dlm_ls *ls) +{ + char *l; + + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation); + if (l) + memset(l, 0, DLM_LVB_LEN); + + return l; +} + +void free_lvb(char *l) +{ + kmem_cache_free(lvb_cache, l); +} + +/* Ranges are allocated from the LVB cache as they are the same size (4x64 + * bits) */ +uint64_t *allocate_range(struct dlm_ls * ls) +{ + uint64_t *l; + + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation); + if (l) + memset(l, 0, DLM_LVB_LEN); + + return l; +} + +void free_range(uint64_t *l) +{ + kmem_cache_free(lvb_cache, l); +} + +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls) +{ + struct dlm_rcom *rc; + + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation); + if (rc) + memset(rc, 0, dlm_config.buffer_size); + + return rc; +} + +void free_rcom_buffer(struct dlm_rcom *rc) +{ + kfree(rc); +} diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/memory.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,32 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMORY_DOT_H__ +#define __MEMORY_DOT_H__ + +int dlm_memory_init(void); +void dlm_memory_exit(void); +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); +void free_rsb(struct dlm_rsb *r); +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); +void free_lkb(struct dlm_lkb *l); +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); +void free_direntry(struct dlm_direntry *de); +char *allocate_lvb(struct dlm_ls *ls); +void free_lvb(char *l); +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls); +void free_rcom_buffer(struct dlm_rcom *rc); +uint64_t *allocate_range(struct dlm_ls *ls); +void free_range(uint64_t *l); + +#endif /* __MEMORY_DOT_H__ */ diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/midcomms.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,355 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * midcomms.c + * + * This is the appallingly named "mid-level" comms layer. 
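+ * (it sits between lowcomms.c, which only moves bytes, and the locking
+ * core, which only understands messages).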
+ * + * Its purpose is to take packets from the "real" comms layer, + * split them up into packets and pass them to the interested + * part of the locking mechanism. + * + * It also takes messages from the locking layer, formats them + * into packets and sends them to the comms layer. + * + * It knows the format of the mid-level messages used and nodeidss + * but it does not know how to resolve a nodeid into an IP address + * or any of the comms channel details + * + */ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "lockqueue.h" +#include "nodes.h" +#include "reccomms.h" +#include "config.h" + +/* Byteorder routines */ + +static void host_to_network(void *msg) +{ + struct dlm_header *head = msg; + struct dlm_request *req = msg; + struct dlm_reply *rep = msg; + struct dlm_query_request *qreq = msg; + struct dlm_query_reply *qrep= msg; + struct dlm_rcom *rc = msg; + + /* Force into network byte order */ + + /* + * Do the common header first + */ + + head->rh_length = cpu_to_le16(head->rh_length); + head->rh_lockspace = cpu_to_le32(head->rh_lockspace); + /* Leave the lkid alone as it is transparent at the remote end */ + + /* + * Do the fields in the remlockrequest or remlockreply structs + */ + + switch (req->rr_header.rh_cmd) { + + case GDLM_REMCMD_LOCKREQUEST: + case GDLM_REMCMD_CONVREQUEST: + req->rr_range_start = cpu_to_le64(req->rr_range_start); + req->rr_range_end = cpu_to_le64(req->rr_range_end); + /* Deliberate fall through */ + case GDLM_REMCMD_UNLOCKREQUEST: + case GDLM_REMCMD_LOOKUP: + case GDLM_REMCMD_LOCKGRANT: + case GDLM_REMCMD_SENDBAST: + case GDLM_REMCMD_SENDCAST: + case GDLM_REMCMD_REM_RESDATA: + req->rr_flags = cpu_to_le32(req->rr_flags); + req->rr_status = cpu_to_le32(req->rr_status); + break; + + case GDLM_REMCMD_LOCKREPLY: + rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate); + rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid); + rep->rl_status = cpu_to_le32(rep->rl_status); + break; + + case GDLM_REMCMD_RECOVERMESSAGE: + case GDLM_REMCMD_RECOVERREPLY: + rc->rc_msgid = cpu_to_le32(rc->rc_msgid); + rc->rc_datalen = cpu_to_le16(rc->rc_datalen); + break; + + case GDLM_REMCMD_QUERY: + qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid); + qreq->rq_query = cpu_to_le32(qreq->rq_query); + qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks); + break; + + case GDLM_REMCMD_QUERYREPLY: + qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks); + qrep->rq_status = cpu_to_le32(qrep->rq_status); + qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount); + qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount); + qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount); + break; + + default: + printk("dlm: warning, unknown REMCMD type %u\n", + req->rr_header.rh_cmd); + } +} + +static void network_to_host(void *msg) +{ + struct dlm_header *head = msg; + struct dlm_request *req = msg; + struct dlm_reply *rep = msg; + struct dlm_query_request *qreq = msg; + struct dlm_query_reply *qrep = msg; + struct dlm_rcom *rc = msg; + + /* Force into host byte order */ + + /* + * Do the common header first + */ + + head->rh_length = le16_to_cpu(head->rh_length); + head->rh_lockspace = le32_to_cpu(head->rh_lockspace); + /* Leave the lkid alone as it is transparent at the remote end */ + + /* + * Do the fields in the remlockrequest or remlockreply structs + */ + + switch (req->rr_header.rh_cmd) { + + case GDLM_REMCMD_LOCKREQUEST: + case GDLM_REMCMD_CONVREQUEST: + req->rr_range_start = le64_to_cpu(req->rr_range_start); + req->rr_range_end = le64_to_cpu(req->rr_range_end); + case 
GDLM_REMCMD_LOOKUP: + case GDLM_REMCMD_UNLOCKREQUEST: + case GDLM_REMCMD_LOCKGRANT: + case GDLM_REMCMD_SENDBAST: + case GDLM_REMCMD_SENDCAST: + case GDLM_REMCMD_REM_RESDATA: + /* Actually, not much to do here as the remote lock IDs are + * transparent too */ + req->rr_flags = le32_to_cpu(req->rr_flags); + req->rr_status = le32_to_cpu(req->rr_status); + break; + + case GDLM_REMCMD_LOCKREPLY: + rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate); + rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid); + rep->rl_status = le32_to_cpu(rep->rl_status); + break; + + case GDLM_REMCMD_RECOVERMESSAGE: + case GDLM_REMCMD_RECOVERREPLY: + rc->rc_msgid = le32_to_cpu(rc->rc_msgid); + rc->rc_datalen = le16_to_cpu(rc->rc_datalen); + break; + + + case GDLM_REMCMD_QUERY: + qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid); + qreq->rq_query = le32_to_cpu(qreq->rq_query); + qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks); + break; + + case GDLM_REMCMD_QUERYREPLY: + qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks); + qrep->rq_status = le32_to_cpu(qrep->rq_status); + qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount); + qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount); + qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount); + break; + + default: + printk("dlm: warning, unknown REMCMD type %u\n", + req->rr_header.rh_cmd); + } +} + +static void copy_from_cb(void *dst, const void *base, unsigned offset, + unsigned len, unsigned limit) +{ + unsigned copy = len; + + if ((copy + offset) > limit) + copy = limit - offset; + memcpy(dst, base + offset, copy); + len -= copy; + if (len) + memcpy(dst + copy, base, len); +} + +static void khexdump(const unsigned char *c, int len) +{ + while (len > 16) { + printk(KERN_INFO + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], + c[9], c[10], c[11], c[12], c[13], c[14], c[15]); + len -= 16; + c += 16; + } + while (len > 4) { + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2], + c[3]); + len -= 4; + c += 4; + } + while (len > 0) { + printk(KERN_INFO "%02x\n", c[0]); + len--; + c++; + } +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + * + * Only complete messages are processed here, any "spare" bytes from + * the end of a buffer are saved and tacked onto the front of the next + * message that comes in. I doubt this will happen very often but we + * need to be able to cope with it and I don't want the task to be waiting + * for packets to come in when there is useful work to be done. + * + */ +int midcomms_process_incoming_buffer(int nodeid, const void *base, + unsigned offset, unsigned len, + unsigned limit) +{ + unsigned char __tmp[sizeof(struct dlm_header) + 64]; + struct dlm_header *msg = (struct dlm_header *) __tmp; + int ret = 0; + int err = 0; + unsigned msglen; + __u32 id, space; + + while (len > sizeof(struct dlm_header)) { + /* Get message header and check it over */ + copy_from_cb(msg, base, offset, sizeof(struct dlm_header), + limit); + msglen = le16_to_cpu(msg->rh_length); + id = msg->rh_lkid; + space = msg->rh_lockspace; + + /* Check message size */ + err = -EINVAL; + if (msglen < sizeof(struct dlm_header)) + break; + err = -E2BIG; + if (msglen > dlm_config.buffer_size) { + printk("dlm: message size from %d too big %d(pkt len=%d)\n", nodeid, msglen, len); + khexdump((const unsigned char *) msg, len); + break; + } + err = 0; + + /* Not enough in buffer yet? 
wait for some more */ + if (msglen > len) + break; + + /* Make sure our temp buffer is large enough */ + if (msglen > sizeof(__tmp) && + msg == (struct dlm_header *) __tmp) { + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); + if (msg == NULL) + return ret; + } + + copy_from_cb(msg, base, offset, msglen, limit); + BUG_ON(id != msg->rh_lkid); + BUG_ON(space != msg->rh_lockspace); + ret += msglen; + offset += msglen; + offset &= (limit - 1); + len -= msglen; + network_to_host(msg); + + if ((msg->rh_cmd > 32) || + (msg->rh_cmd == 0) || + (msg->rh_length < sizeof(struct dlm_header)) || + (msg->rh_length > dlm_config.buffer_size)) { + + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, " + "lkid=%u, lockspace=%u\n", + msg->rh_cmd, msg->rh_flags, msg->rh_length, + msg->rh_lkid, msg->rh_lockspace); + + printk("dlm: midcomms: base=%p, offset=%u, len=%u, " + "ret=%u, limit=%08x newbuf=%d\n", + base, offset, len, ret, limit, + ((struct dlm_header *) __tmp == msg)); + + khexdump((const unsigned char *) msg, msg->rh_length); + + return -EBADMSG; + } + + switch (msg->rh_cmd) { + case GDLM_REMCMD_RECOVERMESSAGE: + case GDLM_REMCMD_RECOVERREPLY: + process_recovery_comm(nodeid, msg); + break; + default: + process_cluster_request(nodeid, msg, FALSE); + } + } + + if (msg != (struct dlm_header *) __tmp) + kfree(msg); + + return err ? err : ret; +} + +/* + * Send a lowcomms buffer + */ + +void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e) +{ + host_to_network(msg); + lowcomms_commit_buffer(e); +} + +/* + * Make the message into network byte order and send it + */ + +int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg, + int allocation) +{ + int len = msg->rh_length; + + host_to_network(msg); + + /* + * Loopback. In fact, the locking code pretty much prevents this from + * being needed but it can happen when the directory node is also the + * local node. + */ + + if (nodeid == our_nodeid()) + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0, + len, len); + + return lowcomms_send_message(nodeid, (char *) msg, len, allocation); +} diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/midcomms.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,24 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __MIDCOMMS_DOT_H__ +#define __MIDCOMMS_DOT_H__ + +int midcomms_send_message(uint32_t csid, struct dlm_header *msg, + int allocation); +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset, + unsigned len, unsigned limit); +void midcomms_send_buffer(struct dlm_header *msg, + struct writequeue_entry *e); + +#endif /* __MIDCOMMS_DOT_H__ */ diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/nodes.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,347 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "nodes.h" +#include "recover.h" +#include "reccomms.h" +#include "util.h" + +static struct list_head cluster_nodes; +static spinlock_t node_lock; + + +void dlm_nodes_init(void) +{ + INIT_LIST_HEAD(&cluster_nodes); + spin_lock_init(&node_lock); +} + +static struct dlm_node *search_node(uint32_t nodeid) +{ + struct dlm_node *node; + + list_for_each_entry(node, &cluster_nodes, list) { + if (node->nodeid == nodeid) + goto out; + } + node = NULL; + out: + return node; +} + +static void put_node(struct dlm_node *node) +{ + spin_lock(&node_lock); + if (atomic_dec_and_test(&node->refcount)) { + lowcomms_close(node->nodeid); + list_del(&node->list); + spin_unlock(&node_lock); + kfree(node); + return; + } + spin_unlock(&node_lock); +} + +static int get_node(uint32_t nodeid, struct dlm_node **ndp) +{ + struct dlm_node *node, *node2; + int error = -ENOMEM; + + spin_lock(&node_lock); + node = search_node(nodeid); + if (node) + atomic_inc(&node->refcount); + spin_unlock(&node_lock); + + if (node) + goto out; + + node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL); + if (!node) + goto fail; + + memset(node, 0, sizeof(struct dlm_node)); + node->nodeid = nodeid; + + spin_lock(&node_lock); + node2 = search_node(nodeid); + if (node2) { + atomic_inc(&node2->refcount); + spin_unlock(&node_lock); + kfree(node); + node = node2; + goto out; + } + + atomic_set(&node->refcount, 1); + list_add_tail(&node->list, &cluster_nodes); + spin_unlock(&node_lock); + + out: + *ndp = node; + return 0; + fail: + return error; +} + +int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb) +{ + struct dlm_csb *csb; + struct dlm_node *node; + int error = -ENOMEM; + + csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL); + if (!csb) + goto fail; + + memset(csb, 0, sizeof(struct dlm_csb)); + + error = get_node(nodeid, &node); + if (error) + goto fail_free; + + csb->node = node; + *ret_csb = csb; + return 0; + + fail_free: + kfree(csb); + fail: + return error; +} + +void release_csb(struct dlm_csb *csb) 
+{ + put_node(csb->node); + kfree(csb); +} + +uint32_t our_nodeid(void) +{ + return lowcomms_our_nodeid(); +} + +static void make_node_array(struct dlm_ls *ls) +{ + struct dlm_csb *csb; + uint32_t *array; + int i = 0; + + if (ls->ls_node_array) { + kfree(ls->ls_node_array); + ls->ls_node_array = NULL; + } + + array = kmalloc(sizeof(uint32_t) * ls->ls_num_nodes, GFP_KERNEL); + if (!array) + return; + + list_for_each_entry(csb, &ls->ls_nodes, list) + array[i++] = csb->node->nodeid; + + ls->ls_node_array = array; +} + +int nodes_reconfig_wait(struct dlm_ls *ls) +{ + int error; + + if (ls->ls_low_nodeid == our_nodeid()) { + error = dlm_wait_status_all(ls, NODES_VALID); + if (!error) + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags); + + /* Experimental: this delay should allow any final messages + * from the previous node to be received before beginning + * recovery. */ + + if (ls->ls_num_nodes == 1) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout((2) * HZ); + } + + } else + error = dlm_wait_status_low(ls, NODES_ALL_VALID); + + return error; +} + +static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new) +{ + struct dlm_csb *csb = NULL; + struct list_head *tmp; + struct list_head *newlist = &new->list; + struct list_head *head = &ls->ls_nodes; + + list_for_each(tmp, head) { + csb = list_entry(tmp, struct dlm_csb, list); + + if (new->node->nodeid < csb->node->nodeid) + break; + } + + if (!csb) + list_add_tail(newlist, head); + else { + /* FIXME: can use list macro here */ + newlist->prev = tmp->prev; + newlist->next = tmp; + tmp->prev->next = newlist; + tmp->prev = newlist; + } +} + +int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) +{ + struct dlm_csb *csb, *safe; + int error, i, found, pos = 0, neg = 0; + uint32_t low = (uint32_t) (-1); + + /* + * Remove (and save) departed nodes from lockspace's nodes list + */ + + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) { + found = FALSE; + for (i = 0; i < rv->node_count; i++) { + if (csb->node->nodeid == rv->nodeids[i]) { + found = TRUE; + break; + } + } + + if (!found) { + neg++; + csb->gone_event = rv->event_id; + list_del(&csb->list); + list_add_tail(&csb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; + log_all(ls, "remove node %u", csb->node->nodeid); + } + } + + /* + * Add new nodes to lockspace's nodes list + */ + + for (i = 0; i < rv->node_count; i++) { + found = FALSE; + list_for_each_entry(csb, &ls->ls_nodes, list) { + if (csb->node->nodeid == rv->nodeids[i]) { + found = TRUE; + break; + } + } + + if (!found) { + pos++; + + error = init_new_csb(rv->nodeids[i], &csb); + DLM_ASSERT(!error,); + + add_ordered_node(ls, csb); + ls->ls_num_nodes++; + log_all(ls, "add node %u", csb->node->nodeid); + } + } + + list_for_each_entry(csb, &ls->ls_nodes, list) { + if (csb->node->nodeid < low) + low = csb->node->nodeid; + } + + ls->ls_low_nodeid = low; + set_bit(LSFL_NODES_VALID, &ls->ls_flags); + *neg_out = neg; + make_node_array(ls); + + error = nodes_reconfig_wait(ls); + + log_all(ls, "total nodes %d", ls->ls_num_nodes); + + return error; +} + +static void nodes_clear(struct list_head *head) +{ + struct dlm_csb *csb; + + while (!list_empty(head)) { + csb = list_entry(head->next, struct dlm_csb, list); + list_del(&csb->list); + release_csb(csb); + } +} + +void ls_nodes_clear(struct dlm_ls *ls) +{ + nodes_clear(&ls->ls_nodes); + ls->ls_num_nodes = 0; +} + +void ls_nodes_gone_clear(struct dlm_ls *ls) +{ + nodes_clear(&ls->ls_nodes_gone); +} + +int ls_nodes_init(struct dlm_ls *ls, struct 
dlm_recover *rv) +{ + struct dlm_csb *csb; + int i, error; + uint32_t low = (uint32_t) (-1); + + /* nodes may be left from a previous failed start */ + ls_nodes_clear(ls); + + log_all(ls, "add nodes"); + + for (i = 0; i < rv->node_count; i++) { + error = init_new_csb(rv->nodeids[i], &csb); + if (error) + goto fail; + + add_ordered_node(ls, csb); + ls->ls_num_nodes++; + + if (csb->node->nodeid < low) + low = csb->node->nodeid; + } + + ls->ls_low_nodeid = low; + set_bit(LSFL_NODES_VALID, &ls->ls_flags); + make_node_array(ls); + + error = nodes_reconfig_wait(ls); + + log_all(ls, "total nodes %d", ls->ls_num_nodes); + return error; + fail: + ls_nodes_clear(ls); + return error; +} + +int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid) +{ + struct dlm_csb *csb; + + list_for_each_entry(csb, &ls->ls_nodes_gone, list) { + if (csb->node->nodeid == nodeid) + return TRUE; + } + return FALSE; +} diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/nodes.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,27 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __NODES_DOT_H__ +#define __NODES_DOT_H__ + +void dlm_nodes_init(void); +int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb); +void release_csb(struct dlm_csb * csb); +uint32_t our_nodeid(void); +int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg); +int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr); +int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid); +void ls_nodes_clear(struct dlm_ls *ls); +void ls_nodes_gone_clear(struct dlm_ls *ls); + +#endif /* __NODES_DOT_H__ */ diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/proc.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,652 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lockspace.h" + +#if defined(DLM_DEBUG) +#define DLM_DEBUG_SIZE (1024) +#define MAX_DEBUG_MSG_LEN (64) +#else +#define DLM_DEBUG_SIZE (0) +#define MAX_DEBUG_MSG_LEN (0) +#endif + +static char * debug_buf; +static unsigned int debug_size; +static unsigned int debug_point; +static int debug_wrap; +static spinlock_t debug_lock; +static struct proc_dir_entry * debug_proc_entry = NULL; +static char proc_ls_name[255] = ""; + +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS +static struct proc_dir_entry * locks_proc_entry = NULL; +static struct seq_operations locks_info_op; +static struct proc_dir_entry * dir_proc_entry = NULL; +static struct seq_operations dir_info_op; + + +/* + * /proc/cluster/dlm_locks - dump resources and locks + */ + +static int locks_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &locks_info_op); +} + +/* Write simply sets the lockspace to use */ +static ssize_t locks_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + if (count < sizeof(proc_ls_name)) { + copy_from_user(proc_ls_name, buf, count); + proc_ls_name[count] = '\0'; + + /* Remove any trailing LF so that lazy users + can just echo "lsname" > /proc/cluster/dlm_locks */ + if (proc_ls_name[count - 1] == '\n') + proc_ls_name[count - 1] = '\0'; + + return count; + } + return 0; +} + +static struct file_operations locks_fops = { + open:locks_open, + write:locks_write, + read:seq_read, + llseek:seq_lseek, + release:seq_release, +}; + +struct ls_dumpinfo { + int entry; + struct list_head *next; + struct dlm_ls *ls; + struct dlm_rsb *rsb; + struct dlm_direntry *de; +}; + +static int print_resource(struct dlm_rsb * res, struct seq_file *s); + +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di) +{ + int i; + + if (!di->next) { + /* Find the next non-empty hash bucket */ + for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) { + read_lock(&di->ls->ls_rsbtbl[i].lock); + if (!list_empty(&di->ls->ls_rsbtbl[i].list)) { + di->next = di->ls->ls_rsbtbl[i].list.next; + read_unlock(&di->ls->ls_rsbtbl[i].lock); + break; + } + read_unlock(&di->ls->ls_rsbtbl[i].lock); + } + di->entry = i; + + if (di->entry >= di->ls->ls_rsbtbl_size) + return NULL; /* End of hash list */ + } else { /* Find the next entry in the list */ + i = di->entry; + read_lock(&di->ls->ls_rsbtbl[i].lock); + di->next = di->next->next; + if (di->next->next == di->ls->ls_rsbtbl[i].list.next) { + /* End of list - move to next bucket */ + di->next = NULL; + di->entry++; + read_unlock(&di->ls->ls_rsbtbl[i].lock); + return next_rsb(di); /* do the top half of this conditional */ + } + read_unlock(&di->ls->ls_rsbtbl[i].lock); + } + di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain); + + return di; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct ls_dumpinfo *di; + struct dlm_ls *ls; + int i; + + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name)); + if (!ls) + return NULL; + + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL); + if (!di) + return NULL; + + if (*pos == 0) + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name); + + di->entry = 0; + di->next = NULL; + di->ls = ls; + di->de = NULL; + + for (i = 0; i < *pos; i++) + if (next_rsb(di) == NULL) + return NULL; + + return next_rsb(di); +} + +static void *s_next(struct 
seq_file *m, void *p, loff_t *pos) +{ + struct ls_dumpinfo *di = p; + + *pos += 1; + + return next_rsb(di); +} + +static int s_show(struct seq_file *m, void *p) +{ + struct ls_dumpinfo *di = p; + return print_resource(di->rsb, m); +} + +static void s_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static struct seq_operations locks_info_op = { + start:s_start, + next:s_next, + stop:s_stop, + show:s_show +}; + +static char *print_lockmode(int mode) +{ + switch (mode) { + case DLM_LOCK_IV: + return "--"; + case DLM_LOCK_NL: + return "NL"; + case DLM_LOCK_CR: + return "CR"; + case DLM_LOCK_CW: + return "CW"; + case DLM_LOCK_PR: + return "PR"; + case DLM_LOCK_PW: + return "PW"; + case DLM_LOCK_EX: + return "EX"; + default: + return "??"; + } +} + +static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) +{ + + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); + + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + || lkb->lkb_status == GDLM_LKSTS_WAITING) + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); + + if (lkb->lkb_range) { + /* This warns on Alpha. Tough. Only I see it */ + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + || lkb->lkb_status == GDLM_LKSTS_GRANTED) + seq_printf(s, " %" PRIx64 "-%" PRIx64, + lkb->lkb_range[GR_RANGE_START], + lkb->lkb_range[GR_RANGE_END]); + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + || lkb->lkb_status == GDLM_LKSTS_WAITING) + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")", + lkb->lkb_range[RQ_RANGE_START], + lkb->lkb_range[RQ_RANGE_END]); + } + + if (lkb->lkb_nodeid) { + if (lkb->lkb_nodeid != res->res_nodeid) + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, + lkb->lkb_remid); + else + seq_printf(s, " Master: %08x", lkb->lkb_remid); + } + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) + seq_printf(s, " LQ: %d,0x%x", lkb->lkb_lockqueue_state, + lkb->lkb_lockqueue_flags); + + seq_printf(s, "\n"); +} + +static int print_resource(struct dlm_rsb *res, struct seq_file *s) +{ + int i; + struct list_head *locklist; + + seq_printf(s, "\nResource %p (parent %p). 
Name (len=%d) \"", res, + res->res_parent, res->res_length); + for (i = 0; i < res->res_length; i++) { + if (isprint(res->res_name[i])) + seq_printf(s, "%c", res->res_name[i]); + else + seq_printf(s, "%c", '.'); + } + if (res->res_nodeid) + seq_printf(s, "\" \nLocal Copy, Master is node %d\n", + res->res_nodeid); + else + seq_printf(s, "\" \nMaster Copy\n"); + + /* Print the LVB: */ + if (res->res_lvbptr) { + seq_printf(s, "LVB: "); + for (i = 0; i < DLM_LVB_LEN; i++) { + if (i == DLM_LVB_LEN / 2) + seq_printf(s, "\n "); + seq_printf(s, "%02x ", + (unsigned char) res->res_lvbptr[i]); + } + seq_printf(s, "\n"); + } + + /* Print the locks attached to this resource */ + seq_printf(s, "Granted Queue\n"); + list_for_each(locklist, &res->res_grantqueue) { + struct dlm_lkb *this_lkb = + list_entry(locklist, struct dlm_lkb, lkb_statequeue); + print_lock(s, this_lkb, res); + } + + seq_printf(s, "Conversion Queue\n"); + list_for_each(locklist, &res->res_convertqueue) { + struct dlm_lkb *this_lkb = + list_entry(locklist, struct dlm_lkb, lkb_statequeue); + print_lock(s, this_lkb, res); + } + + seq_printf(s, "Waiting Queue\n"); + list_for_each(locklist, &res->res_waitqueue) { + struct dlm_lkb *this_lkb = + list_entry(locklist, struct dlm_lkb, lkb_statequeue); + print_lock(s, this_lkb, res); + } + + return 0; +} + + +/* + * /proc/cluster/dlm_dir - dump resource directory + */ + +static int print_de(struct dlm_direntry *de, struct seq_file *s) +{ + char strname[DLM_RESNAME_MAXLEN+1]; + + memset(strname, 0, DLM_RESNAME_MAXLEN+1); + memcpy(strname, de->name, de->length); + + seq_printf(s, "%s %u\n", strname, de->master_nodeid); + return 0; +} + +static int dir_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dir_info_op); +} + +static ssize_t dir_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) +{ + return locks_write(file, buf, count, ppos); +} + +static struct file_operations dir_fops = { + .open = dir_open, + .write = dir_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static struct ls_dumpinfo *next_de(struct ls_dumpinfo *di) +{ + int i; + + if (!di->next) { + /* Find the next non-empty hash bucket */ + for (i = di->entry; i < di->ls->ls_dirtbl_size; i++) { + read_lock(&di->ls->ls_dirtbl[i].lock); + if (!list_empty(&di->ls->ls_dirtbl[i].list)) { + di->next = di->ls->ls_dirtbl[i].list.next; + read_unlock(&di->ls->ls_dirtbl[i].lock); + break; + } + read_unlock(&di->ls->ls_dirtbl[i].lock); + } + di->entry = i; + + if (di->entry >= di->ls->ls_dirtbl_size) + return NULL; /* End of hash list */ + } else { /* Find the next entry in the list */ + i = di->entry; + read_lock(&di->ls->ls_dirtbl[i].lock); + di->next = di->next->next; + if (di->next->next == di->ls->ls_dirtbl[i].list.next) { + /* End of list - move to next bucket */ + di->next = NULL; + di->entry++; + read_unlock(&di->ls->ls_dirtbl[i].lock); + return next_de(di); /* do the top half of this conditional */ + } + read_unlock(&di->ls->ls_dirtbl[i].lock); + } + di->de = list_entry(di->next, struct dlm_direntry, list); + + return di; +} + +static void *dir_start(struct seq_file *m, loff_t *pos) +{ + struct ls_dumpinfo *di; + struct dlm_ls *ls; + int i; + + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name)); + if (!ls) + return NULL; + + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL); + if (!di) + return NULL; + + if (*pos == 0) + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name); + + di->entry = 0; + di->next = NULL; + di->ls = 
ls; + + for (i = 0; i < *pos; i++) + if (next_de(di) == NULL) + return NULL; + + return next_de(di); +} + +static void *dir_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct ls_dumpinfo *di = p; + + *pos += 1; + + return next_de(di); +} + +static int dir_show(struct seq_file *m, void *p) +{ + struct ls_dumpinfo *di = p; + return print_de(di->de, m); +} + +static void dir_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static struct seq_operations dir_info_op = { + .start = dir_start, + .next = dir_next, + .stop = dir_stop, + .show = dir_show, +}; +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */ + +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...) +{ + va_list va; + int i, n, size, len; + char buf[MAX_DEBUG_MSG_LEN+1]; + + spin_lock(&debug_lock); + + if (!debug_buf) + goto out; + + size = MAX_DEBUG_MSG_LEN; + memset(buf, 0, size+1); + + n = snprintf(buf, size, "%s ", ls->ls_name); + size -= n; + + va_start(va, fmt); + vsnprintf(buf+n, size, fmt, va); + va_end(va); + + len = strlen(buf); + if (len > MAX_DEBUG_MSG_LEN-1) + len = MAX_DEBUG_MSG_LEN-1; + buf[len] = '\n'; + buf[len+1] = '\0'; + + for (i = 0; i < strlen(buf); i++) { + debug_buf[debug_point++] = buf[i]; + + if (debug_point == debug_size) { + debug_point = 0; + debug_wrap = 1; + } + } + out: + spin_unlock(&debug_lock); +} + +void dlm_debug_dump(void) +{ + int i; + + spin_lock(&debug_lock); + if (debug_wrap) { + for (i = debug_point; i < debug_size; i++) + printk("%c", debug_buf[i]); + } + for (i = 0; i < debug_point; i++) + printk("%c", debug_buf[i]); + spin_unlock(&debug_lock); +} + +void dlm_debug_setup(int size) +{ + char *b = NULL; + + if (size > PAGE_SIZE) + size = PAGE_SIZE; + if (size) + b = kmalloc(size, GFP_KERNEL); + + spin_lock(&debug_lock); + if (debug_buf) + kfree(debug_buf); + if (!size || !b) + goto out; + debug_size = size; + debug_point = 0; + debug_wrap = 0; + debug_buf = b; + memset(debug_buf, 0, debug_size); + out: + spin_unlock(&debug_lock); +} + +static void dlm_debug_init(void) +{ + debug_buf = NULL; + debug_size = 0; + debug_point = 0; + debug_wrap = 0; + spin_lock_init(&debug_lock); + + dlm_debug_setup(DLM_DEBUG_SIZE); +} + +#ifdef CONFIG_PROC_FS +int dlm_debug_info(char *b, char **start, off_t offset, int length) +{ + int i, n = 0; + + spin_lock(&debug_lock); + + if (debug_wrap) { + for (i = debug_point; i < debug_size; i++) + n += sprintf(b + n, "%c", debug_buf[i]); + } + for (i = 0; i < debug_point; i++) + n += sprintf(b + n, "%c", debug_buf[i]); + + spin_unlock(&debug_lock); + + return n; +} +#endif + +#ifdef CONFIG_DLM_STATS +struct dlm_statinfo dlm_stats; +static struct proc_dir_entry *stats_proc_entry = NULL; +static int dlm_stats_info(char *b, char **start, off_t offset, int length) +{ + int n=0; + int i; + long lq_locks = 0; + unsigned long lq_time = 0; + + n += sprintf(b+n, "DLM stats (HZ=%d)\n\n", HZ); + n += sprintf(b+n, "Lock operations: %7d\n", dlm_stats.lockops); + n += sprintf(b+n, "Unlock operations: %7d\n", dlm_stats.unlockops); + n += sprintf(b+n, "Convert operations: %7d\n", dlm_stats.convertops); + n += sprintf(b+n, "Completion ASTs: %7d\n", dlm_stats.cast); + n += sprintf(b+n, "Blocking ASTs: %7d\n", dlm_stats.bast); + n += sprintf(b+n, "\n"); + n += sprintf(b+n, "Lockqueue num waittime ave\n"); + for (i=1; i<=4 ; i++) { + char *lq_reason="???"; + switch (i){ + case 1: lq_reason = "WAIT_RSB "; + break; + case 2: lq_reason = "WAIT_CONV "; + break; + case 3: lq_reason = "WAIT_GRANT "; + break; + case 4: lq_reason = "WAIT_UNLOCK"; + break; + } + if 
(dlm_stats.lockqueue_locks[i]) + n += sprintf(b+n, "%s %6lu %7lu %3lu\n", + lq_reason, + dlm_stats.lockqueue_locks[i], + dlm_stats.lockqueue_time[i], + dlm_stats.lockqueue_time[i]/ + dlm_stats.lockqueue_locks[i]); + + lq_locks += dlm_stats.lockqueue_locks[i]; + lq_time += dlm_stats.lockqueue_time[i]; + } + if (lq_locks) + n += sprintf(b+n, "Total %6lu %7lu %3lu\n", + lq_locks, lq_time, lq_time/lq_locks); + return n; +} + +static int dlm_stats_clear(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + memset(&dlm_stats, 0, sizeof(dlm_stats)); + return count; +} +#endif /* CONFIG_DLM_STATS */ + +void dlm_proc_init(void) +{ +#ifdef CONFIG_PROC_FS + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO, + NULL); + if (!debug_proc_entry) + return; + + debug_proc_entry->get_info = &dlm_debug_info; +#endif + +#ifdef CONFIG_DLM_STATS + stats_proc_entry = create_proc_entry("cluster/dlm_stats", + S_IRUSR | S_IWUSR, NULL); + if (!stats_proc_entry) + return; + + stats_proc_entry->get_info = &dlm_stats_info; + stats_proc_entry->write_proc = &dlm_stats_clear; +#endif + + dlm_debug_init(); + +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks", + S_IFREG | 0400, + NULL, NULL, NULL); + if (!locks_proc_entry) + return; + locks_proc_entry->proc_fops = &locks_fops; + + dir_proc_entry = create_proc_read_entry("cluster/dlm_dir", + S_IFREG | 0400, + NULL, NULL, NULL); + if (!dir_proc_entry) + return; + dir_proc_entry->proc_fops = &dir_fops; +#endif +} + +void dlm_proc_exit(void) +{ +#ifdef CONFIG_PROC_FS + if (debug_proc_entry) { + remove_proc_entry("cluster/dlm_debug", NULL); + dlm_debug_setup(0); + } +#endif + +#ifdef CONFIG_DLM_STATS + if (stats_proc_entry) + remove_proc_entry("cluster/dlm_stats", NULL); +#endif + +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS + if (locks_proc_entry) + remove_proc_entry("cluster/dlm_locks", NULL); + if (dir_proc_entry) + remove_proc_entry("cluster/dlm_dir", NULL); +#endif +} diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/queries.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,713 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * queries.c + * + * This file provides the kernel query interface to the DLM. + * + */ + +#define EXPORT_SYMTAB +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lockqueue.h" +#include "locking.h" +#include "lkb.h" +#include "nodes.h" +#include "dir.h" +#include "ast.h" +#include "memory.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "rsb.h" + +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo); +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo); + +/* + * API entry point. 
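+ * dlm_query() returns resource and/or lock information for the resource
+ * attached to an existing lock (lksb->sb_lkid). Results are delivered
+ * asynchronously through ast_routine, so a zero return only means the
+ * query was accepted.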
+ */ +int dlm_query(void *lockspace, + struct dlm_lksb *lksb, + int query, + struct dlm_queryinfo *qinfo, + void (ast_routine(void *)), + void *astarg) +{ + int status = -EINVAL; + struct dlm_lkb *target_lkb; + struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */ + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace); + + if (!ls) + return -EINVAL; + if (!qinfo) + goto out; + if (!ast_routine) + goto out; + if (!lksb) + goto out; + + if (!qinfo->gqi_lockinfo) + qinfo->gqi_locksize = 0; + + /* Find the lkid */ + target_lkb = find_lock_by_id(ls, lksb->sb_lkid); + if (!target_lkb) + goto out; + + /* If the user wants a list of locks that are blocking or + not blocking this lock, then it must be waiting + for something + */ + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING || + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) && + target_lkb->lkb_status == GDLM_LKSTS_GRANTED) + goto out; + + /* We now allocate an LKB for our own use (so we can hang + * things like the AST routine and the lksb from it) */ + lksb->sb_status = -EBUSY; + query_lkb = create_lkb(ls); + if (!query_lkb) { + status = -ENOMEM; + goto out; + } + query_lkb->lkb_astaddr = ast_routine; + query_lkb->lkb_astparam = (long)astarg; + query_lkb->lkb_resource = target_lkb->lkb_resource; + query_lkb->lkb_lksb = lksb; + + /* Don't free the resource while we are querying it. This ref + * will be dropped when the LKB is freed */ + hold_rsb(query_lkb->lkb_resource); + + /* Fill in the stuff that's always local */ + if (qinfo->gqi_resinfo) { + if (target_lkb->lkb_resource->res_nodeid) + qinfo->gqi_resinfo->rsi_masternode = + target_lkb->lkb_resource->res_nodeid; + else + qinfo->gqi_resinfo->rsi_masternode = our_nodeid(); + qinfo->gqi_resinfo->rsi_length = + target_lkb->lkb_resource->res_length; + memcpy(qinfo->gqi_resinfo->rsi_name, + target_lkb->lkb_resource->res_name, + qinfo->gqi_resinfo->rsi_length); + } + + /* If the master is local (or the user doesn't want the overhead of a + * remote call) - fill in the details here */ + if (target_lkb->lkb_resource->res_nodeid == 0 || + (query & DLM_QUERY_LOCAL)) { + + status = 0; + /* Resource info */ + if (qinfo->gqi_resinfo) { + query_resource(target_lkb->lkb_resource, + qinfo->gqi_resinfo); + } + + /* Lock lists */ + if (qinfo->gqi_lockinfo) { + status = query_locks(query, target_lkb, qinfo); + } + + query_lkb->lkb_retstatus = status; + queue_ast(query_lkb, AST_COMP | AST_DEL, 0); + wake_astd(); + + /* An AST will be delivered so we must return success here */ + status = 0; + goto out; + } + + /* Remote master */ + if (target_lkb->lkb_resource->res_nodeid != 0) + { + struct dlm_query_request *remquery; + struct writequeue_entry *e; + + /* Clear this cos the receiving end adds to it with + each incoming packet */ + qinfo->gqi_lockcount = 0; + + /* Squirrel a pointer to the query info struct + somewhere illegal */ + query_lkb->lkb_request = (struct dlm_request *) qinfo; + + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid, + sizeof(struct dlm_query_request), + ls->ls_allocation, + (char **) &remquery); + if (!e) { + status = -ENOBUFS; + goto out; + } + + /* Build remote packet */ + memset(remquery, 0, sizeof(struct dlm_query_request)); + + remquery->rq_maxlocks = qinfo->gqi_locksize; + remquery->rq_query = query; + remquery->rq_mstlkid = target_lkb->lkb_remid; + if (qinfo->gqi_lockinfo) + remquery->rq_maxlocks = qinfo->gqi_locksize; + + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY; + remquery->rq_header.rh_flags = 0; + remquery->rq_header.rh_length = 
sizeof(struct dlm_query_request); + remquery->rq_header.rh_lkid = query_lkb->lkb_id; + remquery->rq_header.rh_lockspace = ls->ls_global_id; + + midcomms_send_buffer(&remquery->rq_header, e); + status = 0; + } + + out: + put_lockspace(ls); + return status; +} + +static inline int valid_range(struct dlm_range *r) +{ + if (r->ra_start != 0ULL || + r->ra_end != 0xFFFFFFFFFFFFFFFFULL) + return 1; + else + return 0; +} + +static void put_int(int x, char *buf, int *offp) +{ + x = cpu_to_le32(x); + memcpy(buf + *offp, &x, sizeof(int)); + *offp += sizeof(int); +} + +static void put_int64(uint64_t x, char *buf, int *offp) +{ + x = cpu_to_le64(x); + memcpy(buf + *offp, &x, sizeof(uint64_t)); + *offp += sizeof(uint64_t); +} + +static int get_int(char *buf, int *offp) +{ + int value; + memcpy(&value, buf + *offp, sizeof(int)); + *offp += sizeof(int); + return le32_to_cpu(value); +} + +static uint64_t get_int64(char *buf, int *offp) +{ + uint64_t value; + + memcpy(&value, buf + *offp, sizeof(uint64_t)); + *offp += sizeof(uint64_t); + return le64_to_cpu(value); +} + +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4) + +/* Called from recvd to get lock info for a remote node */ +int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg) +{ + struct dlm_query_request *query = (struct dlm_query_request *) msg; + struct dlm_query_reply *reply; + struct dlm_resinfo resinfo; + struct dlm_queryinfo qinfo; + struct writequeue_entry *e; + char *buf; + struct dlm_lkb *lkb; + int status = 0; + int bufidx; + int finished = 0; + int cur_lock = 0; + int start_lock = 0; + + lkb = find_lock_by_id(ls, query->rq_mstlkid); + if (!lkb) { + status = -EINVAL; + goto send_error; + } + + qinfo.gqi_resinfo = &resinfo; + qinfo.gqi_locksize = query->rq_maxlocks; + + /* Get the resource bits */ + query_resource(lkb->lkb_resource, &resinfo); + + /* Now get the locks if wanted */ + if (query->rq_maxlocks) { + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks, + GFP_KERNEL); + if (!qinfo.gqi_lockinfo) { + status = -ENOMEM; + goto send_error; + } + + status = query_locks(query->rq_query, lkb, &qinfo); + if (status && status != -E2BIG) { + kfree(qinfo.gqi_lockinfo); + goto send_error; + } + } + else { + qinfo.gqi_lockinfo = NULL; + qinfo.gqi_lockcount = 0; + } + + /* Send as many blocks as needed for all the locks */ + do { + int i; + int msg_len = sizeof(struct dlm_query_reply); + int last_msg_len = msg_len; /* keeps compiler quiet */ + int last_lock; + + /* First work out how many locks we can fit into a block */ + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) { + + last_msg_len = msg_len; + + msg_len += LOCK_LEN; + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) || + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) { + + msg_len += sizeof(uint64_t) * 4; + } + } + + /* There must be a neater way of doing this... 
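+ * (the loop above counts how many locks fit in a PAGE_SIZE message; when
+ * the last lock pushes msg_len over the limit we fall back to the
+ * previous count and length)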
*/ + if (msg_len > PAGE_SIZE) { + last_lock = i-1; + msg_len = last_msg_len; + } + else { + last_lock = i; + } + + e = lowcomms_get_buffer(nodeid, + msg_len, + ls->ls_allocation, + (char **) &reply); + if (!e) { + kfree(qinfo.gqi_lockinfo); + status = -ENOBUFS; + goto out; + } + + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY; + reply->rq_header.rh_length = msg_len; + reply->rq_header.rh_lkid = msg->rh_lkid; + reply->rq_header.rh_lockspace = msg->rh_lockspace; + + reply->rq_status = status; + reply->rq_startlock = cur_lock; + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount; + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount; + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount; + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN); + + buf = (char *)reply; + bufidx = sizeof(struct dlm_query_reply); + + for (; cur_lock < last_lock; cur_lock++) { + + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state; + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode; + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode; + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_ownpid, buf, &bufidx); + + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) || + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) { + + buf[bufidx++] = 1; + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx); + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx); + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx); + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx); + } + else { + buf[bufidx++] = 0; + } + } + + if (cur_lock == qinfo.gqi_lockcount) { + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; + finished = 1; + } + else { + reply->rq_header.rh_flags = 0; + } + + reply->rq_numlocks = cur_lock - start_lock; + start_lock = cur_lock; + + midcomms_send_buffer(&reply->rq_header, e); + } while (!finished); + + kfree(qinfo.gqi_lockinfo); + out: + return status; + + send_error: + e = lowcomms_get_buffer(nodeid, + sizeof(struct dlm_query_reply), + ls->ls_allocation, + (char **) &reply); + if (!e) { + status = -ENOBUFS; + goto out; + } + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY; + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; + reply->rq_header.rh_length = sizeof(struct dlm_query_reply); + reply->rq_header.rh_lkid = msg->rh_lkid; + reply->rq_header.rh_lockspace = msg->rh_lockspace; + reply->rq_status = status; + reply->rq_numlocks = 0; + reply->rq_startlock = 0; + reply->rq_grantcount = 0; + reply->rq_convcount = 0; + reply->rq_waitcount = 0; + + midcomms_send_buffer(&reply->rq_header, e); + + return status; +} + +/* Reply to a remote query */ +int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg) +{ + struct dlm_lkb *query_lkb; + struct dlm_queryinfo *qinfo; + struct dlm_query_reply *reply; + char *buf; + int i; + int bufidx; + + query_lkb = find_lock_by_id(ls, msg->rh_lkid); + if (!query_lkb) + return -EINVAL; + + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request; + reply = (struct dlm_query_reply *) msg; + + /* Copy the easy bits first */ + qinfo->gqi_lockcount += reply->rq_numlocks; + if (qinfo->gqi_resinfo) { + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount; + 
qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount; + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount; + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk, + DLM_LVB_LEN); + } + + /* Now unpack the locks */ + bufidx = sizeof(struct dlm_query_reply); + buf = (char *) msg; + + DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize, + printk("start = %d, num + %d. Max= %d\n", + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize);); + + for (i = reply->rq_startlock; + i < reply->rq_startlock + reply->rq_numlocks; i++) { + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++]; + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++]; + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++]; + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_ownpid = get_int(buf, &bufidx); + if (buf[bufidx++]) { + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx); + } + else { + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL; + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL; + } + } + + /* If this was the last block then now tell the user */ + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) { + query_lkb->lkb_retstatus = reply->rq_status; + queue_ast(query_lkb, AST_COMP | AST_DEL, 0); + wake_astd(); + } + + return 0; +} + +/* Aggregate resource information */ +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo) +{ + struct list_head *tmp; + + if (rsb->res_lvbptr) + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN); + + down_read(&rsb->res_lock); + resinfo->rsi_grantcount = 0; + list_for_each(tmp, &rsb->res_grantqueue) { + resinfo->rsi_grantcount++; + } + + resinfo->rsi_waitcount = 0; + list_for_each(tmp, &rsb->res_waitqueue) { + resinfo->rsi_waitcount++; + } + + resinfo->rsi_convcount = 0; + list_for_each(tmp, &rsb->res_convertqueue) { + resinfo->rsi_convcount++; + } + up_read(&rsb->res_lock); + + return 0; +} + +static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo) +{ + int entry; + + /* Don't fill it in if the buffer is full */ + if (qinfo->gqi_lockcount == qinfo->gqi_locksize) + return -E2BIG; + + /* gqi_lockcount contains the number of locks we have returned */ + entry = qinfo->gqi_lockcount++; + + /* Fun with master copies */ + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid; + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id; + } + else { + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id; + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid; + } + + /* Also make sure we always have a valid nodeid in there, the + calling end may not know which node "0" is */ + if (lkb->lkb_nodeid) + qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid; + else + qinfo->gqi_lockinfo[entry].lki_node = our_nodeid(); + + if (lkb->lkb_parent) + qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id; + else + qinfo->gqi_lockinfo[entry].lki_parent = 0; + + qinfo->gqi_lockinfo[entry].lki_state = 
lkb->lkb_status; + qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode; + qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode; + qinfo->gqi_lockinfo[entry].lki_ownpid = lkb->lkb_ownpid; + + if (lkb->lkb_range) { + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = + lkb->lkb_range[GR_RANGE_START]; + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = + lkb->lkb_range[GR_RANGE_END]; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = + lkb->lkb_range[RQ_RANGE_START]; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = + lkb->lkb_range[RQ_RANGE_END]; + } else { + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL; + } + return 0; +} + +static int query_lkb_queue(struct dlm_rsb *rsb, + struct list_head *queue, int query, + struct dlm_queryinfo *qinfo) +{ + struct list_head *tmp; + int status = 0; + int mode = query & DLM_QUERY_MODE_MASK; + + down_read(&rsb->res_lock); + list_for_each(tmp, queue) { + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + int lkmode; + + if (query & DLM_QUERY_RQMODE) + lkmode = lkb->lkb_rqmode; + else + lkmode = lkb->lkb_grmode; + + /* Add the LKB info to the list if it matches the criteria in + * the query bitmap */ + switch (query & DLM_QUERY_MASK) { + case DLM_QUERY_LOCKS_ALL: + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_HIGHER: + if (lkmode > mode) + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_EQUAL: + if (lkmode == mode) + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_LOWER: + if (lkmode < mode) + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_ORPHAN: + if (lkb->lkb_flags & GDLM_LKFLG_ORPHAN) + status = add_lock(lkb, qinfo); + break; + } + } + up_read(&rsb->res_lock); + return status; +} + +/* + * Return 1 if the locks' ranges overlap + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff + */ +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2) +{ + if (!lkb1->lkb_range || !lkb2->lkb_range) + return 1; + + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] || + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END]) + return 0; + + return 1; +} +extern const int __dlm_compat_matrix[8][8]; + + +static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo) +{ + struct list_head *tmp; + int status = 0; + + down_read(&qlkb->lkb_resource->res_lock); + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) { + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + + if (ranges_overlap(lkb, qlkb) && + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]) + status = add_lock(lkb, qinfo); + } + up_read(&qlkb->lkb_resource->res_lock); + + return status; +} + +static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo) +{ + struct list_head *tmp; + int status = 0; + + down_read(&qlkb->lkb_resource->res_lock); + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) { + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + + if (!(ranges_overlap(lkb, qlkb) && + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])) + status = add_lock(lkb, qinfo); + } + up_read(&qlkb->lkb_resource->res_lock); + + return status; +} + +/* Gather a list of appropriate locks */ +static int query_locks(int query,
struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo) +{ + int status = 0; + + + /* Mask in the actual granted/requsted mode of the lock if LOCK_THIS + * was requested as the mode + */ + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) { + query &= ~DLM_QUERY_MODE_MASK; + if (query & DLM_QUERY_RQMODE) + query |= lkb->lkb_rqmode; + else + query |= lkb->lkb_grmode; + } + + qinfo->gqi_lockcount = 0; + + /* BLOCKING/NOTBLOCK only look at the granted queue */ + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING) + return get_blocking_locks(lkb, qinfo); + + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) + return get_nonblocking_locks(lkb, qinfo); + + /* Do the lock queues that were requested */ + if (query & DLM_QUERY_QUEUE_GRANT) { + status = query_lkb_queue(lkb->lkb_resource, + &lkb->lkb_resource->res_grantqueue, + query, qinfo); + } + + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) { + status = query_lkb_queue(lkb->lkb_resource, + &lkb->lkb_resource->res_convertqueue, + query, qinfo); + } + + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) { + status = query_lkb_queue(lkb->lkb_resource, + &lkb->lkb_resource->res_waitqueue, + query, qinfo); + } + + + return status; +} + +EXPORT_SYMBOL(dlm_query); +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/queries.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,20 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __QUERIES_DOT_H__ +#define __QUERIES_DOT_H__ + +extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg); +extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg); + +#endif /* __QUERIES_DOT_H__ */ diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rebuild.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1280 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +/* + * Rebuild RSB's on new masters. Functions for transferring locks and + * subresources to new RSB masters during recovery. + */ + +#include "dlm_internal.h" +#include "reccomms.h" +#include "lkb.h" +#include "rsb.h" +#include "nodes.h" +#include "config.h" +#include "memory.h" +#include "recover.h" + + +/* Types of entity serialised in remastering messages */ +#define REMASTER_ROOTRSB 1 +#define REMASTER_RSB 2 +#define REMASTER_LKB 3 + +struct rcom_fill { + char * outbuf; /* Beginning of data */ + int offset; /* Current offset into outbuf */ + int maxlen; /* Max value of offset */ + int remasterid; + int count; + struct dlm_rsb * rsb; + struct dlm_rsb * subrsb; + struct dlm_lkb * lkb; + struct list_head * lkbqueue; + char more; +}; +typedef struct rcom_fill rcom_fill_t; + + +struct rebuild_node { + struct list_head list; + int nodeid; + struct dlm_rsb * rootrsb; +}; +typedef struct rebuild_node rebuild_node_t; + + +/* + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new + * master. The rsb will be "done" with recovery when the new master has + * replied with all the new remote lockid's for this rsb's lkb's. + */ + +void expect_new_lkids(struct dlm_rsb *rsb) +{ + rsb->res_newlkid_expect = 0; + recover_list_add(rsb); +} + +/* + * This function is called on root rsb or subrsb when another lkb is being sent + * to the new master for which we expect to receive a corresponding remote lkid + */ + +void need_new_lkid(struct dlm_rsb *rsb) +{ + struct dlm_rsb *root = rsb; + + if (rsb->res_parent) + root = rsb->res_root; + + if (!root->res_newlkid_expect) + recover_list_add(root); + else + DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),); + + root->res_newlkid_expect++; +} + +/* + * This function is called for each lkb for which a new remote lkid is + * received. Decrement the expected number of remote lkids expected for the + * root rsb. + */ + +void have_new_lkid(struct dlm_lkb *lkb) +{ + struct dlm_rsb *root = lkb->lkb_resource; + + if (root->res_parent) + root = root->res_root; + + down_write(&root->res_lock); + + DLM_ASSERT(root->res_newlkid_expect, + printk("newlkid_expect=%d\n", root->res_newlkid_expect);); + + root->res_newlkid_expect--; + + if (!root->res_newlkid_expect) { + clear_bit(RESFL_NEW_MASTER, &root->res_flags); + recover_list_del(root); + } + up_write(&root->res_lock); +} + +/* + * Return the rebuild struct for a node - will create an entry on the rootrsb + * list if necessary. + * + * Currently no locking is needed here as it all happens in the dlm_recvd + * thread + */ + +static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid) +{ + rebuild_node_t *node = NULL; + + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) { + if (node->nodeid == nodeid) + return node; + } + + /* Not found, add one */ + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL); + if (!node) + return NULL; + + node->nodeid = nodeid; + node->rootrsb = NULL; + list_add(&node->list, &ls->ls_rebuild_rootrsb_list); + + return node; +} + +/* + * Tidy up after a rebuild run. 
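+ * All rebuild_node entries left on ls_rebuild_rootrsb_list are freed.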
Called when all recovery has finished + */ + +void rebuild_freemem(struct dlm_ls *ls) +{ + rebuild_node_t *node = NULL, *s; + + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) { + list_del(&node->list); + kfree(node); + } +} + +static void put_int(int x, char *buf, int *offp) +{ + x = cpu_to_le32(x); + memcpy(buf + *offp, &x, sizeof(int)); + *offp += sizeof(int); +} + +static void put_int64(uint64_t x, char *buf, int *offp) +{ + x = cpu_to_le64(x); + memcpy(buf + *offp, &x, sizeof(uint64_t)); + *offp += sizeof(uint64_t); +} + +static void put_bytes(char *x, int len, char *buf, int *offp) +{ + put_int(len, buf, offp); + memcpy(buf + *offp, x, len); + *offp += len; +} + +static void put_char(char x, char *buf, int *offp) +{ + buf[*offp] = x; + *offp += 1; +} + +static int get_int(char *buf, int *offp) +{ + int value; + memcpy(&value, buf + *offp, sizeof(int)); + *offp += sizeof(int); + return le32_to_cpu(value); +} + +static uint64_t get_int64(char *buf, int *offp) +{ + uint64_t value; + + memcpy(&value, buf + *offp, sizeof(uint64_t)); + *offp += sizeof(uint64_t); + return le64_to_cpu(value); +} + +static char get_char(char *buf, int *offp) +{ + char x = buf[*offp]; + + *offp += 1; + return x; +} + +static void get_bytes(char *bytes, int *len, char *buf, int *offp) +{ + *len = get_int(buf, offp); + memcpy(bytes, buf + *offp, *len); + *offp += *len; +} + +static int lkb_length(struct dlm_lkb *lkb) +{ + int len = 0; + + len += sizeof(int); /* lkb_id */ + len += sizeof(int); /* lkb_resource->res_reamasterid */ + len += sizeof(int); /* lkb_flags */ + len += sizeof(int); /* lkb_status */ + len += sizeof(char); /* lkb_rqmode */ + len += sizeof(char); /* lkb_grmode */ + len += sizeof(int); /* lkb_childcnt */ + len += sizeof(int); /* lkb_parent->lkb_id */ + len += sizeof(int); /* lkb_bastaddr */ + len += sizeof(int); /* lkb_ownpid */ + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + len += sizeof(int); /* number of lvb bytes */ + len += DLM_LVB_LEN; + } + + if (lkb->lkb_range) { + len += sizeof(uint64_t); + len += sizeof(uint64_t); + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) { + len += sizeof(uint64_t); + len += sizeof(uint64_t); + } + } + + return len; +} + +/* + * It's up to the caller to be sure there's enough space in the buffer. 
+ */ + +static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp) +{ + int flags; + + /* Need to tell the remote end if we have a range */ + flags = lkb->lkb_flags; + if (lkb->lkb_range) + flags |= GDLM_LKFLG_RANGE; + + /* + * See lkb_length() + * Total: 30 (no lvb) or 66 (with lvb) bytes + */ + + put_int(lkb->lkb_id, buf, offp); + put_int(lkb->lkb_resource->res_remasterid, buf, offp); + put_int(flags, buf, offp); + put_int(lkb->lkb_status, buf, offp); + put_char(lkb->lkb_rqmode, buf, offp); + put_char(lkb->lkb_grmode, buf, offp); + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp); + + if (lkb->lkb_parent) + put_int(lkb->lkb_parent->lkb_id, buf, offp); + else + put_int(0, buf, offp); + + if (lkb->lkb_bastaddr) + put_int(1, buf, offp); + else + put_int(0, buf, offp); + put_int(lkb->lkb_ownpid, buf, offp); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + DLM_ASSERT(lkb->lkb_lvbptr,); + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp); + } + + /* Only send the range we actually need */ + if (lkb->lkb_range) { + switch (lkb->lkb_status) { + case GDLM_LKSTS_CONVERT: + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp); + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp); + break; + case GDLM_LKSTS_WAITING: + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp); + break; + case GDLM_LKSTS_GRANTED: + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp); + break; + default: + DLM_ASSERT(0,); + } + } +} + +static int rsb_length(struct dlm_rsb *rsb) +{ + int len = 0; + + len += sizeof(int); /* number of res_name bytes */ + len += rsb->res_length; /* res_name */ + len += sizeof(int); /* res_remasterid */ + len += sizeof(int); /* res_parent->res_remasterid */ + + return len; +} + +static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb) +{ + struct list_head *tmp; + struct dlm_rsb *r; + + tmp = subrsb->res_subreslist.next; + r = list_entry(tmp, struct dlm_rsb, res_subreslist); + + return r; +} + +static inline int last_in_list(struct dlm_rsb *r, struct list_head *head) +{ + struct dlm_rsb *last; + last = list_entry(head->prev, struct dlm_rsb, res_subreslist); + if (last == r) + return 1; + return 0; +} + +static int lkbs_to_remaster_list(struct list_head *head) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) + continue; + return TRUE; + } + return FALSE; +} + +/* + * Used to decide if an rsb should be rebuilt on a new master. An rsb only + * needs to be rebuild if we have lkb's queued on it. NOREBUILD lkb's are not + * rebuilt. 
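+ * Subresources are checked as well since their lkb's are sent along with
+ * the root rsb.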
+ */ + +static int lkbs_to_remaster(struct dlm_rsb *r) +{ + struct dlm_rsb *sub; + + if (lkbs_to_remaster_list(&r->res_grantqueue)) + return TRUE; + if (lkbs_to_remaster_list(&r->res_convertqueue)) + return TRUE; + if (lkbs_to_remaster_list(&r->res_waitqueue)) + return TRUE; + + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) { + if (lkbs_to_remaster_list(&sub->res_grantqueue)) + return TRUE; + if (lkbs_to_remaster_list(&sub->res_convertqueue)) + return TRUE; + if (lkbs_to_remaster_list(&sub->res_waitqueue)) + return TRUE; + } + + return FALSE; +} + +static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp) +{ + /* + * See rsb_length() + * Total: 36 bytes (4 + 24 + 4 + 4) + */ + + put_bytes(rsb->res_name, rsb->res_length, buf, offp); + put_int(rsb->res_remasterid, buf, offp); + + if (rsb->res_parent) + put_int(rsb->res_parent->res_remasterid, buf, offp); + else + put_int(0, buf, offp); + + DLM_ASSERT(!rsb->res_lvbptr,); +} + +/* + * Flatten an LKB into a buffer for sending to the new RSB master. As a + * side-effect the nodeid of the lock is set to the nodeid of the new RSB + * master. + */ + +static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, + rcom_fill_t *fill) +{ + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen) + goto nospace; + + lkb->lkb_nodeid = r->res_nodeid; + + put_char(REMASTER_LKB, fill->outbuf, &fill->offset); + serialise_lkb(lkb, fill->outbuf, &fill->offset); + + fill->count++; + need_new_lkid(r); + return 0; + + nospace: + return -ENOSPC; +} + +/* + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag. + */ + +static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue, + rcom_fill_t *fill) +{ + struct dlm_lkb *lkb; + int error; + + list_for_each_entry(lkb, queue, lkb_statequeue) { + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) + continue; + + error = pack_one_lkb(r, lkb, fill); + if (error) + goto nospace; + } + + return 0; + + nospace: + fill->lkb = lkb; + fill->lkbqueue = queue; + + return error; +} + +static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill) +{ + int error; + + error = pack_lkb_queue(r, &r->res_grantqueue, fill); + if (error) + goto nospace; + + error = pack_lkb_queue(r, &r->res_convertqueue, fill); + if (error) + goto nospace; + + error = pack_lkb_queue(r, &r->res_waitqueue, fill); + + nospace: + return error; +} + +/* + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb + * queue and full lkb queues. + */ + +static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill) +{ + struct list_head *tmp, *start, *end; + struct dlm_lkb *lkb; + int error; + + /* + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue. + */ + + error = pack_one_lkb(r, fill->lkb, fill); + if (error) + goto out; + + start = fill->lkb->lkb_statequeue.next; + end = fill->lkbqueue; + + for (tmp = start; tmp != end; tmp = tmp->next) { + lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + + error = pack_one_lkb(r, lkb, fill); + if (error) { + fill->lkb = lkb; + goto out; + } + } + + /* + * Pack all lkb's on r's queues following fill->lkbqueue. 
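+ * Queues are always packed in grant, convert, wait order, so only the
+ * queues following the one we stopped in still need to be packed.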
+ */ + + if (fill->lkbqueue == &r->res_waitqueue) + goto out; + if (fill->lkbqueue == &r->res_convertqueue) + goto skip; + + DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,); + + error = pack_lkb_queue(r, &r->res_convertqueue, fill); + if (error) + goto out; + skip: + error = pack_lkb_queue(r, &r->res_waitqueue, fill); + + out: + return error; +} + +static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb, + rcom_fill_t *fill) +{ + int error; + + down_write(&subrsb->res_lock); + + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen) + goto nospace; + + subrsb->res_nodeid = rsb->res_nodeid; + subrsb->res_remasterid = ++fill->remasterid; + + put_char(REMASTER_RSB, fill->outbuf, &fill->offset); + serialise_rsb(subrsb, fill->outbuf, &fill->offset); + + error = pack_lkb_queues(subrsb, fill); + if (error) + goto nospace; + + up_write(&subrsb->res_lock); + + return 0; + + nospace: + up_write(&subrsb->res_lock); + fill->subrsb = subrsb; + + return -ENOSPC; +} + +static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb, + rcom_fill_t *fill) +{ + struct dlm_rsb *subrsb; + int error = 0; + + /* + * When an initial subrsb is given, we know it needs to be packed. + * When no initial subrsb is given, begin with the first (if any exist). + */ + + if (!in_subrsb) { + if (list_empty(&rsb->res_subreslist)) + goto out; + + subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb, + res_subreslist); + } else + subrsb = in_subrsb; + + for (;;) { + error = pack_one_subrsb(rsb, subrsb, fill); + if (error) + goto out; + + if (last_in_list(subrsb, &rsb->res_subreslist)) + break; + + subrsb = next_subrsb(subrsb); + } + + out: + return error; +} + +/* + * Finish packing whatever is left in an rsb tree. If space runs out while + * finishing, save subrsb/lkb and this will be called again for the same rsb. + * + * !subrsb && lkb, we left off part way through root rsb's lkbs. + * subrsb && !lkb, we left off just before starting a new subrsb. + * subrsb && lkb, we left off part way through a subrsb's lkbs. + * !subrsb && !lkb, we shouldn't be in this function, but starting + * a new rsb in pack_rsb_tree(). + */ + +static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb, + rcom_fill_t *fill) +{ + struct dlm_rsb *subrsb = NULL; + int error = 0; + + if (!fill->subrsb && fill->lkb) { + error = pack_lkb_remaining(rsb, fill); + if (error) + goto out; + + error = pack_subrsbs(rsb, NULL, fill); + if (error) + goto out; + } + + else if (fill->subrsb && !fill->lkb) { + error = pack_subrsbs(rsb, fill->subrsb, fill); + if (error) + goto out; + } + + else if (fill->subrsb && fill->lkb) { + error = pack_lkb_remaining(fill->subrsb, fill); + if (error) + goto out; + + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist)) + goto out; + + subrsb = next_subrsb(fill->subrsb); + + error = pack_subrsbs(rsb, subrsb, fill); + if (error) + goto out; + } + + fill->subrsb = NULL; + fill->lkb = NULL; + + out: + return error; +} + +/* + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a + * buffer. When the buffer runs out of space, save the place to restart (the + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit). + */ + +static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb, + rcom_fill_t *fill) +{ + int error = -ENOSPC; + + fill->remasterid = 0; + + /* + * Pack the root rsb itself. A 1 byte type precedes the serialised + * rsb. Then pack the lkb's for the root rsb. 
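+	 * If even the root rsb doesn't fit in the buffer, -ENOSPC is returned
+	 * and the caller retries the same rsb with the next buffer.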
+ */ + + down_write(&rsb->res_lock); + + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen) + goto out; + + rsb->res_remasterid = ++fill->remasterid; + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset); + serialise_rsb(rsb, fill->outbuf, &fill->offset); + + error = pack_lkb_queues(rsb, fill); + if (error) + goto out; + + up_write(&rsb->res_lock); + + /* + * Pack subrsb/lkb's under the root rsb. + */ + + error = pack_subrsbs(rsb, NULL, fill); + + return error; + + out: + up_write(&rsb->res_lock); + return error; +} + +/* + * Given an RSB, return the next RSB that should be sent to a new master. + */ + +static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls, + struct dlm_rsb *rsb) +{ + struct list_head *tmp, *start, *end; + struct dlm_rsb *r; + + if (!rsb) + start = ls->ls_rootres.next; + else + start = rsb->res_rootlist.next; + + end = &ls->ls_rootres; + + for (tmp = start; tmp != end; tmp = tmp->next) { + r = list_entry(tmp, struct dlm_rsb, res_rootlist); + + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) { + if (r->res_nodeid && lkbs_to_remaster(r)) { + expect_new_lkids(r); + return r; + } else + clear_bit(RESFL_NEW_MASTER, &r->res_flags); + } + } + + return NULL; +} + +/* + * Given an rcom buffer, fill it with RSB's that need to be sent to a single + * new master node. In the case where all the data to send to one node + * requires multiple messages, this function needs to resume filling each + * successive buffer from the point where it left off when the previous buffer + * filled up. + */ + +static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill, + uint32_t *nodeid) +{ + struct dlm_rsb *rsb, *prev_rsb = fill->rsb; + int error; + + fill->offset = 0; + + if (!prev_rsb) { + + /* + * The first time this function is called. + */ + + rsb = next_remastered_rsb(ls, NULL); + if (!rsb) + goto no_more; + + } else if (fill->subrsb || fill->lkb) { + + /* + * Continue packing an rsb tree that was partially packed last + * time (fill->subrsb/lkb indicates where packing of last block + * left off) + */ + + rsb = prev_rsb; + *nodeid = rsb->res_nodeid; + + error = pack_rsb_tree_remaining(ls, rsb, fill); + if (error == -ENOSPC) + goto more; + + rsb = next_remastered_rsb(ls, prev_rsb); + if (!rsb) + goto no_more; + + if (rsb->res_nodeid != prev_rsb->res_nodeid) + goto more; + } else { + rsb = prev_rsb; + } + + /* + * Pack rsb trees into the buffer until we run out of space, run out of + * new rsb's or hit a new nodeid. + */ + + *nodeid = rsb->res_nodeid; + + for (;;) { + error = pack_rsb_tree(ls, rsb, fill); + if (error == -ENOSPC) + goto more; + + prev_rsb = rsb; + + rsb = next_remastered_rsb(ls, prev_rsb); + if (!rsb) + goto no_more; + + if (rsb->res_nodeid != prev_rsb->res_nodeid) + goto more; + } + + more: + fill->more = 1; + fill->rsb = rsb; + return; + + no_more: + fill->more = 0; +} + +/* + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters. 
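+ * Buffers are filled and sent as RECCOMM_NEWLOCKS messages, one target node
+ * at a time, until fill_rcom_buffer() reports nothing more to send; we then
+ * wait for the new lock ids to come back (recover_list_empty) before
+ * recovery continues.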
+ */ + +int rebuild_rsbs_send(struct dlm_ls *ls) +{ + struct dlm_rcom *rc; + rcom_fill_t fill; + uint32_t nodeid; + int error; + + DLM_ASSERT(recover_list_empty(ls),); + + log_all(ls, "rebuild locks"); + + error = -ENOMEM; + rc = allocate_rcom_buffer(ls); + if (!rc) + goto ret; + + down_read(&ls->ls_root_lock); + + error = 0; + memset(&fill, 0, sizeof(rcom_fill_t)); + fill.outbuf = rc->rc_buf; + fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); + + do { + fill_rcom_buffer(ls, &fill, &nodeid); + if (!fill.offset) + break; + + rc->rc_datalen = fill.offset; + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0); + if (error) { + up_read(&ls->ls_root_lock); + goto out; + } + + schedule(); + error = dlm_recovery_stopped(ls); + if (error) { + up_read(&ls->ls_root_lock); + goto out; + } + } + while (fill.more); + + up_read(&ls->ls_root_lock); + + error = dlm_wait_function(ls, &recover_list_empty); + + log_all(ls, "rebuilt %d locks", fill.count); + + out: + free_rcom_buffer(rc); + + ret: + return error; +} + +static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid, + struct dlm_rsb *rootrsb) +{ + struct dlm_rsb *rsb; + + DLM_ASSERT(rootrsb,); + + if (rootrsb->res_remasterid == remasterid) { + rsb = rootrsb; + goto out; + } + + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) { + if (rsb->res_remasterid == remasterid) + goto out; + } + rsb = NULL; + + out: + return rsb; +} + +/* + * Search a queue for the given remote lock id (remlkid). + */ + +static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid, + int remid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, statequeue, lkb_statequeue) { + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) { + return lkb; + } + } + + return NULL; +} + +/* + * Given a remote lock ID (and a parent resource), return the local LKB for it + * Hopefully we dont need to do this too often on deep lock trees. This is + * VERY suboptimal for anything but the smallest lock trees. It searches the + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and + * returns the LKB address. 
OPTIMISATION: we should keep a list of these while + * we are building up the remastered LKBs + */ + +static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid, + int remid) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid); + if (lkb) + goto out; + + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) { + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid); + if (lkb) + goto out; + } + lkb = NULL; + + out: + return lkb; +} + +/* + * Unpack an LKB from a remaster operation + */ + +static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid, + struct dlm_rsb *rootrsb, char *buf, int *ptr, + char *outbuf, int *outoffp) +{ + struct dlm_lkb *lkb, *exist_lkb = NULL; + struct dlm_rsb *rsb; + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp; + + remote_lkid = get_int(buf, ptr); + + rsb_rmid = get_int(buf, ptr); + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb); + DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid);); + + /* + * We could have received this lkb already from a previous recovery + * that was interrupted. We still need to advance ptr so read in + * lkb and then release it. FIXME: verify this is valid. + */ + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid); + if (lkb) { + log_all(ls, "lkb %x exists %s", remote_lkid, rsb->res_name); + exist_lkb = lkb; + } + + lkb = create_lkb(ls); + if (!lkb) + goto out; + + lkb->lkb_remid = remote_lkid; + lkb->lkb_flags = get_int(buf, ptr); + status = get_int(buf, ptr); + lkb->lkb_rqmode = get_char(buf, ptr); + lkb->lkb_grmode = get_char(buf, ptr); + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr)); + + parentid = get_int(buf, ptr); + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr); + lkb->lkb_ownpid = get_int(buf, ptr); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + lkb->lkb_lvbptr = allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + goto out; + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr); + } + + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) { + uint64_t start, end; + + /* Don't need to keep the range flag, for comms use only */ + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE; + start = get_int64(buf, ptr); + end = get_int64(buf, ptr); + + lkb->lkb_range = allocate_range(ls); + if (!lkb->lkb_range) + goto out; + + switch (status) { + case GDLM_LKSTS_CONVERT: + lkb->lkb_range[RQ_RANGE_START] = start; + lkb->lkb_range[RQ_RANGE_END] = end; + start = get_int64(buf, ptr); + end = get_int64(buf, ptr); + lkb->lkb_range[GR_RANGE_START] = start; + lkb->lkb_range[GR_RANGE_END] = end; + + case GDLM_LKSTS_WAITING: + lkb->lkb_range[RQ_RANGE_START] = start; + lkb->lkb_range[RQ_RANGE_END] = end; + break; + + case GDLM_LKSTS_GRANTED: + lkb->lkb_range[GR_RANGE_START] = start; + lkb->lkb_range[GR_RANGE_END] = end; + break; + default: + DLM_ASSERT(0,); + } + } + + if (exist_lkb) { + /* verify lkb and exist_lkb values match? 
*/ + release_lkb(ls, lkb); + lkb = exist_lkb; + goto put_lkid; + } + + /* Resolve local lock LKB address from parent ID */ + if (parentid) + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid, + parentid); + + atomic_inc(&rsb->res_ref); + lkb->lkb_resource = rsb; + + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY; + lkb->lkb_nodeid = rem_nodeid; + + /* + * Put the lkb on an RSB queue. An lkb that's in the midst of a + * conversion request (on the requesting node's lockqueue and has + * LQCONVERT set) should be put on the granted queue. The convert + * request will be resent by the requesting node. + */ + + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) { + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT; + DLM_ASSERT(status == GDLM_LKSTS_CONVERT, + printk("status=%d\n", status);); + lkb->lkb_rqmode = DLM_LOCK_IV; + status = GDLM_LKSTS_GRANTED; + } + + lkb_enqueue(rsb, lkb, status); + + /* + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL). + */ + + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK) + && lkb->lkb_grmode > DLM_LOCK_NL) { + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(ls); + if (!rsb->res_lvbptr) + goto out; + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + } + + /* + * Clear flags that may have been sent over that are only relevant in + * the context of the sender. + */ + + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND | + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED); + + put_lkid: + /* Return the new LKID to the caller's buffer */ + put_int(lkb->lkb_id, outbuf, outoffp); + put_int(lkb->lkb_remid, outbuf, outoffp); + error = 0; + + out: + return error; +} + +static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid, + struct dlm_rsb *rootrsb, char *buf, + int *ptr) +{ + int length; + int remasterid; + int parent_remasterid; + char name[DLM_RESNAME_MAXLEN]; + int error; + struct dlm_rsb *parent = NULL; + struct dlm_rsb *rsb; + + get_bytes(name, &length, buf, ptr); + remasterid = get_int(buf, ptr); + parent_remasterid = get_int(buf, ptr); + + if (parent_remasterid) + parent = find_by_remasterid(ls, parent_remasterid, rootrsb); + + /* + * The rsb reference from this find_or_create_rsb() will keep the rsb + * around while we add new lkb's to it from deserialise_lkb. Each of + * the lkb's will add an rsb reference. The reference added here is + * removed by release_rsb() after all lkb's are added. + */ + + error = find_rsb(ls, parent, name, length, CREATE, &rsb); + DLM_ASSERT(!error,); + + set_bit(RESFL_MASTER, &rsb->res_flags); + + /* There is a case where the above needs to create the RSB. */ + if (rsb->res_nodeid == -1) + rsb->res_nodeid = our_nodeid(); + + rsb->res_remasterid = remasterid; + + return rsb; +} + +/* + * Processing at the receiving end of a NEWLOCKS message from a node in + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote + * node whose locks we are now mastering. For a reply we need to send back the + * new lockids of the remastered locks so that remote ops can find them. + */ + +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len) +{ + struct dlm_rcom *rc; + struct dlm_rsb *rsb = NULL; + rebuild_node_t *rnode; + char *outbuf; + int outptr, ptr = 0, error = -ENOMEM; + + rnode = find_rebuild_root(ls, nodeid); + if (!rnode) + goto out; + + /* + * Allocate a buffer for the reply message which is a list of remote + * lock IDs and their (new) local lock ids. It will always be big + * enough to fit ID pairs if it already fit LKBs. 
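+	 * (Each serialised lkb is at least 30 bytes but contributes only two
+	 * 4-byte ids to the reply.)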
+ */ + + rc = allocate_rcom_buffer(ls); + if (!rc) + goto out; + outbuf = rc->rc_buf; + outptr = 0; + + /* + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're + * created. Each deserialise_rsb adds an rsb reference that must be + * removed with release_rsb once all new lkb's for an rsb have been + * added. + */ + + while (ptr < len) { + int type; + + type = get_char(buf, &ptr); + + switch (type) { + case REMASTER_ROOTRSB: + if (rsb) + release_rsb(rsb); + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf, + &ptr); + rnode->rootrsb = rsb; + break; + + case REMASTER_RSB: + if (rsb) + release_rsb(rsb); + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf, + &ptr); + break; + + case REMASTER_LKB: + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr, + outbuf, &outptr); + break; + + default: + DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d " + "len=%d\n", type, nodeid, ptr, + len);); + } + } + + if (rsb) + release_rsb(rsb); + + /* + * Reply with the new lock IDs. + */ + + rc->rc_datalen = outptr; + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0); + + free_rcom_buffer(rc); + + out: + return error; +} + +/* + * Processing for a NEWLOCKIDS message. Called when we get the reply from the + * new master telling us what the new remote lock IDs are for the remastered + * locks + */ + +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len) +{ + int offset = 0; + + if (len == 1) + len = 0; + + while (offset < len) { + int remote_id; + int local_id; + struct dlm_lkb *lkb; + + if (offset + 8 > len) { + log_error(ls, "rebuild_rsbs_lkids_recv: bad data " + "length nodeid=%d offset=%d len=%d", + nodeid, offset, len); + break; + } + + remote_id = get_int(buf, &offset); + local_id = get_int(buf, &offset); + + lkb = find_lock_by_id(ls, local_id); + if (lkb) { + lkb->lkb_remid = remote_id; + have_new_lkid(lkb); + } else { + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid " + "nodeid=%d id=%x remid=%x offset=%d len=%d", + nodeid, local_id, remote_id, offset, len); + } + } + + if (recover_list_empty(ls)) + wake_up(&ls->ls_wait_general); + + return 0; +} diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rebuild.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __REBUILD_DOT_H__ +#define __REBUILD_DOT_H__ + +int rebuild_rsbs_send(struct dlm_ls *ls); +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len); +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len); +int rebuild_freemem(struct dlm_ls *ls); + +#endif /* __REBUILD_DOT_H__ */ diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/reccomms.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,447 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "reccomms.h" +#include "nodes.h" +#include "lockspace.h" +#include "recover.h" +#include "dir.h" +#include "config.h" +#include "rebuild.h" +#include "memory.h" + +/* Running on the basis that only a single recovery communication will be done + * at a time per lockspace */ + +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc); + +static int rcom_response(struct dlm_ls *ls) +{ + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags); +} + +/** + * rcom_send_message - send or request recovery data + * @ls: the lockspace + * @nodeid: node to which the message is sent + * @type: type of recovery message + * @rc: the rc buffer to send + * @need_reply: wait for reply if this is set + * + * Using this interface + * i) Allocate an rc buffer: + * rc = allocate_rcom_buffer(ls); + * ii) Copy data to send beginning at rc->rc_buf: + * memcpy(rc->rc_buf, mybuf, mylen); + * iii) Set rc->rc_datalen to the number of bytes copied in (ii): + * rc->rc_datalen = mylen + * iv) Submit the rc to this function: + * rcom_send_message(rc); + * + * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct + * dlm_rcom). If more data must be passed in one send, use + * rcom_expand_buffer() which incrementally increases the size of the rc buffer + * by dlm_config.buffer_size bytes. + * + * Any data returned for the message (when need_reply is set) will saved in + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the + * number of bytes copied into rc->rc_buf. + * + * Returns: 0 on success, -EXXX on failure + */ + +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type, + struct dlm_rcom *rc, int need_reply) +{ + int error = 0; + + if (!rc->rc_datalen) + rc->rc_datalen = 1; + + /* + * Fill in the header. 
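+	 * rh_length counts the struct plus the payload minus one, because the
+	 * first payload byte is the rc_buf byte inside struct dlm_rcom itself
+	 * (presumably also why rc_datalen is forced to at least 1 above).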
+ */ + + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE; + rc->rc_header.rh_lockspace = ls->ls_global_id; + rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1; + rc->rc_subcmd = type; + rc->rc_msgid = ++ls->ls_rcom_msgid; + + /* + * When a reply is received, the reply data goes back into this buffer. + * Synchronous rcom requests (need_reply=1) are serialised because of + * the single ls_rcom. + */ + + if (need_reply) { + down(&ls->ls_rcom_lock); + ls->ls_rcom = rc; + } + + /* + * After sending the message we'll wait at the end of this function to + * get a reply. The READY flag will be set when the reply has been + * received and requested data has been copied into + * ls->ls_rcom->rc_buf; + */ + + DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),); + + /* + * The WAIT bit indicates that we're waiting for and willing to accept a + * reply. Any replies are ignored unless this bit is set. + */ + + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags); + + /* + * Process the message locally. + */ + + if (nodeid == our_nodeid()) { + rcom_process_message(ls, nodeid, rc); + goto out; + } + + /* + * Send the message. + */ + + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid); + + error = midcomms_send_message(nodeid, (struct dlm_header *) rc, + GFP_KERNEL); + DLM_ASSERT(error >= 0, printk("error = %d\n", error);); + error = 0; + + /* + * Wait for a reply. Once a reply is processed from midcomms, the + * READY bit will be set and we'll be awoken (dlm_wait_function will + * return 0). + */ + + if (need_reply) { + error = dlm_wait_function(ls, &rcom_response); + if (error) + log_debug(ls, "rcom wait error %d", error); + } + + out: + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags); + + if (need_reply) + up(&ls->ls_rcom_lock); + + return error; +} + +/* + * Runs in same context as midcomms. 
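+ * Replies normally go back through midcomms; when the request originated on
+ * this node the reply data is copied straight into ls_rcom instead.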
+ */ + +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc) +{ + struct dlm_rcom rc_stack; + struct dlm_rcom *reply = NULL; + int status, datalen, maxlen; + uint32_t r_nodeid, be_nodeid; + + if (!ls) + return; + + if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) { + log_error(ls, "ignoring recovery message %x from %u", + rc->rc_subcmd, nodeid); + return; + } + + switch (rc->rc_subcmd) { + + case RECCOMM_STATUS: + + memset(&rc_stack, 0, sizeof(struct dlm_rcom)); + reply = &rc_stack; + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + reply->rc_buf[0] = 0; + + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= RESDIR_VALID; + + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= RESDIR_ALL_VALID; + + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= NODES_VALID; + + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= NODES_ALL_VALID; + + reply->rc_datalen = 1; + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid); + break; + + case RECCOMM_RECOVERNAMES: + + reply = allocate_rcom_buffer(ls); + DLM_ASSERT(reply,); + maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + + /* + * The other node wants a bunch of resource names. The name of + * the resource to begin with is in rc->rc_buf. + */ + + datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen, + reply->rc_buf, maxlen, nodeid); + + reply->rc_datalen = datalen; + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid, + reply->rc_msgid); + break; + + case RECCOMM_GETMASTER: + + reply = allocate_rcom_buffer(ls); + DLM_ASSERT(reply,); + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + + /* + * The other node wants to know the master of a named resource. 
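+		 * The reply is the master's nodeid as a single big-endian
+		 * uint32_t.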
+ */ + + status = dlm_dir_lookup(ls, nodeid, rc->rc_buf, rc->rc_datalen, + &r_nodeid); + if (status != 0) { + log_all(ls, "rcom lookup error %d", status); + free_rcom_buffer(reply); + reply = NULL; + return; + } + be_nodeid = cpu_to_be32(r_nodeid); + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t)); + reply->rc_datalen = sizeof(uint32_t); + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + break; + + case RECCOMM_BULKLOOKUP: + + reply = allocate_rcom_buffer(ls); + DLM_ASSERT(reply,); + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + + /* + * This is a bulk version of the above and just returns a + * buffer full of node ids to match the resources + */ + + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf, + rc->rc_datalen, reply->rc_buf); + if (datalen < 0) { + free_rcom_buffer(reply); + reply = NULL; + return; + } + + reply->rc_datalen = datalen; + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + break; + + /* + * These RECCOMM messages don't need replies. + */ + + case RECCOMM_NEWLOCKS: + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen); + break; + + case RECCOMM_NEWLOCKIDS: + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen); + break; + + case RECCOMM_REMRESDATA: + dlm_dir_remove(ls, nodeid, rc->rc_buf, rc->rc_datalen); + break; + + default: + DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd);); + } + + if (reply) { + if (nodeid == our_nodeid()) { + DLM_ASSERT(rc == ls->ls_rcom,); + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen); + rc->rc_datalen = reply->rc_datalen; + } else { + midcomms_send_message(nodeid, + (struct dlm_header *) reply, + GFP_KERNEL); + } + + if (reply != &rc_stack) + free_rcom_buffer(reply); + } +} + +static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid, + struct dlm_rcom *reply) +{ + struct dlm_rcom *rc = ls->ls_rcom; + + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) { + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid); + return; + } + + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) { + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u", + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid); + return; + } + + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen); + rc->rc_datalen = reply->rc_datalen; + + /* + * Tell the thread waiting in rcom_send_message() that it can go ahead. + */ + + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags); + wake_up(&ls->ls_wait_general); +} + +static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid, + struct dlm_rcom *reply) +{ + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen, + reply->rc_msgid); +} + +/* + * Runs in same context as midcomms. 
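+ * GETMASTER replies are handled asynchronously; STATUS, RECOVERNAMES,
+ * NEWLOCKS and NEWLOCKIDS replies wake the thread waiting in
+ * rcom_send_message().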
+ */ + +static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid, + struct dlm_rcom *reply) +{ + if (dlm_recovery_stopped(ls)) { + log_error(ls, "ignoring recovery reply %x from %u", + reply->rc_subcmd, nodeid); + return; + } + + switch (reply->rc_subcmd) { + case RECCOMM_GETMASTER: + process_reply_async(ls, nodeid, reply); + break; + case RECCOMM_STATUS: + case RECCOMM_NEWLOCKS: + case RECCOMM_NEWLOCKIDS: + case RECCOMM_RECOVERNAMES: + process_reply_sync(ls, nodeid, reply); + break; + default: + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u", + reply->rc_subcmd, nodeid); + } +} + + +static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header) +{ + struct writequeue_entry *wq; + struct dlm_rcom *rc = (struct dlm_rcom *) header; + struct dlm_rcom *reply; + + wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL, + (char **)&reply); + if (!wq) + return -ENOMEM; + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + reply->rc_buf[0] = 0; + + reply->rc_datalen = 1; + reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + + midcomms_send_buffer((struct dlm_header *)reply, wq); + return 0; +} + + +/* + * Runs in same context as midcomms. Both recovery requests and recovery + * replies come through this function. + */ + +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header) +{ + struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace); + struct dlm_rcom *rc = (struct dlm_rcom *) header; + + /* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + + if (!ls) { + send_ls_not_ready(nodeid, header); + return; + } + + switch (header->rh_cmd) { + case GDLM_REMCMD_RECOVERMESSAGE: + rcom_process_message(ls, nodeid, rc); + break; + + case GDLM_REMCMD_RECOVERREPLY: + rcom_process_reply(ls, nodeid, rc); + break; + + default: + DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd);); + } + + put_lockspace(ls); +} + diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/reccomms.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,36 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECCOMMS_DOT_H__ +#define __RECCOMMS_DOT_H__ + +/* Bit flags */ + +#define RESDIR_VALID (1) +#define RESDIR_ALL_VALID (2) +#define NODES_VALID (4) +#define NODES_ALL_VALID (8) + +#define RECCOMM_STATUS (1) +#define RECCOMM_RECOVERNAMES (2) +#define RECCOMM_GETMASTER (3) +#define RECCOMM_BULKLOOKUP (4) +#define RECCOMM_NEWLOCKS (5) +#define RECCOMM_NEWLOCKIDS (6) +#define RECCOMM_REMRESDATA (7) + +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type, + struct dlm_rcom *rc, int need_reply); +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header); + +#endif diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recover.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,611 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "reccomms.h" +#include "dir.h" +#include "locking.h" +#include "rsb.h" +#include "lockspace.h" +#include "lkb.h" +#include "nodes.h" +#include "config.h" +#include "ast.h" +#include "memory.h" + +/* + * Called in recovery routines to check whether the recovery process has been + * interrupted/stopped by another transition. A recovery in-process will abort + * if the lockspace is "stopped" so that a new recovery process can start from + * the beginning when the lockspace is "started" again. + */ + +int dlm_recovery_stopped(struct dlm_ls *ls) +{ + return test_bit(LSFL_LS_STOP, &ls->ls_flags); +} + +static void dlm_wait_timer_fn(unsigned long data) +{ + struct dlm_ls *ls = (struct dlm_ls *) data; + + wake_up(&ls->ls_wait_general); +} + +/* + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP + * set due to failure of a node in ls_nodes). When another function thinks it + * could have completed the waited-on task, they should wake up ls_wait_general + * to get an immediate response rather than waiting for the timer to detect the + * result. A timer wakes us up periodically while waiting to see if we should + * abort due to a node failure. 
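+ * The timer is re-armed every dlm_config.recover_timer seconds; -1 is
+ * returned if the lockspace was stopped while waiting.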
+ */ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) +{ + struct timer_list timer; + int error = 0; + + init_timer(&timer); + timer.function = dlm_wait_timer_fn; + timer.data = (long) ls; + + for (;;) { + mod_timer(&timer, jiffies + (dlm_config.recover_timer * HZ)); + + wchan_cond_sleep_intr(ls->ls_wait_general, + !testfn(ls) && + !test_bit(LSFL_LS_STOP, &ls->ls_flags)); + + if (timer_pending(&timer)) + del_timer(&timer); + + if (testfn(ls)) + break; + + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) { + error = -1; + break; + } + } + + return error; +} + +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status) +{ + struct dlm_rcom rc_stack, *rc; + struct dlm_csb *csb; + int status; + int error = 0; + + memset(&rc_stack, 0, sizeof(struct dlm_rcom)); + rc = &rc_stack; + rc->rc_datalen = 0; + + list_for_each_entry(csb, &ls->ls_nodes, list) { + for (;;) { + error = dlm_recovery_stopped(ls); + if (error) + goto out; + + error = rcom_send_message(ls, csb->node->nodeid, + RECCOMM_STATUS, rc, 1); + if (error) + goto out; + + status = rc->rc_buf[0]; + if (status & wait_status) + break; + else { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ >> 1); + } + } + } + + out: + return error; +} + +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status) +{ + struct dlm_rcom rc_stack, *rc; + uint32_t nodeid = ls->ls_low_nodeid; + int status; + int error = 0; + + memset(&rc_stack, 0, sizeof(struct dlm_rcom)); + rc = &rc_stack; + rc->rc_datalen = 0; + + for (;;) { + error = dlm_recovery_stopped(ls); + if (error) + goto out; + + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1); + if (error) + break; + + status = rc->rc_buf[0]; + if (status & wait_status) + break; + else { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ >> 1); + } + } + + out: + return error; +} + +static int purge_queue(struct dlm_ls *ls, struct list_head *queue) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_rsb *rsb; + int count = 0; + + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { + if (!lkb->lkb_nodeid) + continue; + + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,); + + if (in_nodes_gone(ls, lkb->lkb_nodeid)) { + list_del(&lkb->lkb_statequeue); + + rsb = lkb->lkb_resource; + lkb->lkb_status = 0; + + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + && &lkb->lkb_duetime) + remove_from_deadlockqueue(lkb); + + release_lkb(ls, lkb); + release_rsb_locked(rsb); + count++; + } + } + + return count; +} + +/* + * Go through local restbl and for each rsb we're master of, clear out any + * lkb's held by departed nodes. + */ + +int restbl_lkb_purge(struct dlm_ls *ls) +{ + struct list_head *tmp2, *safe2; + int count = 0; + struct dlm_rsb *rootrsb, *safe, *rsb; + + log_all(ls, "purge locks of departed nodes"); + down_write(&ls->ls_root_lock); + + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) { + + if (rootrsb->res_nodeid) + continue; + + hold_rsb(rootrsb); + down_write(&rootrsb->res_lock); + + /* This traverses the subreslist in reverse order so we purge + * the children before their parents. 
*/ + + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev; + tmp2 != &rootrsb->res_subreslist; + tmp2 = safe2, safe2 = safe2->prev) { + rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist); + + hold_rsb(rsb); + purge_queue(ls, &rsb->res_grantqueue); + purge_queue(ls, &rsb->res_convertqueue); + purge_queue(ls, &rsb->res_waitqueue); + release_rsb_locked(rsb); + } + count += purge_queue(ls, &rootrsb->res_grantqueue); + count += purge_queue(ls, &rootrsb->res_convertqueue); + count += purge_queue(ls, &rootrsb->res_waitqueue); + + up_write(&rootrsb->res_lock); + release_rsb_locked(rootrsb); + } + + up_write(&ls->ls_root_lock); + log_all(ls, "purged %d locks", count); + + return 0; +} + +/* + * Grant any locks that have become grantable after a purge + */ + +int restbl_grant_after_purge(struct dlm_ls *ls) +{ + struct dlm_rsb *root, *rsb, *safe; + int error = 0; + + down_read(&ls->ls_root_lock); + + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) { + /* only the rsb master grants locks */ + if (root->res_nodeid) + continue; + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + log_debug(ls, "restbl_grant_after_purge aborted"); + error = -EINTR; + up_read(&ls->ls_root_lock); + goto out; + } + + down_write(&root->res_lock); + grant_pending_locks(root); + up_write(&root->res_lock); + + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){ + down_write(&rsb->res_lock); + grant_pending_locks(rsb); + up_write(&rsb->res_lock); + } + } + up_read(&ls->ls_root_lock); + wake_astd(); + out: + return error; +} + +/* + * Set the lock master for all LKBs in a lock queue + */ + +static void set_lock_master(struct list_head *queue, int nodeid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, queue, lkb_statequeue) { + /* Don't muck around with pre-exising sublocks */ + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY)) + lkb->lkb_nodeid = nodeid; + } +} + +static void set_master_lkbs(struct dlm_rsb *rsb) +{ + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid); + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid); + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid); +} + +/* + * This rsb struct is now the master so it is responsible for keeping the + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for + * this rsb in deserialise_lkb. + */ + +static void set_rsb_lvb(struct dlm_rsb *rsb) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) { + + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) && + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) && + (lkb->lkb_grmode > DLM_LOCK_NL)) + { + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); + + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + return; + } + } + + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) { + + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) && + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) && + (lkb->lkb_grmode > DLM_LOCK_NL)) + { + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); + + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + return; + } + } +} + +/* + * Propogate the new master nodeid to locks, subrsbs, sublocks. + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider. 
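+ * If we are the new master, res_nodeid becomes 0, the MASTER flag is set and
+ * the rsb lvb is refreshed from an lkb holding an up to date copy; otherwise
+ * res_nodeid is simply set to the new remote master.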
+ */ + +static void set_new_master(struct dlm_rsb *rsb, uint32_t nodeid) +{ + struct dlm_rsb *subrsb; + + down_write(&rsb->res_lock); + + if (nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = 0; + set_rsb_lvb(rsb); + } else + rsb->res_nodeid = nodeid; + + set_master_lkbs(rsb); + + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) { + subrsb->res_nodeid = rsb->res_nodeid; + set_master_lkbs(subrsb); + } + + up_write(&rsb->res_lock); + + set_bit(RESFL_NEW_MASTER, &rsb->res_flags); +} + +/* + * The recover_list contains all the rsb's for which we've requested the new + * master nodeid. As replies are returned from the resource directories the + * rsb's are removed from the list. When the list is empty we're done. + * + * The recover_list is later similarly used for all rsb's for which we've sent + * new lkb's and need to receive new corresponding lkid's. + */ + +int recover_list_empty(struct dlm_ls *ls) +{ + int empty; + + spin_lock(&ls->ls_recover_list_lock); + empty = list_empty(&ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); + + return empty; +} + +int recover_list_count(struct dlm_ls *ls) +{ + int count; + + spin_lock(&ls->ls_recover_list_lock); + count = ls->ls_recover_list_count; + spin_unlock(&ls->ls_recover_list_lock); + + return count; +} + +void recover_list_add(struct dlm_rsb *rsb) +{ + struct dlm_ls *ls = rsb->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) { + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list); + ls->ls_recover_list_count++; + hold_rsb(rsb); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +void recover_list_del(struct dlm_rsb *rsb) +{ + struct dlm_ls *ls = rsb->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags); + list_del(&rsb->res_recover_list); + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_list_lock); + + release_rsb(rsb); +} + +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid) +{ + struct dlm_rsb *rsb = NULL; + + spin_lock(&ls->ls_recover_list_lock); + + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) { + if (rsb->res_recover_msgid == msgid) + goto rec_found; + } + rsb = NULL; + + rec_found: + spin_unlock(&ls->ls_recover_list_lock); + return rsb; +} + +static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc) +{ + struct dlm_ls *ls = rsb->res_ls; + uint32_t dir_nodeid, r_nodeid; + int error; + + dir_nodeid = get_directory_nodeid(rsb); + + if (dir_nodeid == our_nodeid()) { + error = dlm_dir_lookup(ls, dir_nodeid, rsb->res_name, + rsb->res_length, &r_nodeid); + if (error == -EEXIST) { + log_all(ls, "rsb_master_lookup %u EEXIST %s", + r_nodeid, rsb->res_name); + } else if (error) + goto fail; + + set_new_master(rsb, r_nodeid); + } else { + /* As we are the only thread doing recovery this + should be safe. if not then we need to use a different + ID somehow. We must set it in the RSB before rcom_send_msg + completes cos we may get a reply quite quickly. 
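+		   The reply is matched back to this rsb by
+		   restbl_rsb_update_recv() via recover_list_find() on
+		   res_recover_msgid.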
+ */ + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1; + + recover_list_add(rsb); + + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length); + rc->rc_datalen = rsb->res_length; + + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER, + rc, 0); + if (error) + goto fail; + } + + fail: + return error; +} + +static int needs_update(struct dlm_ls *ls, struct dlm_rsb *r) +{ + if (!r->res_nodeid) + return FALSE; + + if (r->res_nodeid == -1) + return FALSE; + + if (in_nodes_gone(ls, r->res_nodeid)) + return TRUE; + + return FALSE; +} + +/* + * Go through local root resources and for each rsb which has a master which + * has departed, get the new master nodeid from the resdir. The resdir will + * assign mastery to the first node to look up the new master. That means + * we'll discover in this lookup if we're the new master of any rsb's. + * + * We fire off all the resdir requests individually and asynchronously to the + * correct resdir node. The replies are processed in rsb_master_recv(). + */ + +int restbl_rsb_update(struct dlm_ls *ls) +{ + struct dlm_rsb *rsb, *safe; + struct dlm_rcom *rc; + int error = -ENOMEM; + int count = 0; + + log_all(ls, "update remastered resources"); + + rc = allocate_rcom_buffer(ls); + if (!rc) + goto out; + + down_read(&ls->ls_root_lock); + + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) { + error = dlm_recovery_stopped(ls); + if (error) { + up_read(&ls->ls_root_lock); + goto out_free; + } + + if (needs_update(ls, rsb)) { + error = rsb_master_lookup(rsb, rc); + if (error) { + up_read(&ls->ls_root_lock); + goto out_free; + } + count++; + } + } + up_read(&ls->ls_root_lock); + + error = dlm_wait_function(ls, &recover_list_empty); + + log_all(ls, "updated %d resources", count); + out_free: + free_rcom_buffer(rc); + out: + return error; +} + +int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf, + int length, int msgid) +{ + struct dlm_rsb *rsb; + uint32_t be_nodeid; + + rsb = recover_list_find(ls, msgid); + if (!rsb) { + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid); + goto out; + } + + memcpy(&be_nodeid, buf, sizeof(uint32_t)); + set_new_master(rsb, be32_to_cpu(be_nodeid)); + recover_list_del(rsb); + + if (recover_list_empty(ls)) + wake_up(&ls->ls_wait_general); + + out: + return 0; +} + +/* + * This function not used any longer. + */ + +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen, + char *outbuf) +{ + char *inbufptr, *outbufptr; + + /* + * The other node wants nodeids matching the resource names in inbuf. + * The resource names are packed into inbuf as + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is + * lenX bytes. Matching nodeids are packed into outbuf in order + * [nodeid1][nodeid2]... + */ + + inbufptr = inbuf; + outbufptr = outbuf; + + while (inbufptr < inbuf + inlen) { + uint32_t r_nodeid, be_nodeid; + int status; + + status = dlm_dir_lookup(ls, nodeid, inbufptr + 1, *inbufptr, + &r_nodeid); + if (status != 0) + goto fail; + + inbufptr += *inbufptr + 1; + + be_nodeid = cpu_to_be32(r_nodeid); + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t)); + outbufptr += sizeof(uint32_t); + + /* add assertion that outbufptr - outbuf is not > than ... 
*/ + } + + return (outbufptr - outbuf); + fail: + return -1; +} diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recover.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVER_DOT_H__ +#define __RECOVER_DOT_H__ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls)); +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status); +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status); +int dlm_recovery_stopped(struct dlm_ls *ls); +int recover_list_empty(struct dlm_ls *ls); +int recover_list_count(struct dlm_ls *ls); +void recover_list_add(struct dlm_rsb *rsb); +void recover_list_del(struct dlm_rsb *rsb); +int restbl_lkb_purge(struct dlm_ls *ls); +void restbl_grant_after_purge(struct dlm_ls *ls); +int restbl_rsb_update(struct dlm_ls *ls); +int restbl_rsb_update_recv(struct dlm_ls *ls, int nodeid, char *buf, int len, + int msgid); +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen, + char *outbuf); + +#endif /* __RECOVER_DOT_H__ */ diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recoverd.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,713 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "nodes.h" +#include "dir.h" +#include "ast.h" +#include "recover.h" +#include "lockspace.h" +#include "lowcomms.h" +#include "lockqueue.h" +#include "lkb.h" +#include "rebuild.h" + +/* + * next_move actions + */ + +#define DO_STOP (1) +#define DO_START (2) +#define DO_FINISH (3) +#define DO_FINISH_STOP (4) +#define DO_FINISH_START (5) + +/* + * Queue of lockspaces (dlm_recover structs) which need to be + * started/recovered + */ + +static int enable_locking(struct dlm_ls *ls, int event_id) +{ + int error = 0; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_last_stop < event_id) { + set_bit(LSFL_LS_RUN, &ls->ls_flags); + up_write(&ls->ls_in_recovery); + } else { + error = -EINTR; + log_debug(ls, "enable_locking: abort %d", event_id); + } + spin_unlock(&ls->ls_recover_lock); + return error; +} + +static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv) +{ + int error; + + log_all(ls, "recover event %u (first)", rv->event_id); + + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id); + + error = ls_nodes_init(ls, rv); + if (error) { + log_error(ls, "nodes_init failed %d", error); + goto out; + } + + error = dlm_dir_rebuild_local(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_local failed %d", error); + goto out; + } + + error = dlm_dir_rebuild_wait(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_wait failed %d", error); + goto out; + } + + log_all(ls, "recover event %u done", rv->event_id); + kcl_start_done(ls->ls_local_id, rv->event_id); + + out: + return error; +} + +/* + * We are given here a new group of nodes which are in the lockspace. We first + * figure out the differences in ls membership from when we were last running. + * If nodes from before are gone, then there will be some lock recovery to do. + * If there are only nodes which have joined, then there's no lock recovery. + * + * note: cman requires an rc to finish starting on an revent (where nodes die) + * before it allows an sevent (where nodes join) to be processed. This means + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA + * joined. + */ + +static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv) +{ + int error, neg = 0; + + log_all(ls, "recover event %u", rv->event_id); + + /* + * this list may be left over from a previous aborted recovery + */ + + rebuild_freemem(ls); + + /* + * Add or remove nodes from the lockspace's ls_nodes list. + */ + + error = ls_nodes_reconfig(ls, rv, &neg); + if (error) { + log_error(ls, "nodes_reconfig failed %d", error); + goto fail; + } + + /* + * Rebuild our own share of the resdir by collecting from all other + * nodes rsb name/master pairs for which the name hashes to us. + */ + + error = dlm_dir_rebuild_local(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_local failed %d", error); + goto fail; + } + + /* + * Purge resdir-related requests that are being held in requestqueue. + * All resdir requests from before recovery started are invalid now due + * to the resdir rebuild and will be resent by the requesting nodes. + */ + + purge_requestqueue(ls); + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags); + + /* + * Wait for all nodes to complete resdir rebuild. 
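+	 * A failure here aborts this recovery attempt; the lockspace waits
+	 * for a later start event to retry the reconfiguration.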
+ */ + + error = dlm_dir_rebuild_wait(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_wait failed %d", error); + goto fail; + } + + /* + * Mark our own lkb's waiting in the lockqueue for remote replies from + * nodes that are now departed. These will be resent to the new + * masters in resend_cluster_requests. Also mark resdir lookup + * requests for resending. + */ + + lockqueue_lkb_mark(ls); + + error = dlm_recovery_stopped(ls); + if (error) + goto fail; + + if (neg) { + /* + * Clear lkb's for departed nodes. This can't fail since it + * doesn't involve communicating with other nodes. + */ + + restbl_lkb_purge(ls); + + /* + * Get new master id's for rsb's of departed nodes. This fails + * if we can't communicate with other nodes. + */ + + error = restbl_rsb_update(ls); + if (error) { + log_error(ls, "restbl_rsb_update failed %d", error); + goto fail; + } + + /* + * Send our lkb info to new masters. This fails if we can't + * communicate with a node. + */ + + error = rebuild_rsbs_send(ls); + if (error) { + log_error(ls, "rebuild_rsbs_send failed %d", error); + goto fail; + } + } + + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags); + + log_all(ls, "recover event %u done", rv->event_id); + kcl_start_done(ls->ls_local_id, rv->event_id); + return 0; + + fail: + log_all(ls, "recover event %d error %d", rv->event_id, error); + return error; +} + +static void clear_finished_nodes(struct dlm_ls *ls, int finish_event) +{ + struct dlm_csb *csb, *safe; + + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) { + if (csb->gone_event <= finish_event) { + list_del(&csb->list); + release_csb(csb); + } + } +} + +/* + * Between calls to this routine for a ls, there can be multiple stop/start + * events from cman where every start but the latest is cancelled by stops. + * There can only be a single finish from cman because every finish requires us + * to call start_done. A single finish event could be followed by multiple + * stop/start events. This routine takes any combination of events from cman + * and boils them down to one course of action. + */ + +static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out, + int *finish_out) +{ + LIST_HEAD(events); + unsigned int cmd = 0, stop, start, finish; + unsigned int last_stop, last_start, last_finish; + struct dlm_recover *rv = NULL, *start_rv = NULL; + + /* + * Grab the current state of cman/sm events. + */ + + spin_lock(&ls->ls_recover_lock); + + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0; + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0; + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0; + + last_stop = ls->ls_last_stop; + last_start = ls->ls_last_start; + last_finish = ls->ls_last_finish; + + while (!list_empty(&ls->ls_recover)) { + rv = list_entry(ls->ls_recover.next, struct dlm_recover, list); + list_del(&rv->list); + list_add_tail(&rv->list, &events); + } + + /* + * There are two cases where we need to adjust these event values: + * 1. - we get a first start + * - we get a stop + * - we process the start + stop here and notice this special case + * + * 2. - we get a first start + * - we process the start + * - we get a stop + * - we process the stop here and notice this special case + * + * In both cases, the first start we received was aborted by a + * stop before we received a finish. last_finish being zero is the + * indication that this is the "first" start, i.e. we've not yet + * finished a start; if we had, last_finish would be non-zero. 
+ * Part of the problem arises from the fact that when we initially + * get start/stop/start, SM uses the same event id for both starts + * (since the first was cancelled). + * + * In both cases, last_start and last_stop will be equal. + * In both cases, finish=0. + * In the first case start=1 && stop=1. + * In the second case start=0 && stop=1. + * + * In both cases, we need to make adjustments to values so: + * - we process the current event (now) as a normal stop + * - the next start we receive will be processed normally + * (taking into account the assertions below) + * + * In the first case, dlm_ls_start() will have printed the + * "repeated start" warning. + * + * In the first case we need to get rid of the recover event struct. + * + * - set stop=1, start=0, finish=0 for case 4 below + * - last_stop and last_start must be set equal per the case 4 assert + * - ls_last_stop = 0 so the next start will be larger + * - ls_last_start = 0 not really necessary (avoids dlm_ls_start print) + */ + + if (!last_finish && (last_start == last_stop)) { + log_all(ls, "move reset %u,%u,%u ids %u,%u,%u", stop, + start, finish, last_stop, last_start, last_finish); + stop = 1; + start = 0; + finish = 0; + last_stop = 0; + last_start = 0; + ls->ls_last_stop = 0; + ls->ls_last_start = 0; + + while (!list_empty(&events)) { + rv = list_entry(events.next, struct dlm_recover, list); + list_del(&rv->list); + kfree(rv->nodeids); + kfree(rv); + } + } + spin_unlock(&ls->ls_recover_lock); + + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish, + last_stop, last_start, last_finish); + + /* + * Toss start events which have since been cancelled. + */ + + while (!list_empty(&events)) { + DLM_ASSERT(start,); + rv = list_entry(events.next, struct dlm_recover, list); + list_del(&rv->list); + + if (rv->event_id <= last_stop) { + log_debug(ls, "move skip event %u", rv->event_id); + kfree(rv->nodeids); + kfree(rv); + rv = NULL; + } else { + log_debug(ls, "move use event %u", rv->event_id); + DLM_ASSERT(!start_rv,); + start_rv = rv; + } + } + + /* + * Eight possible combinations of events. 
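+	 * The case numbers below are the (stop, start, finish) flags read as
+	 * a binary number: case = stop*4 + start*2 + finish.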
+ */ + + /* 0 */ + if (!stop && !start && !finish) { + DLM_ASSERT(!start_rv,); + cmd = 0; + goto out; + } + + /* 1 */ + if (!stop && !start && finish) { + DLM_ASSERT(!start_rv,); + DLM_ASSERT(last_start > last_stop,); + DLM_ASSERT(last_finish == last_start,); + cmd = DO_FINISH; + *finish_out = last_finish; + goto out; + } + + /* 2 */ + if (!stop && start && !finish) { + DLM_ASSERT(start_rv,); + DLM_ASSERT(last_start > last_stop,); + cmd = DO_START; + *rv_out = start_rv; + goto out; + } + + /* 3 */ + if (!stop && start && finish) { + DLM_ASSERT(0, printk("finish and start with no stop\n");); + } + + /* 4 */ + if (stop && !start && !finish) { + DLM_ASSERT(!start_rv,); + DLM_ASSERT(last_start == last_stop,); + cmd = DO_STOP; + goto out; + } + + /* 5 */ + if (stop && !start && finish) { + DLM_ASSERT(!start_rv,); + DLM_ASSERT(last_finish == last_start,); + DLM_ASSERT(last_stop == last_start,); + cmd = DO_FINISH_STOP; + *finish_out = last_finish; + goto out; + } + + /* 6 */ + if (stop && start && !finish) { + if (start_rv) { + DLM_ASSERT(last_start > last_stop,); + cmd = DO_START; + *rv_out = start_rv; + } else { + DLM_ASSERT(last_stop == last_start,); + cmd = DO_STOP; + } + goto out; + } + + /* 7 */ + if (stop && start && finish) { + if (start_rv) { + DLM_ASSERT(last_start > last_stop,); + DLM_ASSERT(last_start > last_finish,); + cmd = DO_FINISH_START; + *finish_out = last_finish; + *rv_out = start_rv; + } else { + DLM_ASSERT(last_start == last_stop,); + DLM_ASSERT(last_start > last_finish,); + cmd = DO_FINISH_STOP; + *finish_out = last_finish; + } + goto out; + } + + out: + return cmd; +} + +/* + * This function decides what to do given every combination of current + * lockspace state and next lockspace state. + */ + +static void do_ls_recovery(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL; + int error, cur_state, next_state = 0, do_now, finish_event = 0; + + do_now = next_move(ls, &rv, &finish_event); + if (!do_now) + goto out; + + cur_state = ls->ls_state; + next_state = 0; + + DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags), + log_error(ls, "curstate=%d donow=%d", cur_state, do_now);); + + /* + * LSST_CLEAR - we're not in any recovery state. We can get a stop or + * a stop and start which equates with a START. + */ + + if (cur_state == LSST_CLEAR) { + switch (do_now) { + case DO_STOP: + next_state = LSST_WAIT_START; + break; + + case DO_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + case DO_FINISH: /* invalid */ + case DO_FINISH_STOP: /* invalid */ + case DO_FINISH_START: /* invalid */ + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_WAIT_START - we're not running because of getting a stop or + * failing a start. We wait in this state for another stop/start or + * just the next start to begin another reconfig attempt. + */ + + if (cur_state == LSST_WAIT_START) { + switch (do_now) { + case DO_STOP: + break; + + case DO_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + case DO_FINISH: /* invalid */ + case DO_FINISH_STOP: /* invalid */ + case DO_FINISH_START: /* invalid */ + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_RECONFIG_DONE - we entered this state after successfully + * completing ls_reconfig and calling kcl_start_done. We expect to get + * a finish if everything goes ok. A finish could be followed by stop + * or stop/start before we get here to check it. 
Or a finish may never + * happen, only stop or stop/start. + */ + + if (cur_state == LSST_RECONFIG_DONE) { + switch (do_now) { + case DO_FINISH: + rebuild_freemem(ls); + + clear_finished_nodes(ls, finish_event); + next_state = LSST_CLEAR; + + error = enable_locking(ls, finish_event); + if (error) + break; + + error = process_requestqueue(ls); + if (error) + break; + + error = resend_cluster_requests(ls); + if (error) + break; + + restbl_grant_after_purge(ls); + + log_all(ls, "recover event %u finished", finish_event); + break; + + case DO_STOP: + next_state = LSST_WAIT_START; + break; + + case DO_FINISH_STOP: + clear_finished_nodes(ls, finish_event); + next_state = LSST_WAIT_START; + break; + + case DO_FINISH_START: + clear_finished_nodes(ls, finish_event); + /* fall into DO_START */ + + case DO_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_INIT - state after ls is created and before it has been + * started. A start operation will cause the ls to be started for the + * first time. A failed start will cause to just wait in INIT for + * another stop/start. + */ + + if (cur_state == LSST_INIT) { + switch (do_now) { + case DO_START: + error = ls_first_start(ls, rv); + if (!error) + next_state = LSST_INIT_DONE; + break; + + case DO_STOP: + break; + + case DO_FINISH: /* invalid */ + case DO_FINISH_STOP: /* invalid */ + case DO_FINISH_START: /* invalid */ + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_INIT_DONE - after the first start operation is completed + * successfully and kcl_start_done() called. If there are no errors, a + * finish will arrive next and we'll move to LSST_CLEAR. + */ + + if (cur_state == LSST_INIT_DONE) { + switch (do_now) { + case DO_STOP: + case DO_FINISH_STOP: + next_state = LSST_WAIT_START; + break; + + case DO_START: + case DO_FINISH_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + case DO_FINISH: + next_state = LSST_CLEAR; + + enable_locking(ls, finish_event); + + process_requestqueue(ls); + + log_all(ls, "recover event %u finished", finish_event); + break; + + default: + DLM_ASSERT(0,); + } + goto out; + } + + out: + if (next_state) + ls->ls_state = next_state; + + if (rv) { + kfree(rv->nodeids); + kfree(rv); + } +} + +int dlm_recoverd(void *arg) +{ + struct dlm_ls *ls = arg; + + hold_lockspace(ls); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(LSFL_WORK, &ls->ls_flags)) + schedule(); + set_current_state(TASK_RUNNING); + + if (test_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags)) { + down(&ls->ls_recoverd_lock); + ls->ls_recoverd_task = NULL; + up(&ls->ls_recoverd_lock); + goto out; + } + + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) { + do_ls_recovery(ls); + + down(&ls->ls_recoverd_lock); + if (ls->ls_state == LSST_CLEAR && + !test_bit(LSFL_WORK, &ls->ls_flags)) { + ls->ls_recoverd_task = NULL; + up(&ls->ls_recoverd_lock); + goto out; + } + up(&ls->ls_recoverd_lock); + } + } + + out: + put_lockspace(ls); + return 0; +} + +void dlm_recoverd_kick(struct dlm_ls *ls) +{ + struct task_struct *p; + + down(&ls->ls_recoverd_lock); + set_bit(LSFL_WORK, &ls->ls_flags); + + if (!ls->ls_recoverd_task) { + p = kthread_run(dlm_recoverd, (void *) ls, 0, "dlm_recoverd"); + if (IS_ERR(p)) { + log_error(ls, "can't start dlm_recoverd %ld", + PTR_ERR(p)); + goto out; + } + ls->ls_recoverd_task = p; + } 
else + wake_up_process(ls->ls_recoverd_task); + out: + up(&ls->ls_recoverd_lock); +} + +void dlm_recoverd_stop(struct dlm_ls *ls) +{ + set_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags); + + for (;;) { + down(&ls->ls_recoverd_lock); + if (!ls->ls_recoverd_task) { + up(&ls->ls_recoverd_lock); + break; + } + wake_up_process(ls->ls_recoverd_task); + up(&ls->ls_recoverd_lock); + msleep(100); + } +} + diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recoverd.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,21 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVERD_DOT_H__ +#define __RECOVERD_DOT_H__ + +int dlm_recoverd(void *arg); +void dlm_recoverd_kick(struct dlm_ls *ls); +void dlm_recoverd_stop(struct dlm_ls *ls); + +#endif /* __RECOVERD_DOT_H__ */ diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rsb.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,329 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "locking.h" +#include "memory.h" +#include "lockqueue.h" +#include "nodes.h" +#include "dir.h" +#include "util.h" +#include "rsb.h" + +static struct dlm_rsb *search_hashchain(struct list_head *head, + struct dlm_rsb *parent, + char *name, int namelen) +{ + struct dlm_rsb *r; + + list_for_each_entry(r, head, res_hashchain) { + if ((parent == r->res_parent) && (namelen == r->res_length) && + (memcmp(name, r->res_name, namelen) == 0)) { + return r; + } + } + + return NULL; +} + +/* + * A way to arbitrarily hold onto an rsb which we already have a reference to + * to make sure it doesn't go away. Opposite of release_rsb(). + */ + +void hold_rsb(struct dlm_rsb *r) +{ + atomic_inc(&r->res_ref); +} + +/* + * release_rsb() - Decrement reference count on rsb struct. Free the rsb + * struct when there are zero references. Every lkb for the rsb adds a + * reference. When ref is zero there can be no more lkb's for the rsb, on the + * queue's or anywhere else. 
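+ *
+ * For example, a caller that obtains an rsb reference through find_rsb()
+ * or hold_rsb() is expected to drop it again with release_rsb() when done.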
+ */ + +static void _release_rsb(struct dlm_rsb *r, int locked) +{ + struct dlm_ls *ls = r->res_ls; + uint32_t nodeid; + int removed = FALSE; + + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock); + if (atomic_dec_and_test(&r->res_ref)) { + DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r);); + DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r);); + DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r);); + removed = TRUE; + list_del(&r->res_hashchain); + } + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock); + + if (!removed) + return; + + if (!locked) + down_write(&ls->ls_root_lock); + if (r->res_parent) + list_del(&r->res_subreslist); + else + list_del(&r->res_rootlist); + if (!locked) + up_write(&ls->ls_root_lock); + + if (r->res_parent || !test_bit(RESFL_MASTER, &r->res_flags)) + goto out; + + nodeid = get_directory_nodeid(r); + + if (nodeid != our_nodeid()) + remote_remove_direntry(ls, nodeid, r->res_name, r->res_length); + else + dlm_dir_remove(ls, nodeid, r->res_name, r->res_length); + out: + if (r->res_lvbptr) + free_lvb(r->res_lvbptr); + + free_rsb(r); +} + +void release_rsb(struct dlm_rsb *r) +{ + _release_rsb(r, 0); +} + +void release_rsb_locked(struct dlm_rsb *r) +{ + _release_rsb(r, 1); +} + +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_rsb *r = lkb->lkb_resource; + return r; +} + +/* + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist. + * If the rsb exists, its ref count is incremented by this function. If it + * doesn't exist, it's created with a ref count of one. + */ + +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, int len, + int flags, struct dlm_rsb **rp) +{ + uint32_t bucket; + struct dlm_rsb *r, *tmp; + int error = -ENOMEM; + + DLM_ASSERT(len <= DLM_RESNAME_MAXLEN,); + + bucket = dlm_hash(name, len); + bucket &= (ls->ls_rsbtbl_size - 1); + + read_lock(&ls->ls_rsbtbl[bucket].lock); + r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len); + if (r) { + if (r->res_nodeid != 0 && (flags & MASTER)) + r = NULL; + else + atomic_inc(&r->res_ref); + } + read_unlock(&ls->ls_rsbtbl[bucket].lock); + + if (r) + goto out_set; + + /* Always create sublocks */ + if (!(flags & CREATE) && !parent) { + *rp = NULL; + goto out; + } + + r = allocate_rsb(ls, len); + if (!r) + goto fail; + + INIT_LIST_HEAD(&r->res_subreslist); + INIT_LIST_HEAD(&r->res_grantqueue); + INIT_LIST_HEAD(&r->res_convertqueue); + INIT_LIST_HEAD(&r->res_waitqueue); + + memcpy(r->res_name, name, len); + r->res_length = len; + r->res_ls = ls; + init_rwsem(&r->res_lock); + atomic_set(&r->res_ref, 1); + r->res_bucket = bucket; + + if (parent) { + r->res_parent = parent; + r->res_depth = parent->res_depth + 1; + r->res_root = parent->res_root; + r->res_nodeid = parent->res_nodeid; + } else { + r->res_parent = NULL; + r->res_depth = 1; + r->res_root = r; + r->res_nodeid = -1; + } + + write_lock(&ls->ls_rsbtbl[bucket].lock); + tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len); + if (tmp) { + atomic_inc(&tmp->res_ref); + write_unlock(&ls->ls_rsbtbl[bucket].lock); + free_rsb(r); + r = tmp; + } else { + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); + write_unlock(&ls->ls_rsbtbl[bucket].lock); + + down_write(&ls->ls_root_lock); + if (parent) + list_add_tail(&r->res_subreslist, + &r->res_root->res_subreslist); + else + list_add(&r->res_rootlist, &ls->ls_rootres); + up_write(&ls->ls_root_lock); + } + + out_set: + *rp = r; + + out: + error = 0; + + fail: + return error; +} + +/* 
+ * Add a LKB to a resource's grant/convert/wait queue. in order + */ + +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode) +{ + struct dlm_lkb *lkb = NULL; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_rqmode < mode) + break; + } + + if (!lkb) { + /* No entries in the queue, we are alone */ + list_add_tail(new, head); + } else { + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); + } +} + +/* + * The rsb res_lock must be held in write when this function is called. + */ + +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + DLM_ASSERT(!lkb->lkb_status, + print_lkb(lkb); + print_rsb(r);); + + lkb->lkb_status = type; + + switch (type) { + case GDLM_LKSTS_WAITING: + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); + else + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); + break; + + case GDLM_LKSTS_GRANTED: + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, + lkb->lkb_grmode); + break; + + case GDLM_LKSTS_CONVERT: + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); + else + list_add_tail(&lkb->lkb_statequeue, + &r->res_convertqueue); + break; + + default: + DLM_ASSERT(0,); + } +} + +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + down_write(&r->res_lock); + lkb_enqueue(r, lkb, type); + up_write(&r->res_lock); +} + +/* + * The rsb res_lock must be held in write when this function is called. + */ + +int lkb_dequeue(struct dlm_lkb *lkb) +{ + int status = lkb->lkb_status; + + if (!status) + goto out; + + lkb->lkb_status = 0; + list_del(&lkb->lkb_statequeue); + + out: + return status; +} + +int res_lkb_dequeue(struct dlm_lkb *lkb) +{ + int status; + + down_write(&lkb->lkb_resource->res_lock); + status = lkb_dequeue(lkb); + up_write(&lkb->lkb_resource->res_lock); + + return status; +} + +/* + * The rsb res_lock must be held in write when this function is called. + */ + +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + int status; + + status = lkb_dequeue(lkb); + lkb_enqueue(r, lkb, type); + + return status; +} + +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + int status; + + down_write(&r->res_lock); + status = lkb_swqueue(r, lkb, type); + up_write(&r->res_lock); + + return status; +} diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rsb.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RSB_DOT_H__ +#define __RSB_DOT_H__ + +#define CREATE 1 +#define MASTER 2 + +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode); +void release_rsb(struct dlm_rsb *r); +void release_rsb_locked(struct dlm_rsb *r); +void hold_rsb(struct dlm_rsb *r); +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, + int namelen, int flags, struct dlm_rsb **rp); +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb); +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); +int lkb_dequeue(struct dlm_lkb *lkb); +int res_lkb_dequeue(struct dlm_lkb *lkb); +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); + +#endif /* __RSB_DOT_H__ */ diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/util.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,183 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" + +static const uint32_t crc_32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, + 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, + 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, + 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, + 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, + 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, + 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, + 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, + 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 
0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, + 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, + 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, + 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, + 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, + 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, + 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, + 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, + 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, + 0x5a05df1b, 0x2d02ef8d +}; + +/** + * dlm_hash - hash an array of data + * @data: the data to be hashed + * @len: the length of data to be hashed + * + * Copied from GFS. + * + * Take some data and convert it to a 32-bit hash. + * + * The hash function is a 32-bit CRC of the data. The algorithm uses + * the crc_32_tab table above. + * + * This may not be the fastest hash function, but it does a fair bit better + * at providing uniform results than the others I've looked at. That's + * really important for efficient directories. 
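+ *
+ * For example, find_rsb() uses the result to pick a resource table bucket:
+ *
+ *   bucket = dlm_hash(name, len) & (ls->ls_rsbtbl_size - 1);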
+ * + * Returns: the hash + */ + +uint32_t dlm_hash(const char *data, int len) +{ + uint32_t hash = 0xFFFFFFFF; + + for (; len--; data++) + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8); + + hash = ~hash; + + return hash; +} + +void print_lkb(struct dlm_lkb *lkb) +{ + printk("dlm: lkb\n" + "id %x\n" + "remid %x\n" + "flags %x\n" + "status %x\n" + "rqmode %d\n" + "grmode %d\n" + "nodeid %d\n" + "lqstate %x\n" + "lqflags %x\n", + lkb->lkb_id, + lkb->lkb_remid, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_rqmode, + lkb->lkb_grmode, + lkb->lkb_nodeid, + lkb->lkb_lockqueue_state, + lkb->lkb_lockqueue_flags); +} + +void print_rsb(struct dlm_rsb *r) +{ + printk("dlm: rsb\n" + "name \"%s\"\n" + "nodeid %d\n" + "flags %lx\n" + "ref %u\n", + r->res_name, + r->res_nodeid, + r->res_flags, + atomic_read(&r->res_ref)); +} + +void print_request(struct dlm_request *req) +{ + printk("dlm: request\n" + "rh_cmd %u\n" + "rh_lkid %x\n" + "remlkid %x\n" + "flags %x\n" + "status %u\n" + "rqmode %u\n", + req->rr_header.rh_cmd, + req->rr_header.rh_lkid, + req->rr_remlkid, + req->rr_flags, + req->rr_status, + req->rr_rqmode); +} + +void print_reply(struct dlm_reply *rp) +{ + printk("dlm: reply\n" + "rh_cmd %u\n" + "rh_lkid %x\n" + "lockstate %u\n" + "nodeid %u\n" + "status %u\n" + "lkid %x\n", + rp->rl_header.rh_cmd, + rp->rl_header.rh_lkid, + rp->rl_lockstate, + rp->rl_nodeid, + rp->rl_status, + rp->rl_lkid); +} + diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/util.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,24 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +uint32_t dlm_hash(const char *data, int len); + +void print_lkb(struct dlm_lkb *lkb); +void print_rsb(struct dlm_rsb *r); +void print_request(struct dlm_request *req); +void print_reply(struct dlm_reply *rp); + +#endif diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/dlm.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,416 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_DOT_H__ +#define __DLM_DOT_H__ + +/* + * Interface to DLM - routines and structures to use DLM lockspaces. + */ + +/* + * Lock Modes + */ + +#define DLM_LOCK_IV (-1) /* invalid */ +#define DLM_LOCK_NL (0) /* null */ +#define DLM_LOCK_CR (1) /* concurrent read */ +#define DLM_LOCK_CW (2) /* concurrent write */ +#define DLM_LOCK_PR (3) /* protected read */ +#define DLM_LOCK_PW (4) /* protected write */ +#define DLM_LOCK_EX (5) /* exclusive */ + +/* + * Maximum size in bytes of a dlm_lock name + */ + +#define DLM_RESNAME_MAXLEN (64) + +/* + * Size in bytes of Lock Value Block + */ + +#define DLM_LVB_LEN (32) + +/* + * Flags to dlm_new_lockspace + * + * DLM_LSF_NOTIMERS + * + * Do not subject locks in this lockspace to time-outs. + */ + +#define DLM_LSF_NOTIMERS (1) + +/* + * Flags to dlm_lock + * + * DLM_LKF_NOQUEUE + * + * Do not queue the lock request on the wait queue if it cannot be granted + * immediately. If the lock cannot be granted because of this flag, DLM will + * either return -EAGAIN from the dlm_lock call or will return 0 from + * dlm_lock and -EAGAIN in the lock status block when the AST is executed. + * + * DLM_LKF_CONVERT + * + * Indicates a lock conversion request. For conversions the name and namelen + * are ignored and the lock ID in the LKSB is used to identify the lock. + * + * DLM_LKF_VALBLK + * + * Requests DLM to return the current contents of the lock value block in the + * lock status block. When this flag is set in a lock conversion from PW or EX + * modes, DLM assigns the value specified in the lock status block to the lock + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array + * containing application-specific information. + * + * DLM_LKF_QUECVT + * + * Force a conversion request to be queued, even if it is compatible with + * the granted modes of other locks on the same resource. + * + * DLM_LKF_CANCEL + * + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to + * previously granted mode. + * + * DLM_LKF_IVVALBLK + * + * Invalidate/clear the lock value block. + * + * DLM_LKF_CONVDEADLK + * + * The granted mode of a lock being converted (from a non-NL mode) can be + * changed to NL in the process of acquiring the requested mode to avoid + * conversion deadlock. + * + * DLM_LKF_PERSISTENT + * + * Only relevant to locks originating in userspace. Signals to the ioctl.c code + * that this lock should not be unlocked when the process exits. + * + * DLM_LKF_NODLKWT + * + * This lock is not to be checked for conversion deadlocks. + * + * DLM_LKF_NODLCKBLK + * + * not yet implemented + * + * DLM_LKF_EXPEDITE + * + * Used only with new requests for NL mode locks. Tells the lock manager + * to grant the lock, ignoring other locks in convert and wait queues. + * + * DLM_LKF_NOQUEUEBAST + * + * Send blocking AST's before returning -EAGAIN to the caller. It is only + * used along with the NOQUEUE flag. Blocking AST's are not sent for failed + * NOQUEUE requests otherwise. + * + * DLM_LKF_HEADQUE + * + * Add a lock to the head of the convert or wait queue rather than the tail. + * + * DLM_LKF_NOORDER + * + * Disregard the standard grant order rules and grant a lock as soon as it + * is compatible with other granted locks. 
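+ *
+ * Flags may be combined; for instance, a conversion of an existing lock that
+ * also needs the lock value block and must not wait could pass
+ * DLM_LKF_CONVERT | DLM_LKF_VALBLK | DLM_LKF_NOQUEUE to dlm_lock().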
+ */ + +#define DLM_LKF_NOQUEUE (0x00000001) +#define DLM_LKF_CANCEL (0x00000002) +#define DLM_LKF_CONVERT (0x00000004) +#define DLM_LKF_VALBLK (0x00000008) +#define DLM_LKF_QUECVT (0x00000010) +#define DLM_LKF_IVVALBLK (0x00000020) +#define DLM_LKF_CONVDEADLK (0x00000040) +#define DLM_LKF_PERSISTENT (0x00000080) +#define DLM_LKF_NODLCKWT (0x00000100) +#define DLM_LKF_NODLCKBLK (0x00000200) +#define DLM_LKF_EXPEDITE (0x00000400) +#define DLM_LKF_NOQUEUEBAST (0x00000800) +#define DLM_LKF_HEADQUE (0x00001000) +#define DLM_LKF_NOORDER (0x00002000) +#define DLM_LKF_ORPHAN (0x00004000) + +/* + * Some return codes that are not in errno.h + */ + +#define DLM_ECANCEL (0x10001) +#define DLM_EUNLOCK (0x10002) + +typedef void dlm_lockspace_t; + +/* + * Lock range structure + */ + +struct dlm_range { + uint64_t ra_start; + uint64_t ra_end; +}; + +/* + * Lock status block + * + * Use this structure to specify the contents of the lock value block. For a + * conversion request, this structure is used to specify the lock ID of the + * lock. DLM writes the status of the lock request and the lock ID assigned + * to the request in the lock status block. + * + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests. + * It is available when dlm_lock returns. + * + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules + * shown for the DLM_LKF_VALBLK flag. + * + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock, + * it was first demoted to NL to avoid conversion deadlock. + * + * sb_status: the returned status of the lock request set prior to AST + * execution. Possible return values: + * + * 0 if lock request was successful + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE + * -ENOMEM if there is no memory to process request + * -EINVAL if there are invalid parameters + * -DLM_EUNLOCK if unlock request was successful + * -DLM_ECANCEL ? + */ + +#define DLM_SBF_DEMOTED (0x01) + +struct dlm_lksb { + int sb_status; + uint32_t sb_lkid; + char sb_flags; + char * sb_lvbptr; +}; + +/* + * These defines are the bits that make up the query code. + */ + +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */ +#define DLM_LOCK_THIS 0x0007 +#define DLM_QUERY_MODE_MASK 0x0007 + +/* Bits 3, 4, 5 bitmap of queue(s) to query */ +#define DLM_QUERY_QUEUE_WAIT 0x0008 +#define DLM_QUERY_QUEUE_CONVERT 0x0010 +#define DLM_QUERY_QUEUE_GRANT 0x0020 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */ +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */ + +/* Bit 6, Return only the information that can be established without a network + * round-trip. The caller must be aware of the implications of this. Useful for + * just getting the master node id or resource name. 
*/ +#define DLM_QUERY_LOCAL 0x0040 + +/* Bits 8 up, query type */ +#define DLM_QUERY_LOCKS_HIGHER 0x0100 +#define DLM_QUERY_LOCKS_LOWER 0x0200 +#define DLM_QUERY_LOCKS_EQUAL 0x0300 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500 +#define DLM_QUERY_LOCKS_ALL 0x0600 +#define DLM_QUERY_LOCKS_ORPHAN 0x0700 +#define DLM_QUERY_MASK 0x0F00 + +/* GRMODE is the default for mode comparisons, + RQMODE might also be handy */ +#define DLM_QUERY_GRMODE 0x0000 +#define DLM_QUERY_RQMODE 0x1000 + +/* Structures passed into and out of the query */ + +struct dlm_lockinfo { + int lki_lkid; /* Lock ID on originating node */ + int lki_mstlkid; /* Lock ID on master node */ + int lki_parent; + int lki_node; /* Originating node (not master) */ + int lki_ownpid; /* Owner pid on originating node */ + uint8_t lki_state; /* Queue the lock is on */ + uint8_t lki_grmode; /* Granted mode */ + uint8_t lki_rqmode; /* Requested mode */ + struct dlm_range lki_grrange; /* Granted range, if applicable */ + struct dlm_range lki_rqrange; /* Requested range, if applicable */ +}; + +struct dlm_resinfo { + int rsi_length; + int rsi_grantcount; /* No. of nodes on grant queue */ + int rsi_convcount; /* No. of nodes on convert queue */ + int rsi_waitcount; /* No. of nodes on wait queue */ + int rsi_masternode; /* Master for this resource */ + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */ + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable + */ +}; + +struct dlm_queryinfo { + struct dlm_resinfo *gqi_resinfo; + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array + * of structs */ + int gqi_locksize; /* input */ + int gqi_lockcount; /* output */ +}; + +#ifdef __KERNEL__ +/* + * dlm_init + * + * Starts and initializes DLM threads and structures. Creation of the first + * lockspace will call this if it has not been called already. + * + * Returns: 0 if successful, -EXXX on error + */ + +int dlm_init(void); + +/* + * dlm_release + * + * Stops DLM threads. + * + * Returns: 0 if successful, -EXXX on error + */ + +int dlm_release(void); + +/* + * dlm_new_lockspace + * + * Starts a lockspace with the given name. If the named lockspace exists in + * the cluster, the calling node joins it. + */ + +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace, + int flags); + +/* + * dlm_release_lockspace + * + * Stop a lockspace. + */ + +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); + +/* + * dlm_lock + * + * Make an asyncronous request to acquire or convert a lock on a named + * resource. 
+ * + * lockspace: context for the request + * mode: the requested mode of the lock (DLM_LOCK_) + * lksb: lock status block for input and async return values + * flags: input flags (DLM_LKF_) + * name: name of the resource to lock, can be binary + * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN) + * parent: the lock ID of a parent lock or 0 if none + * lockast: function DLM executes when it completes processing the request + * astarg: argument passed to lockast and bast functions + * bast: function DLM executes when this lock later blocks another request + * + * Returns: + * 0 if request is successfully queued for processing + * -EINVAL if any input parameters are invalid + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE + * -ENOMEM if there is no memory to process request + * -ENOTCONN if there is a communication error + * + * If the call to dlm_lock returns an error then the operation has failed and + * the AST routine will not be called. If dlm_lock returns 0 it is still + * possible that the lock operation will fail. The AST routine will be called + * when the locking is complete and the status is returned in the lksb. + * + * If the AST routines or parameter are passed to a conversion operation then + * they will overwrite those values that were passed to a previous dlm_lock + * call. + * + * AST routines should not block (at least not for long), but may make + * any locking calls they please. + */ + +int dlm_lock(dlm_lockspace_t *lockspace, + uint32_t mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent, + void (*lockast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode), + struct dlm_range *range); + +/* + * dlm_unlock + * + * Asynchronously release a lock on a resource. The AST routine is called + * when the resource is successfully unlocked. + * + * lockspace: context for the request + * lkid: the lock ID as returned in the lksb + * flags: input flags (DLM_LKF_) + * lksb: if NULL the lksb parameter passed to last lock request is used + * astarg: the arg used with the completion ast for the unlock + * + * Returns: + * 0 if request is successfully queued for processing + * -EINVAL if any input parameters are invalid + * -ENOTEMPTY if the lock still has sublocks + * -EBUSY if the lock is waiting for a remote lock operation + * -ENOTCONN if there is a communication error + */ + +extern int dlm_unlock(dlm_lockspace_t *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg); + +/* Query interface + * + * Query the other holders of a resource, given a known lock ID + * + * lockspace: context for the request + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock + * on the resource. sb_status will contain the status + * of the request on completion. + * query: query bitmap see DLM_QUERY_* above + * qinfo: pointer to dlm_queryinfo structure + * ast_routine: AST routine to call on completion + * artarg: argument to AST routine. It is "traditional" + * to put the qinfo pointer into lksb->sb_lvbptr + * and pass the lksb in here. 
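+ *
+ * A query code combines a lock mode (or DLM_LOCK_THIS), a queue bitmap and a
+ * query type from the DLM_QUERY_* values above, e.g.
+ *
+ *   DLM_LOCK_THIS | DLM_QUERY_QUEUE_GRANTED | DLM_QUERY_LOCKS_BLOCKING
+ *
+ * would ask for the granted or converting locks that block the lock given
+ * in sb_lkid.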
+ */ +extern int dlm_query(dlm_lockspace_t *lockspace, + struct dlm_lksb *lksb, + int query, + struct dlm_queryinfo *qinfo, + void (ast_routine(void *)), + void *astarg); + + +void dlm_debug_dump(void); +void dlm_locks_dump(void); + +#endif /* __KERNEL__ */ + +#endif /* __DLM_DOT_H__ */ diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/dlm_device.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,64 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* This is the device interface for dlm, most users will use a library + * interface. + */ + +/* Version of the device interface */ +#define DLM_DEVICE_VERSION_MAJOR 2 +#define DLM_DEVICE_VERSION_MINOR 0 +#define DLM_DEVICE_VERSION_PATCH 0 + +/* struct passed to the lock write */ +struct dlm_lock_params { + uint32_t version[3]; + uint8_t cmd; + uint8_t mode; + uint16_t flags; + uint32_t lkid; + uint32_t parent; + struct dlm_range range; + uint8_t namelen; + void *castparam; + void *castaddr; + void *bastparam; + void *bastaddr; + struct dlm_lksb *lksb; + char name[1]; +}; + + +/* struct read from the "device" fd, + consists mainly of userspace pointers for the library to use */ +struct dlm_lock_result { + uint8_t cmd; + void *astparam; + void (*astaddr)(void *astparam); + struct dlm_lksb *user_lksb; + struct dlm_lksb lksb; /* But this has real data in it */ + uint8_t bast_mode; /* Not yet used */ +}; + +/* commands passed to the device */ +#define DLM_USER_LOCK 1 +#define DLM_USER_UNLOCK 2 +#define DLM_USER_QUERY 3 + +/* Arbitrary length restriction */ +#define MAX_LS_NAME_LEN 64 + +/* ioctls on the device */ +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *) +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *) +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
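
For reference, here is a minimal sketch of how a kernel caller might drive the in-kernel interface declared in include/cluster/dlm.h above. It is not part of the patch: the lockspace name "example", the resource name "res1", the helper names example_ast() and example_lock(), and the use of a completion to wait for the AST are illustrative assumptions only.

/* Illustrative sketch only -- not part of the patch. */
#include <linux/string.h>
#include <linux/completion.h>
#include <cluster/dlm.h>

static struct dlm_lksb lksb;
static DECLARE_COMPLETION(ast_done);

/* Completion AST: called by the DLM when a request finishes; the final
   status of the operation is found in lksb.sb_status. */
static void example_ast(void *astarg)
{
	complete(&ast_done);
}

static int example_lock(void)
{
	dlm_lockspace_t *ls;
	int error;

	/* Join (or create) the lockspace named "example". */
	error = dlm_new_lockspace("example", strlen("example"), &ls, 0);
	if (error)
		return error;

	/* Request an EX lock on resource "res1": no parent lock, no
	   blocking AST and no range in this sketch. */
	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, DLM_LKF_NOQUEUE,
			 "res1", strlen("res1"), 0,
			 example_ast, &lksb, NULL, NULL);
	if (error)
		goto out;

	wait_for_completion(&ast_done);
	if (lksb.sb_status) {	/* e.g. -EAGAIN because of DLM_LKF_NOQUEUE */
		error = lksb.sb_status;
		goto out;
	}

	/* ... the resource is now locked exclusively ... */

	/* Release the lock; the same completion AST reports the result
	   (-DLM_EUNLOCK is expected in sb_status on success). */
	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);
	if (!error)
		wait_for_completion(&ast_done);
 out:
	dlm_release_lockspace(ls, 0);
	return error;
}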