4bf12011 1# Add DLM to the build system
2diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
3--- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
4+++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
5@@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
8
9+config CLUSTER_DLM
10+ tristate "Distributed Lock Manager"
11+ depends on CLUSTER
12+ ---help---
13+ A fully distributed lock manager, providing cluster-wide locking services
14+ and protected lock namespaces for kernel and userland applications.
15+
16+config CLUSTER_DLM_PROCLOCKS
17+ boolean "/proc/locks support for DLM"
18+ depends on CLUSTER_DLM
19+ depends on PROC_FS
20+ ---help---
21+	  If this option is enabled, a file will appear at /proc/cluster/dlm_locks.
22+	  Write into this "file" the name of a lockspace known to the DLM and then
23+	  read out a list of all the resources and locks in that lockspace that are
24+	  known to the local node. Note that because the DLM is distributed, this
25+	  may not be the full lock picture.
26+
27 endmenu
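
The /proc/cluster/dlm_locks interface described in the help text above is driven by writing a lockspace name and then reading the dump back. The following is only a rough userspace sketch of that workflow: the lockspace name "mylockspace" is an example, and the exact read/write semantics belong to proc.c later in this patch, so treat it as an illustration rather than a tested client.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/proc/cluster/dlm_locks", O_RDWR);

	if (fd < 0) {
		perror("open /proc/cluster/dlm_locks");
		return 1;
	}

	/* select the lockspace to dump; the name is only an example */
	if (write(fd, "mylockspace", strlen("mylockspace")) < 0) {
		perror("write lockspace name");
		return 1;
	}

	/* read back this node's view of that lockspace */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(fd);
	return 0;
}
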
28diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
29--- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
30+++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
31@@ -1,3 +1,4 @@
32 obj-y := nocluster.o
33
34 obj-$(CONFIG_CLUSTER) += cman/
35+obj-$(CONFIG_CLUSTER_DLM) += dlm/
36diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37--- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38+++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
39@@ -0,0 +1,23 @@
40+dlm-objs := ast.o \
41+ config.o \
42+ device.o \
43+ dir.o \
44+ lkb.o \
45+ locking.o \
46+ lockqueue.o \
47+ lockspace.o \
48+ lowcomms.o \
49+ main.o \
50+ memory.o \
51+ midcomms.o \
52+ nodes.o \
53+ proc.o \
54+ queries.o \
55+ rebuild.o \
56+ reccomms.o \
57+ recover.o \
58+ recoverd.o \
59+ rsb.o \
60+ util.o \
61+
62+obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64--- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65+++ linux-patched/cluster/dlm/ast.c 2004-06-29 20:01:19.000000000 +0800
66@@ -0,0 +1,560 @@
4bf12011 67+/******************************************************************************
68+*******************************************************************************
69+**
70+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
72+**
73+** This copyrighted material is made available to anyone wishing to use,
74+** modify, copy, or redistribute it subject to the terms and conditions
75+** of the GNU General Public License v.2.
76+**
77+*******************************************************************************
78+******************************************************************************/
79+
80+/*
81+ * This delivers ASTs and checks for dead remote requests and deadlocks.
82+ */
83+
84+#include <linux/timer.h>
85+
86+#include "dlm_internal.h"
87+#include "rsb.h"
88+#include "lockqueue.h"
89+#include "dir.h"
90+#include "locking.h"
91+#include "lkb.h"
92+#include "lowcomms.h"
93+#include "midcomms.h"
94+#include "ast.h"
95+#include "nodes.h"
96+#include "config.h"
97+
98+/* Wake up flags for astd */
99+#define GDLMD_WAKE_ASTS 1
100+#define GDLMD_WAKE_TIMER 2
101+
102+static struct list_head _deadlockqueue;
103+static struct semaphore _deadlockqueue_lock;
104+static struct list_head _lockqueue;
105+static struct semaphore _lockqueue_lock;
106+static struct timer_list _lockqueue_timer;
107+static struct list_head _ast_queue;
108+static struct semaphore _ast_queue_lock;
109+static wait_queue_head_t _astd_waitchan;
110+static atomic_t _astd_running;
111+static long _astd_pid;
112+static unsigned long _astd_wakeflags;
113+static struct completion _astd_done;
114+
115+void add_to_lockqueue(gd_lkb_t *lkb)
116+{
117+ /* Time stamp the entry so we know if it's been waiting too long */
118+ lkb->lkb_lockqueue_time = jiffies;
119+
120+ down(&_lockqueue_lock);
121+ list_add(&lkb->lkb_lockqueue, &_lockqueue);
122+ up(&_lockqueue_lock);
123+}
124+
125+void remove_from_lockqueue(gd_lkb_t *lkb)
126+{
127+ down(&_lockqueue_lock);
128+ list_del(&lkb->lkb_lockqueue);
129+ up(&_lockqueue_lock);
130+}
131+
132+void add_to_deadlockqueue(gd_lkb_t *lkb)
133+{
134+ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
135+ return;
136+ lkb->lkb_duetime = jiffies;
137+ down(&_deadlockqueue_lock);
138+ list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
139+ up(&_deadlockqueue_lock);
140+}
141+
142+void remove_from_deadlockqueue(gd_lkb_t *lkb)
143+{
144+ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
145+ return;
146+
147+ down(&_deadlockqueue_lock);
148+ list_del(&lkb->lkb_deadlockq);
149+ up(&_deadlockqueue_lock);
150+
151+ /* Invalidate the due time */
152+ memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
153+}
154+
4bf12011 155+/*
5cdbd17b 156+ * deliver an AST to a user
4bf12011 157+ */
158+
5cdbd17b 159+static void deliver_ast(gd_lkb_t *lkb, uint16_t ast_type)
4bf12011 160+{
161+ void (*cast) (long param) = lkb->lkb_astaddr;
162+ void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
163+
164+ if (ast_type == AST_BAST) {
165+ if (!bast)
166+ return;
167+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
168+ return;
4bf12011 169+ bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
170+ } else {
171+ if (!cast)
172+ return;
173+ cast(lkb->lkb_astparam);
4bf12011 174+ }
4bf12011 175+}
176+
177+/*
178+ * Queue an AST for delivery. This only deals with
179+ * kernel ASTs; the usermode API will piggyback on top of this.
180+ *
181+ * This can be called in either the user or DLM context.
182+ * ASTs are queued EVEN IF we are already running in gdlm_astd
183+ * context, as we don't know what other locks are held (e.g. we
184+ * could be called from a lock operation that was itself called
185+ * from another AST).
186+ * If the AST is to be queued remotely then a message is sent to
187+ * the target system via midcomms.
188+ */
189+
5cdbd17b 190+void queue_ast(gd_lkb_t *lkb, uint16_t flags, uint8_t rqmode)
4bf12011 191+{
192+ struct gd_remlockrequest req;
193+
194+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
195+ /*
196+ * Send a message to have an ast queued remotely. Note: we do
197+ * not send remote completion asts, they are handled as part of
198+ * remote lock granting.
199+ */
5cdbd17b 200+ if (flags & AST_BAST) {
4bf12011 201+ req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
202+ req.rr_header.rh_length = sizeof(req);
203+ req.rr_header.rh_flags = 0;
204+ req.rr_header.rh_lkid = lkb->lkb_id;
205+ req.rr_header.rh_lockspace =
206+ lkb->lkb_resource->res_ls->ls_global_id;
207+ req.rr_status = lkb->lkb_retstatus;
208+ req.rr_remlkid = lkb->lkb_remid;
209+ req.rr_rqmode = rqmode;
210+
211+ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
5cdbd17b 212+ lkb->lkb_resource->res_ls->ls_allocation);
4bf12011 213+ } else if (lkb->lkb_retstatus == -EDEADLOCK) {
214+ /*
215+ * We only queue remote Completion ASTs here for error
216+ * completions that happen out of band.
217+ * DEADLOCK is one such.
218+ */
4bf12011 219+ req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
220+ req.rr_header.rh_length = sizeof(req);
221+ req.rr_header.rh_flags = 0;
222+ req.rr_header.rh_lkid = lkb->lkb_id;
223+ req.rr_header.rh_lockspace =
224+ lkb->lkb_resource->res_ls->ls_global_id;
225+ req.rr_status = lkb->lkb_retstatus;
226+ req.rr_remlkid = lkb->lkb_remid;
227+ req.rr_rqmode = rqmode;
228+
229+ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
5cdbd17b 230+ lkb->lkb_resource->res_ls->ls_allocation);
4bf12011 231+ }
232+ } else {
233+ /*
5cdbd17b 234+ * Prepare info that will be returned in ast/bast.
4bf12011 235+ */
236+
5cdbd17b 237+ if (flags & AST_BAST) {
4bf12011 238+ lkb->lkb_bastmode = rqmode;
239+ } else {
240+ lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
241+
242+ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
243+ lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
244+ else
245+ lkb->lkb_lksb->sb_flags = 0;
246+ }
247+
4bf12011 248+ down(&_ast_queue_lock);
249+ if (lkb->lkb_astflags & AST_DEL)
250+ log_print("queue_ast on deleted lkb %x ast %x pid %u",
251+ lkb->lkb_id, lkb->lkb_astflags, current->pid);
252+ if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
4bf12011 253+ list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
5cdbd17b 254+ lkb->lkb_astflags |= flags;
4bf12011 255+ up(&_ast_queue_lock);
256+
257+ /* It is the responsibility of the caller to call wake_astd()
258+	 * after it has finished any other locking operations, so that
259+	 * the ASTs queued here are delivered afterwards */
260+ }
261+}
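
The comment above leaves AST delivery to the caller. A minimal hypothetical sketch of that contract is shown here; example_grant_and_notify is not part of the patch and only illustrates the queue-then-wake pattern.

static void example_grant_and_notify(gd_lkb_t *lkb)
{
	lkb->lkb_retstatus = 0;
	queue_ast(lkb, AST_COMP, 0);	/* only queues; nothing is delivered yet */

	/* ... any further locking work for this operation goes here ... */

	wake_astd();			/* now let dlm_astd drain the AST queue */
}
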
262+
263+/*
5cdbd17b 264+ * Process any LKBs on the AST queue.
4bf12011 265+ */
266+
267+static void process_asts(void)
268+{
269+ gd_lkb_t *lkb;
270+ uint16_t flags;
4bf12011 271+
272+ for (;;) {
273+ down(&_ast_queue_lock);
274+ if (list_empty(&_ast_queue)) {
275+ up(&_ast_queue_lock);
276+ break;
277+ }
278+
279+ lkb = list_entry(_ast_queue.next, gd_lkb_t, lkb_astqueue);
280+ list_del(&lkb->lkb_astqueue);
281+ flags = lkb->lkb_astflags;
282+ lkb->lkb_astflags = 0;
283+ up(&_ast_queue_lock);
4bf12011 284+
285+ if (flags & AST_COMP)
286+ deliver_ast(lkb, AST_COMP);
4bf12011 287+
288+ if (flags & AST_BAST) {
289+ if (flags & AST_DEL)
290+ log_print("skip bast on %x", lkb->lkb_id);
291+ else
292+ deliver_ast(lkb, AST_BAST);
293+ }
4bf12011 294+
295+ if (flags & AST_DEL) {
296+ gd_res_t *rsb = lkb->lkb_resource;
297+ gd_ls_t *ls = rsb->res_ls;
4bf12011 298+
299+ GDLM_ASSERT(lkb->lkb_astflags == 0,
300+ printk("%x %x\n", lkb->lkb_id, lkb->lkb_astflags););
4bf12011 301+
302+ down_read(&ls->ls_in_recovery);
303+ release_lkb(ls, lkb);
304+ release_rsb(rsb);
305+ up_read(&ls->ls_in_recovery);
306+ }
307+
308+ schedule();
4bf12011 309+ }
4bf12011 310+}
311+
312+void lockqueue_lkb_mark(gd_ls_t *ls)
313+{
314+ gd_lkb_t *lkb, *safe;
315+ int count = 0;
316+
317+ log_all(ls, "mark waiting requests");
318+
319+ down(&_lockqueue_lock);
320+
321+ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
322+
323+ if (lkb->lkb_resource->res_ls != ls)
324+ continue;
325+
326+ /*
327+ * These lkb's are new and the master is being looked up. Mark
328+ * the lkb request to be resent. Even if the destination node
329+ * for the request is still living and has our request, it will
330+ * purge all resdir requests in purge_requestqueue. If there's
331+ * a reply to the LOOKUP request in our requestqueue (the reply
332+ * arrived after ls_stop), it is invalid and will be discarded
333+ * in purge_requestqueue, too.
334+ */
335+
336+ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
337+ GDLM_ASSERT(lkb->lkb_nodeid == -1,
338+ log_error(ls, "nodeid=%d\n",
339+ lkb->lkb_nodeid););
340+
341+ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
342+ count++;
343+ continue;
344+ }
345+
346+ /*
347+ * These lkb's have an outstanding request to a bygone node.
348+ * The request will be redirected to the new master node in
349+ * resend_cluster_requests(). Don't mark the request for
350+ * resending if there's a reply for it saved in the
351+ * requestqueue.
352+ */
353+
354+ if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
355+ !reply_in_requestqueue(ls, lkb->lkb_id)) {
356+
357+ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
358+
359+ /*
360+ * Don't rebuild this lkb on a new rsb in
361+ * rebuild_rsbs_send().
362+ */
363+
364+ if (lkb->lkb_lockqueue_state ==
365+ GDLM_LQSTATE_WAIT_CONDGRANT) {
366+ GDLM_ASSERT(lkb->lkb_status ==
367+ GDLM_LKSTS_WAITING, );
368+ lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
369+ }
370+
371+ /*
372+ * This flag indicates to the new master that his lkb
373+ * is in the midst of a convert request and should be
374+ * placed on the granted queue rather than the convert
375+ * queue. We will resend this convert request to the
376+ * new master.
377+ */
378+
379+ else if (lkb->lkb_lockqueue_state ==
380+ GDLM_LQSTATE_WAIT_CONVERT) {
381+ GDLM_ASSERT(lkb->lkb_status ==
382+ GDLM_LKSTS_CONVERT, );
383+ lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
384+ }
385+
386+ count++;
387+ }
388+ }
389+ up(&_lockqueue_lock);
390+
391+ log_all(ls, "marked %d requests", count);
392+}
393+
394+int resend_cluster_requests(gd_ls_t *ls)
395+{
396+ gd_lkb_t *lkb, *safe;
397+ int error = 0, state, count = 0;
398+
399+ log_all(ls, "resend marked requests");
400+
401+ down(&_lockqueue_lock);
402+
403+ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
404+
405+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
406+ log_debug(ls, "resend_cluster_requests: aborted");
407+ error = -EINTR;
408+ break;
409+ }
410+
411+ if (lkb->lkb_resource->res_ls != ls)
412+ continue;
413+
414+ log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
415+ "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
416+ lkb->lkb_lockqueue_state, lkb->lkb_flags);
417+
418+ /*
419+		 * Resend/process the lockqueue lkb's (in-progress requests)
420+ * that were flagged at the start of recovery in
421+ * lockqueue_lkb_mark().
422+ */
423+
424+ if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
425+ lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
426+ lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
427+ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
428+
429+ if (lkb->lkb_nodeid == -1) {
430+ /*
431+ * Send lookup to new resdir node.
432+ */
433+ lkb->lkb_lockqueue_time = jiffies;
434+ send_cluster_request(lkb,
435+ lkb->lkb_lockqueue_state);
436+ }
437+
438+ else if (lkb->lkb_nodeid != 0) {
439+ /*
440+ * There's a new RSB master (that's not us.)
441+ */
442+ lkb->lkb_lockqueue_time = jiffies;
443+ send_cluster_request(lkb,
444+ lkb->lkb_lockqueue_state);
445+ }
446+
447+ else {
448+ /*
449+ * We are the new RSB master for this lkb
450+ * request.
451+ */
452+ state = lkb->lkb_lockqueue_state;
453+ lkb->lkb_lockqueue_state = 0;
454+ /* list_del equals remove_from_lockqueue() */
455+ list_del(&lkb->lkb_lockqueue);
456+ process_remastered_lkb(lkb, state);
457+ }
458+
459+ count++;
460+ }
461+ }
462+ up(&_lockqueue_lock);
463+
464+ log_all(ls, "resent %d requests", count);
465+ return error;
466+}
467+
468+/*
469+ * Process any LKBs on the lock queue. This just looks at the
470+ * entries to see if they have been on the queue too long and
471+ * fails the requests if so.
472+ */
473+
474+static void process_lockqueue(void)
475+{
476+ gd_lkb_t *lkb, *safe;
477+ gd_ls_t *ls;
478+ int count = 0;
479+
480+ down(&_lockqueue_lock);
481+
482+ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
483+ ls = lkb->lkb_resource->res_ls;
484+
485+ if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
486+ continue;
487+
488+ /* Don't time out locks that are in transition */
489+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
490+ continue;
491+
492+ if (check_timeout(lkb->lkb_lockqueue_time,
493+ dlm_config.lock_timeout)) {
494+ count++;
495+ list_del(&lkb->lkb_lockqueue);
496+ up(&_lockqueue_lock);
497+ cancel_lockop(lkb, -ETIMEDOUT);
498+ down(&_lockqueue_lock);
499+ }
500+ }
501+ up(&_lockqueue_lock);
502+
503+ if (count)
504+ wake_astd();
505+
506+ if (atomic_read(&_astd_running))
507+ mod_timer(&_lockqueue_timer,
508+ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
509+}
510+
511+/* Look for deadlocks */
512+static void process_deadlockqueue(void)
513+{
514+ gd_lkb_t *lkb, *safe;
515+
516+ down(&_deadlockqueue_lock);
517+
518+ list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
519+ gd_lkb_t *kill_lkb;
520+
521+ /* Only look at "due" locks */
522+ if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
523+ break;
524+
525+ /* Don't look at locks that are in transition */
526+ if (!test_bit(LSFL_LS_RUN,
527+ &lkb->lkb_resource->res_ls->ls_flags))
528+ continue;
529+
530+ up(&_deadlockqueue_lock);
531+
532+ /* Lock has hit due time, check for conversion deadlock */
533+ kill_lkb = conversion_deadlock_check(lkb);
534+ if (kill_lkb)
535+ cancel_conversion(kill_lkb, -EDEADLOCK);
536+
537+ down(&_deadlockqueue_lock);
538+ }
539+ up(&_deadlockqueue_lock);
540+}
541+
542+static __inline__ int no_asts(void)
543+{
544+ int ret;
545+
546+ down(&_ast_queue_lock);
547+ ret = list_empty(&_ast_queue);
548+ up(&_ast_queue_lock);
549+ return ret;
550+}
551+
552+static void lockqueue_timer_fn(unsigned long arg)
553+{
554+ set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
555+ wake_up(&_astd_waitchan);
556+}
557+
558+/*
559+ * DLM daemon which delivers asts.
560+ */
561+
562+static int dlm_astd(void *data)
563+{
564+ daemonize("dlm_astd");
565+
566+ INIT_LIST_HEAD(&_lockqueue);
567+ init_MUTEX(&_lockqueue_lock);
568+ INIT_LIST_HEAD(&_deadlockqueue);
569+ init_MUTEX(&_deadlockqueue_lock);
570+ INIT_LIST_HEAD(&_ast_queue);
571+ init_MUTEX(&_ast_queue_lock);
572+ init_waitqueue_head(&_astd_waitchan);
573+ complete(&_astd_done);
574+
575+ /*
576+ * Set a timer to check the lockqueue for dead locks (and deadlocks).
577+ */
578+
579+ init_timer(&_lockqueue_timer);
580+ _lockqueue_timer.function = lockqueue_timer_fn;
581+ _lockqueue_timer.data = 0;
582+ mod_timer(&_lockqueue_timer,
583+ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
584+
585+ while (atomic_read(&_astd_running)) {
586+ wchan_cond_sleep_intr(_astd_waitchan, no_asts());
587+
588+ if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
589+ process_asts();
590+
591+ if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
592+ process_lockqueue();
593+ if (dlm_config.deadlocktime)
594+ process_deadlockqueue();
595+ }
596+ }
597+
598+ if (timer_pending(&_lockqueue_timer))
599+ del_timer(&_lockqueue_timer);
600+
601+ complete(&_astd_done);
602+
603+ return 0;
604+}
605+
606+void wake_astd(void)
607+{
608+ set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
609+ wake_up(&_astd_waitchan);
610+}
611+
612+int astd_start()
613+{
614+ init_completion(&_astd_done);
615+ atomic_set(&_astd_running, 1);
616+ _astd_pid = kernel_thread(dlm_astd, NULL, 0);
617+ wait_for_completion(&_astd_done);
618+ return 0;
619+}
620+
621+void astd_stop()
622+{
623+ atomic_set(&_astd_running, 0);
624+ wake_astd();
625+ wait_for_completion(&_astd_done);
626+}
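
astd_start() and astd_stop() bracket the life of the dlm_astd thread: start forks it with kernel_thread() and blocks on _astd_done until the thread has initialised, while stop clears _astd_running, wakes the thread and waits for it to exit. The real calls live in main.c later in this patch; the sketch below only illustrates that start/stop pairing and is not part of the patch itself.

static int __init example_dlm_init(void)
{
	/* must run before any lockspace can queue ASTs */
	return astd_start();
}

static void __exit example_dlm_exit(void)
{
	/* wakes dlm_astd and waits for it to finish */
	astd_stop();
}
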
627diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
628--- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
629+++ linux-patched/cluster/dlm/ast.h 2004-06-29 20:01:19.000000000 +0800
630@@ -0,0 +1,28 @@
4bf12011 631+/******************************************************************************
632+*******************************************************************************
633+**
634+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
635+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
636+**
637+** This copyrighted material is made available to anyone wishing to use,
638+** modify, copy, or redistribute it subject to the terms and conditions
639+** of the GNU General Public License v.2.
640+**
641+*******************************************************************************
642+******************************************************************************/
643+
644+#ifndef __AST_DOT_H__
645+#define __AST_DOT_H__
646+
647+void lockqueue_lkb_mark(gd_ls_t *ls);
648+int resend_cluster_requests(gd_ls_t *ls);
649+void add_to_lockqueue(gd_lkb_t *lkb);
650+void remove_from_lockqueue(gd_lkb_t *lkb);
651+void add_to_deadlockqueue(gd_lkb_t *lkb);
652+void remove_from_deadlockqueue(gd_lkb_t *lkb);
653+void queue_ast(gd_lkb_t *lkb, uint16_t astflags, uint8_t rqmode);
4bf12011 654+void wake_astd(void);
655+int astd_start(void);
656+void astd_stop(void);
657+
658+#endif /* __AST_DOT_H__ */
659diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
660--- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 661+++ linux-patched/cluster/dlm/config.c 2004-06-29 20:01:19.000000000 +0800
4bf12011 662@@ -0,0 +1,125 @@
663+/******************************************************************************
664+*******************************************************************************
665+**
666+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
667+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
668+**
669+** This copyrighted material is made available to anyone wishing to use,
670+** modify, copy, or redistribute it subject to the terms and conditions
671+** of the GNU General Public License v.2.
672+**
673+*******************************************************************************
674+******************************************************************************/
675+
676+#include <linux/module.h>
677+#include <linux/proc_fs.h>
678+
679+#include "dlm_internal.h"
680+#include "lowcomms.h"
681+#include "config.h"
682+
683+/* Config file defaults */
684+#define DEFAULT_TCP_PORT 21064
685+#define DEFAULT_LOCK_TIMEOUT 30
686+#define DEFAULT_BUFFER_SIZE 4096
687+#define DEFAULT_RESHASHTBL 256
688+#define DEFAULT_LOCKIDTBL 1024
689+#define DEFAULT_MAX_CONNECTIONS 128
690+#define DEFAULT_DEADLOCKTIME 10
691+
692+struct config_info dlm_config = {
693+ .tcp_port = DEFAULT_TCP_PORT,
694+ .lock_timeout = DEFAULT_LOCK_TIMEOUT,
695+ .buffer_size = DEFAULT_BUFFER_SIZE,
696+ .reshashtbl = DEFAULT_RESHASHTBL,
697+ .lockidtbl = DEFAULT_LOCKIDTBL,
698+ .max_connections = DEFAULT_MAX_CONNECTIONS,
699+ .deadlocktime = DEFAULT_DEADLOCKTIME,
700+};
701+
702+
703+static struct config_proc_info {
704+ char *name;
705+ int *value;
706+} config_proc[] = {
707+ {
708+ .name = "tcp_port",
709+ .value = &dlm_config.tcp_port,
710+ },
711+ {
712+ .name = "lock_timeout",
713+ .value = &dlm_config.lock_timeout,
714+ },
715+ {
716+ .name = "buffer_size",
717+ .value = &dlm_config.buffer_size,
718+ },
719+ {
720+ .name = "reshashtbl",
721+ .value = &dlm_config.reshashtbl,
722+ },
723+ {
724+ .name = "lockidtbl",
725+ .value = &dlm_config.lockidtbl,
726+ },
727+ {
728+ .name = "max_connections",
729+ .value = &dlm_config.max_connections,
730+ },
731+ {
732+ .name = "deadlocktime",
733+ .value = &dlm_config.deadlocktime,
734+ },
735+};
736+static struct proc_dir_entry *dlm_dir;
737+
738+static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
739+ int *eof, void *data)
740+{
741+ struct config_proc_info *cinfo = data;
742+ return snprintf(page, count, "%d\n", *cinfo->value);
743+}
744+
745+static int dlm_config_write_proc(struct file *file, const char *buffer,
746+ unsigned long count, void *data)
747+{
748+ struct config_proc_info *cinfo = data;
749+ int value;
750+ char *end;
751+
752+ value = simple_strtoul(buffer, &end, 10);
753+ if (*end)
754+ *cinfo->value = value;
755+ return count;
756+}
757+
758+int dlm_config_init(void)
759+{
760+ int i;
761+ struct proc_dir_entry *pde;
762+
763+ dlm_dir = proc_mkdir("cluster/config/dlm", 0);
764+ if (!dlm_dir)
765+ return -1;
766+
767+ dlm_dir->owner = THIS_MODULE;
768+
769+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
770+ pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
771+ if (pde) {
772+ pde->data = &config_proc[i];
773+ pde->write_proc = dlm_config_write_proc;
774+ pde->read_proc = dlm_config_read_proc;
775+ }
776+ }
777+ return 0;
778+}
779+
780+void dlm_config_exit(void)
781+{
782+ int i;
783+
784+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
785+ remove_proc_entry(config_proc[i].name, dlm_dir);
786+ remove_proc_entry("cluster/config/dlm", NULL);
787+}
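
dlm_config_init() above exposes each entry of config_proc[] as a writable file under /proc/cluster/config/dlm/. A rough userspace sketch of reading and adjusting one tunable follows; the value 60 is only an example, and note that as the write handler stands, the value is only stored when something (such as a newline) follows the digits.

#include <stdio.h>

int main(void)
{
	int timeout = 0;
	FILE *f;

	f = fopen("/proc/cluster/config/dlm/lock_timeout", "r");
	if (f) {
		if (fscanf(f, "%d", &timeout) == 1)
			printf("current lock_timeout: %d\n", timeout);
		fclose(f);
	}

	f = fopen("/proc/cluster/config/dlm/lock_timeout", "w");
	if (f) {
		fprintf(f, "60\n");	/* example value; the newline matters */
		fclose(f);
	}
	return 0;
}
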
788diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
789--- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 790+++ linux-patched/cluster/dlm/config.h 2004-06-29 20:01:19.000000000 +0800
4bf12011 791@@ -0,0 +1,31 @@
792+/******************************************************************************
793+*******************************************************************************
794+**
795+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
796+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
797+**
798+** This copyrighted material is made available to anyone wishing to use,
799+** modify, copy, or redistribute it subject to the terms and conditions
800+** of the GNU General Public License v.2.
801+**
802+*******************************************************************************
803+******************************************************************************/
804+
805+#ifndef __CONFIG_DOT_H__
806+#define __CONFIG_DOT_H__
807+
808+struct config_info {
809+ int tcp_port;
810+ int lock_timeout;
811+ int buffer_size;
812+ int reshashtbl;
813+ int lockidtbl;
814+ int max_connections;
815+ int deadlocktime;
816+};
817+
818+extern struct config_info dlm_config;
819+extern int dlm_config_init(void);
820+extern void dlm_config_exit(void);
821+
822+#endif /* __CONFIG_DOT_H__ */
823diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
824--- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 825+++ linux-patched/cluster/dlm/device.c 2004-06-29 20:01:19.000000000 +0800
4bf12011 826@@ -0,0 +1,1020 @@
827+/******************************************************************************
828+*******************************************************************************
829+**
830+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
831+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
832+**
833+** This copyrighted material is made available to anyone wishing to use,
834+** modify, copy, or redistribute it subject to the terms and conditions
835+** of the GNU General Public License v.2.
836+**
837+*******************************************************************************
838+******************************************************************************/
839+
840+/*
841+ * device.c
842+ *
843+ * This is the userland interface to the DLM.
844+ *
845+ * The locking is done via a misc char device (find the
846+ * registered minor number in /proc/misc).
847+ *
848+ * User code should not use this interface directly but
849+ * call the library routines in libdlm.a instead.
850+ *
851+ */
852+
853+#include <linux/miscdevice.h>
854+#include <linux/init.h>
855+#include <linux/wait.h>
856+#include <linux/module.h>
857+#include <linux/file.h>
858+#include <linux/fs.h>
859+#include <linux/poll.h>
860+#include <linux/signal.h>
861+#include <linux/spinlock.h>
862+#include <asm/ioctls.h>
863+
864+#include "dlm_internal.h"
865+#include "device.h"
866+
867+extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int);
868+static struct file_operations _dlm_fops;
869+static const char *name_prefix="dlm";
870+static struct list_head user_ls_list;
871+
872+/* Flags in li_flags */
873+#define LI_FLAG_COMPLETE 1
874+#define LI_FLAG_FIRSTLOCK 2
875+
876+struct lock_info {
877+ uint8_t li_cmd;
878+ struct dlm_lksb li_lksb;
879+ wait_queue_head_t li_waitq;
880+ unsigned long li_flags;
881+ void __user *li_astparam;
882+ void __user *li_astaddr;
883+ void __user *li_bastaddr;
884+ struct file_info *li_file;
885+ struct dlm_lksb __user *li_user_lksb;
886+ struct semaphore li_firstlock;
887+ struct dlm_queryinfo *li_queryinfo;
888+ struct dlm_queryinfo __user *li_user_queryinfo;
889+};
890+
891+/* A queued AST no less */
892+struct ast_info {
893+ struct dlm_lock_result result;
894+ struct dlm_queryinfo *queryinfo;
895+ struct dlm_queryinfo __user *user_queryinfo;
896+ struct list_head list;
897+};
898+
899+/* One of these per userland lockspace */
900+struct user_ls {
901+ void *ls_lockspace;
902+ atomic_t ls_refcnt;
903+ long ls_flags; /* bit 1 means LS has been deleted */
904+
905+ /* Passed into misc_register() */
906+ struct miscdevice ls_miscinfo;
907+ struct list_head ls_list;
908+};
909+
910+/* misc_device info for the control device */
911+static struct miscdevice ctl_device;
912+
913+/*
914+ * Stuff we hang off the file struct.
915+ * The first two are to cope with unlocking all the
916+ * locks held by a process when it dies.
917+ */
918+struct file_info {
919+ struct list_head fi_lkb_list; /* List of active lkbs */
920+ spinlock_t fi_lkb_lock;
921+ struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
922+ spinlock_t fi_ast_lock;
923+ wait_queue_head_t fi_wait;
924+ struct user_ls *fi_ls;
925+ atomic_t fi_refcnt; /* Number of users */
926+ unsigned long fi_flags; /* Bit 1 means the device is open */
927+};
928+
929+
930+/* get and put ops for file_info.
931+ Actually I don't really like "get" and "put", but everyone
932+ else seems to use them and I can't think of anything
933+ nicer at the moment */
934+static void get_file_info(struct file_info *f)
935+{
936+ atomic_inc(&f->fi_refcnt);
937+}
938+
939+static void put_file_info(struct file_info *f)
940+{
941+ if (atomic_dec_and_test(&f->fi_refcnt))
942+ kfree(f);
943+}
944+
945+/* Find a lockspace struct given the device minor number */
946+static struct user_ls *find_lockspace(int minor)
947+{
948+ struct user_ls *lsinfo;
949+
950+ list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
951+
952+ if (lsinfo->ls_miscinfo.minor == minor)
953+ return lsinfo;
954+ }
955+ return NULL;
956+}
957+
958+static void add_lockspace_to_list(struct user_ls *lsinfo)
959+{
960+ list_add(&lsinfo->ls_list, &user_ls_list);
961+}
962+
963+/* Register a lockspace with the DLM and create a misc
964+ device for userland to access it */
965+static int register_lockspace(char *name, struct user_ls **ls)
966+{
967+ struct user_ls *newls;
968+ int status;
969+ int namelen;
970+
971+ namelen = strlen(name)+strlen(name_prefix)+2;
972+
973+ newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
974+ if (!newls)
975+ return -ENOMEM;
976+ memset(newls, 0, sizeof(struct user_ls));
977+
978+ newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
979+ if (!newls->ls_miscinfo.name) {
980+ kfree(newls);
981+ return -ENOMEM;
982+ }
983+ snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
984+
985+ status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
986+ strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
987+ &newls->ls_lockspace, 0);
988+
989+ if (status != 0) {
990+ kfree(newls->ls_miscinfo.name);
991+ kfree(newls);
992+ return status;
993+ }
994+
995+ newls->ls_miscinfo.fops = &_dlm_fops;
996+ newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
997+
998+ status = misc_register(&newls->ls_miscinfo);
999+ if (status) {
1000+ log_print("failed to register misc device for %s", name);
1001+ dlm_release_lockspace(newls->ls_lockspace, 0);
1002+ kfree(newls->ls_miscinfo.name);
1003+ kfree(newls);
1004+ return status;
1005+ }
1006+
1007+
1008+ add_lockspace_to_list(newls);
1009+ *ls = newls;
1010+ return 0;
1011+}
1012+
1013+static int unregister_lockspace(struct user_ls *lsinfo, int force)
1014+{
1015+ int status;
1016+
1017+ status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1018+ if (status)
1019+ return status;
1020+
1021+ status = misc_deregister(&lsinfo->ls_miscinfo);
1022+ if (status)
1023+ return status;
1024+
1025+ list_del(&lsinfo->ls_list);
1026+ kfree(lsinfo->ls_miscinfo.name);
1027+ kfree(lsinfo);
1028+
1029+ return 0;
1030+}
1031+
1032+/* Add it to userland's AST queue */
1033+static void add_to_astqueue(struct lock_info *li, void *astaddr)
1034+{
1035+ struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1036+ if (!ast)
1037+ return;
1038+
1039+ ast->result.astparam = li->li_astparam;
1040+ ast->result.astaddr = astaddr;
1041+ ast->result.user_lksb = li->li_user_lksb;
1042+ ast->result.cmd = li->li_cmd;
1043+ memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1044+
1045+ /* These two will both be NULL for anything other than queries */
1046+ ast->queryinfo = li->li_queryinfo;
1047+ ast->user_queryinfo = li->li_user_queryinfo;
1048+
1049+ spin_lock(&li->li_file->fi_ast_lock);
1050+ list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1051+ spin_unlock(&li->li_file->fi_ast_lock);
1052+ wake_up_interruptible(&li->li_file->fi_wait);
1053+}
1054+
1055+static void bast_routine(void *param, int mode)
1056+{
1057+ struct lock_info *li = param;
1058+
1059+ if (param) {
1060+ add_to_astqueue(li, li->li_bastaddr);
1061+ }
1062+}
1063+
1064+/*
1065+ * This is the kernel's AST routine.
1066+ * All lock, unlock & query operations complete here.
1067+ * The only synchronous ops are those done during device close.
1068+ */
1069+static void ast_routine(void *param)
1070+{
1071+ struct lock_info *li = param;
1072+
1073+ /* Param may be NULL if a persistent lock is unlocked by someone else */
1074+ if (!param)
1075+ return;
1076+
1077+ /* If it's an async request then post data to the user's AST queue. */
1078+ if (li->li_astaddr) {
1079+
1080+ /* Only queue AST if the device is still open */
1081+ if (test_bit(1, &li->li_file->fi_flags))
1082+ add_to_astqueue(li, li->li_astaddr);
1083+
1084+ /* If it's a new lock operation that failed, then
1085+ * remove it from the owner queue and free the
1086+ * lock_info. The DLM will not free the LKB until this
1087+ * AST has completed.
1088+ */
1089+ if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1090+ li->li_lksb.sb_status != 0) {
1091+ gd_lkb_t *lkb;
1092+
1093+ /* Wait till dlm_lock() has finished */
1094+ down(&li->li_firstlock);
1095+ lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1096+ if (lkb) {
1097+ spin_lock(&li->li_file->fi_lkb_lock);
1098+ list_del(&lkb->lkb_ownerqueue);
1099+ spin_unlock(&li->li_file->fi_lkb_lock);
1100+ }
1101+ up(&li->li_firstlock);
1102+ put_file_info(li->li_file);
1103+ kfree(li);
1104+ return;
1105+ }
1106+ /* Free unlocks & queries */
1107+ if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1108+ li->li_cmd == DLM_USER_QUERY) {
1109+ put_file_info(li->li_file);
1110+ kfree(li);
1111+ }
1112+ }
1113+ else {
1114+		/* Synchronous request, just wake up the caller */
1115+ set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1116+ wake_up_interruptible(&li->li_waitq);
1117+ }
1118+}
1119+
1120+/*
1121+ * Wait for the lock op to complete and return the status.
1122+ */
1123+static int wait_for_ast(struct lock_info *li)
1124+{
1125+ /* Wait for the AST routine to complete */
1126+ set_task_state(current, TASK_INTERRUPTIBLE);
1127+ while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1128+ schedule();
1129+
1130+ set_task_state(current, TASK_RUNNING);
1131+
1132+ return li->li_lksb.sb_status;
1133+}
1134+
1135+
1136+/* Open on control device */
1137+static int dlm_ctl_open(struct inode *inode, struct file *file)
1138+{
1139+ return 0;
1140+}
1141+
1142+/* Close on control device */
1143+static int dlm_ctl_close(struct inode *inode, struct file *file)
1144+{
1145+ return 0;
1146+}
1147+
1148+/* Open on lockspace device */
1149+static int dlm_open(struct inode *inode, struct file *file)
1150+{
1151+ struct file_info *f;
1152+ struct user_ls *lsinfo;
1153+
1154+ lsinfo = find_lockspace(iminor(inode));
1155+ if (!lsinfo)
1156+ return -ENOENT;
1157+
1158+ f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1159+ if (!f)
1160+ return -ENOMEM;
1161+
1162+ atomic_inc(&lsinfo->ls_refcnt);
1163+ INIT_LIST_HEAD(&f->fi_lkb_list);
1164+ INIT_LIST_HEAD(&f->fi_ast_list);
1165+ spin_lock_init(&f->fi_ast_lock);
1166+ spin_lock_init(&f->fi_lkb_lock);
1167+ init_waitqueue_head(&f->fi_wait);
1168+ f->fi_ls = lsinfo;
1169+ atomic_set(&f->fi_refcnt, 1);
1170+ set_bit(1, &f->fi_flags);
1171+
1172+ file->private_data = f;
1173+
1174+ return 0;
1175+}
1176+
1177+/* Check the user's version matches ours */
1178+static int check_version(struct dlm_lock_params *params)
1179+{
1180+ if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1181+ (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1182+ params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1183+
1184+ log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1185+ params->version[0],
1186+ params->version[1],
1187+ params->version[2],
1188+ DLM_DEVICE_VERSION_MAJOR,
1189+ DLM_DEVICE_VERSION_MINOR,
1190+ DLM_DEVICE_VERSION_PATCH);
1191+ return -EINVAL;
1192+ }
1193+ return 0;
1194+}
1195+
1196+/* Close on lockspace device */
1197+static int dlm_close(struct inode *inode, struct file *file)
1198+{
1199+ struct file_info *f = file->private_data;
1200+ struct lock_info li;
1201+ sigset_t tmpsig;
1202+ sigset_t allsigs;
1203+ gd_lkb_t *lkb, *safe;
1204+ struct user_ls *lsinfo;
1205+ DECLARE_WAITQUEUE(wq, current);
1206+
1207+ lsinfo = find_lockspace(iminor(inode));
1208+ if (!lsinfo)
1209+ return -ENOENT;
1210+
1211+ /* Mark this closed so that ASTs will not be delivered any more */
1212+ clear_bit(1, &f->fi_flags);
1213+
1214+ /* Block signals while we are doing this */
1215+ sigfillset(&allsigs);
1216+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1217+
1218+ /* We use our own lock_info struct here, so that any
1219+ * outstanding "real" ASTs will be delivered with the
1220+ * corresponding "real" params, thus freeing the lock_info
1221+	 * that belongs to the lock. This catches the corner case where
1222+ * a lock is BUSY when we try to unlock it here
1223+ */
1224+ memset(&li, 0, sizeof(li));
1225+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1226+ init_waitqueue_head(&li.li_waitq);
1227+ add_wait_queue(&li.li_waitq, &wq);
1228+
1229+ /*
1230+	 * Free any outstanding locks; they are on the
1231+	 * list in LIFO order, so there should be no problem
1232+	 * unlocking parents before children.
1233+ * Although we don't remove the lkbs from the list here
1234+ * (what would be the point?), foreach_safe is needed
1235+ * because the lkbs are freed during dlm_unlock operations
1236+ */
1237+ list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
1238+ int status;
1239+ int lock_status;
1240+ int flags = 0;
1241+ struct lock_info *old_li;
1242+
1243+ /* Make a copy of this pointer. If all goes well we will
1244+ * free it later. if not it will be left to the AST routine
1245+ * to tidy up
1246+ */
1247+ old_li = (struct lock_info *)lkb->lkb_astparam;
1248+
1249+ /* Don't unlock persistent locks */
1250+ if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1251+ list_del(&lkb->lkb_ownerqueue);
1252+
1253+ /* But tidy our references in it */
1254+ kfree(old_li);
1255+ lkb->lkb_astparam = (long)NULL;
1256+ put_file_info(f);
1257+ continue;
1258+ }
1259+
1260+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1261+
1262+ /* If it's not granted then cancel the request.
1263+ * If the lock was WAITING then it will be dropped,
1264+ * if it was converting then it will be reverted to GRANTED,
1265+ * then we will unlock it.
1266+ */
1267+ lock_status = lkb->lkb_status;
1268+
1269+ if (lock_status != GDLM_LKSTS_GRANTED)
1270+ flags = DLM_LKF_CANCEL;
1271+
1272+ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1273+
1274+ /* Must wait for it to complete as the next lock could be its
1275+ * parent */
1276+ if (status == 0)
1277+ wait_for_ast(&li);
1278+
1279+ /* If it was waiting for a conversion, it will
1280+ now be granted so we can unlock it properly */
1281+ if (lock_status == GDLM_LKSTS_CONVERT) {
1282+
1283+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1284+ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
1285+
1286+ if (status == 0)
1287+ wait_for_ast(&li);
1288+ }
1289+		/* Unlock succeeded, free the lock_info struct. */
1290+ if (status == 0) {
1291+ kfree(old_li);
1292+ put_file_info(f);
1293+ }
1294+ }
1295+
1296+ remove_wait_queue(&li.li_waitq, &wq);
1297+
1298+ /* If this is the last reference, and the lockspace has been deleted
1299+	   then free the struct */
1300+ if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1301+ kfree(lsinfo);
1302+ }
1303+
1304+ /* Restore signals */
1305+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1306+ recalc_sigpending();
1307+
1308+ return 0;
1309+}
1310+
1311+/*
1312+ * ioctls to create/remove lockspaces, and check how many
1313+ * outstanding ASTs there are against a particular LS.
1314+ */
1315+static int dlm_ioctl(struct inode *inode, struct file *file,
1316+ uint command, ulong u)
1317+{
1318+ struct file_info *fi = file->private_data;
1319+ int status = -EINVAL;
1320+ int count;
1321+ struct list_head *tmp_list;
1322+
1323+ switch (command) {
1324+
1325+ /* Are there any ASTs for us to read?
1326+ * Warning, this returns the number of messages (ASTs)
1327+ * in the queue, NOT the number of bytes to read
1328+ */
1329+ case FIONREAD:
1330+ count = 0;
1331+ spin_lock(&fi->fi_ast_lock);
1332+ list_for_each(tmp_list, &fi->fi_ast_list)
1333+ count++;
1334+ spin_unlock(&fi->fi_ast_lock);
1335+ status = put_user(count, (int *)u);
1336+ break;
1337+
1338+ default:
1339+ return -ENOTTY;
1340+ }
1341+
1342+ return status;
1343+}
1344+
1345+/*
1346+ * ioctls to create/remove lockspaces.
1347+ */
1348+static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1349+ uint command, ulong u)
1350+{
1351+ int status = -EINVAL;
1352+ char ls_name[MAX_LS_NAME_LEN];
1353+ struct user_ls *lsinfo;
1354+ int force = 0;
1355+
1356+ switch (command) {
1357+ case DLM_CREATE_LOCKSPACE:
1358+ if (!capable(CAP_SYS_ADMIN))
1359+ return -EPERM;
1360+
1361+ if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1362+ return -EFAULT;
1363+ status = register_lockspace(ls_name, &lsinfo);
1364+
1365+ /* If it succeeded then return the minor number */
1366+ if (status == 0)
1367+ status = lsinfo->ls_miscinfo.minor;
1368+ break;
1369+
1370+ case DLM_FORCE_RELEASE_LOCKSPACE:
1371+ force = 2;
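1371+		/* fall through */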
1372+
1373+ case DLM_RELEASE_LOCKSPACE:
1374+ if (!capable(CAP_SYS_ADMIN))
1375+ return -EPERM;
1376+
1377+ lsinfo = find_lockspace(u);
1378+ if (!lsinfo)
1379+ return -EINVAL;
1380+ status = unregister_lockspace(lsinfo, force);
1381+ break;
1382+
1383+ default:
1384+ return -ENOTTY;
1385+ }
1386+
1387+ return status;
1388+}
1389+
1390+/* Deal with the messy stuff of copying a web of structs
1391+ from kernel space to userspace */
1392+static int copy_query_result(struct ast_info *ast)
1393+{
1394+ int status = -EFAULT;
1395+ struct dlm_queryinfo qi;
1396+
1397+ /* Get the pointers to userspace structs */
1398+ if (copy_from_user(&qi, ast->user_queryinfo,
1399+ sizeof(struct dlm_queryinfo)))
1400+ goto copy_out;
1401+
1402+ /* TODO: does this deref a user pointer? */
1403+ if (put_user(ast->queryinfo->gqi_lockcount,
1404+ &ast->user_queryinfo->gqi_lockcount))
1405+ goto copy_out;
1406+
1407+ if (qi.gqi_resinfo) {
1408+ if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1409+ sizeof(struct dlm_resinfo)))
1410+ goto copy_out;
1411+ }
1412+
1413+ if (qi.gqi_lockinfo) {
1414+ if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1415+ sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1416+ goto copy_out;
1417+ }
1418+
1419+ status = 0;
1420+
1421+ if (ast->queryinfo->gqi_lockinfo)
1422+ kfree(ast->queryinfo->gqi_lockinfo);
1423+
1424+ if (ast->queryinfo->gqi_resinfo)
1425+ kfree(ast->queryinfo->gqi_resinfo);
1426+
1427+ kfree(ast->queryinfo);
1428+
1429+ copy_out:
1430+ return status;
1431+}
1432+
1433+/* Read call, might block if no ASTs are waiting.
1434+ * It will only ever return one message at a time, regardless
1435+ * of how many are pending.
1436+ */
1437+static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1438+{
1439+ struct file_info *fi = file->private_data;
1440+ struct ast_info *ast;
1441+ int ret;
1442+ DECLARE_WAITQUEUE(wait, current);
1443+
1444+ if (count < sizeof(struct dlm_lock_result))
1445+ return -EINVAL;
1446+
1447+ spin_lock(&fi->fi_ast_lock);
1448+ if (list_empty(&fi->fi_ast_list)) {
1449+
1450+ /* No waiting ASTs.
1451+		 * Return EOF if the lockspace has been deleted.
1452+ */
1453+		if (test_bit(1, &fi->fi_ls->ls_flags)) {
1454+			/* don't return with fi_ast_lock held */
1455+			spin_unlock(&fi->fi_ast_lock);
1456+			return 0;
1456+		}
1455+
1456+ if (file->f_flags & O_NONBLOCK) {
1457+ spin_unlock(&fi->fi_ast_lock);
1458+ return -EAGAIN;
1459+ }
1460+
1461+ add_wait_queue(&fi->fi_wait, &wait);
1462+
1463+ repeat:
1464+ set_current_state(TASK_INTERRUPTIBLE);
1465+ if (list_empty(&fi->fi_ast_list) &&
1466+ !signal_pending(current)) {
1467+
1468+ spin_unlock(&fi->fi_ast_lock);
1469+ schedule();
1470+ spin_lock(&fi->fi_ast_lock);
1471+ goto repeat;
1472+ }
1473+
1474+ current->state = TASK_RUNNING;
1475+ remove_wait_queue(&fi->fi_wait, &wait);
1476+
1477+ if (signal_pending(current)) {
1478+ spin_unlock(&fi->fi_ast_lock);
1479+ return -ERESTARTSYS;
1480+ }
1481+ }
1482+
1483+ ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1484+ list_del(&ast->list);
1485+ spin_unlock(&fi->fi_ast_lock);
1486+
1487+ ret = sizeof(struct dlm_lock_result);
1488+ if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1489+ ret = -EFAULT;
1490+
1491+ /* If it was a query then copy the result block back here */
1492+ if (ast->queryinfo) {
1493+ int status = copy_query_result(ast);
1494+ if (status)
1495+ ret = status;
1496+ }
1497+
1498+ kfree(ast);
1499+ return ret;
1500+}
1501+
1502+static unsigned int dlm_poll(struct file *file, poll_table *wait)
1503+{
1504+ struct file_info *fi = file->private_data;
1505+
1506+ poll_wait(file, &fi->fi_wait, wait);
1507+
1508+ spin_lock(&fi->fi_ast_lock);
1509+ if (!list_empty(&fi->fi_ast_list)) {
1510+ spin_unlock(&fi->fi_ast_lock);
1511+ return POLLIN | POLLRDNORM;
1512+ }
1513+
1514+ spin_unlock(&fi->fi_ast_lock);
1515+ return 0;
1516+}
1517+
1518+static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1519+{
1520+ struct lock_info *li;
1521+ int status;
1522+
1523+ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1524+ if (!li)
1525+ return -ENOMEM;
1526+
1527+ get_file_info(fi);
1528+ li->li_user_lksb = kparams->lksb;
1529+ li->li_astparam = kparams->astparam;
1530+ li->li_bastaddr = kparams->bastaddr;
1531+ li->li_astaddr = kparams->astaddr;
1532+ li->li_file = fi;
1533+ li->li_flags = 0;
1534+ li->li_cmd = kparams->cmd;
1535+ clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1536+
1537+ if (copy_from_user(&li->li_lksb, kparams->lksb,
1538+ sizeof(struct dlm_lksb))) {
1539+ kfree(li);
1540+ return -EFAULT;
1541+ }
1542+ li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1543+
1544+ /* Allocate query structs */
1545+ status = -ENOMEM;
1546+ li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1547+ if (!li->li_queryinfo)
1548+ goto out1;
1549+
1550+ /* Mainly to get gqi_lock buffer size */
1551+ if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1552+ sizeof(struct dlm_queryinfo))) {
1553+ status = -EFAULT;
1554+ goto out1;
1555+ }
1556+
1557+ /* Overwrite userspace pointers we just copied with kernel space ones */
1558+ if (li->li_queryinfo->gqi_resinfo) {
1559+ li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1560+ if (!li->li_queryinfo->gqi_resinfo)
1561+ goto out1;
1562+ }
1563+ if (li->li_queryinfo->gqi_lockinfo) {
1564+ li->li_queryinfo->gqi_lockinfo =
1565+ kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1566+ GFP_KERNEL);
1567+ if (!li->li_queryinfo->gqi_lockinfo)
1568+ goto out2;
1569+ }
1570+
1571+ li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1572+
1573+ return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1574+ kparams->flags, /* query */
1575+ li->li_queryinfo,
1576+ ast_routine, li);
1577+
1578+ out2:
1579+ kfree(li->li_queryinfo);
1580+
1581+ out1:
1582+ kfree(li);
1583+ return status;
1584+}
1585+
1586+static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1587+ const char *buffer)
1588+{
1589+ struct lock_info *li;
1590+ int status;
1591+ char name[DLM_RESNAME_MAXLEN];
1592+
1593+ /*
1594+ * Validate things that we need to have correct.
1595+ */
1596+ if (kparams->namelen > DLM_RESNAME_MAXLEN)
1597+ return -EINVAL;
1598+
1599+ if (!kparams->astaddr)
1600+ return -EINVAL;
1601+
1602+ if (!kparams->lksb)
1603+ return -EINVAL;
1604+
1605+ /* Get the lock name */
1606+ if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1607+ kparams->namelen)) {
1608+ return -EFAULT;
1609+ }
1610+
1611+ /* For conversions, the lock will already have a lock_info
1612+	   block squirrelled away in astparam */
1613+ if (kparams->flags & DLM_LKF_CONVERT) {
1614+ gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1615+ if (!lkb) {
1616+ return -EINVAL;
1617+ }
1618+ li = (struct lock_info *)lkb->lkb_astparam;
1619+
1620+ /* Only override these if they are provided */
1621+ if (li->li_user_lksb)
1622+ li->li_user_lksb = kparams->lksb;
1623+ if (li->li_astparam)
1624+ li->li_astparam = kparams->astparam;
1625+ if (li->li_bastaddr)
1626+ li->li_bastaddr = kparams->bastaddr;
1627+		if (li->li_astaddr)
1628+ li->li_astaddr = kparams->astaddr;
1629+ li->li_flags = 0;
1630+ }
1631+ else {
1632+ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1633+ if (!li)
1634+ return -ENOMEM;
1635+
1636+ li->li_user_lksb = kparams->lksb;
1637+ li->li_astparam = kparams->astparam;
1638+ li->li_bastaddr = kparams->bastaddr;
1639+ li->li_astaddr = kparams->astaddr;
1640+ li->li_file = fi;
1641+ li->li_flags = 0;
1642+ li->li_cmd = kparams->cmd;
1643+ li->li_queryinfo = NULL;
1644+
1645+ /* semaphore to allow us to complete our work before
1646+ the AST routine runs. In fact we only need (and use) this
1647+ when the initial lock fails */
1648+ init_MUTEX_LOCKED(&li->li_firstlock);
1649+ set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1650+
1651+ get_file_info(fi);
1652+ }
1653+
1654+ /* Copy the user's LKSB into kernel space,
1655+ needed for conversions & value block operations */
1656+ if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
1657+ sizeof(struct dlm_lksb)))
1658+ return -EFAULT;
1659+
1660+ /* Lock it ... */
1661+ status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1662+ kparams->flags, name, kparams->namelen,
1663+ kparams->parent,
1664+ ast_routine,
1665+ li,
1666+ li->li_bastaddr ? bast_routine : NULL,
1667+ kparams->range.ra_end ? &kparams->range : NULL);
1668+
1669+ /* If it succeeded (this far) with a new lock then keep track of
1670+ it on the file's lkb list */
1671+ if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1672+ gd_lkb_t *lkb;
1673+ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1674+
1675+ if (lkb) {
1676+ spin_lock(&fi->fi_lkb_lock);
1677+ list_add(&lkb->lkb_ownerqueue,
1678+ &fi->fi_lkb_list);
1679+ spin_unlock(&fi->fi_lkb_lock);
1680+ }
1681+ else {
1682+ log_print("failed to get lkb for new lock");
1683+ }
1684+ up(&li->li_firstlock);
1685+ }
1686+
1687+ return status;
1688+}
1689+
1690+static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1691+{
1692+ struct lock_info *li;
1693+ gd_lkb_t *lkb;
1694+ int status;
1695+
1696+ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1697+ if (!lkb) {
1698+ return -EINVAL;
1699+ }
1700+
1701+ li = (struct lock_info *)lkb->lkb_astparam;
1702+
1703+ li->li_user_lksb = kparams->lksb;
1704+ li->li_astparam = kparams->astparam;
1705+ li->li_cmd = kparams->cmd;
1706+
1707+	/* Have to do it here because the lkb may not exist after
1708+ * dlm_unlock() */
1709+ spin_lock(&fi->fi_lkb_lock);
1710+ list_del(&lkb->lkb_ownerqueue);
1711+ spin_unlock(&fi->fi_lkb_lock);
1712+
1713+ /* Use existing lksb & astparams */
1714+ status = dlm_unlock(fi->fi_ls->ls_lockspace,
1715+ kparams->lkid,
1716+ kparams->flags, NULL, NULL);
1717+
1718+ return status;
1719+}
1720+
1721+/* Write call, submit a locking request */
1722+static ssize_t dlm_write(struct file *file, const char __user *buffer,
1723+ size_t count, loff_t *ppos)
1724+{
1725+ struct file_info *fi = file->private_data;
1726+ struct dlm_lock_params kparams;
1727+ sigset_t tmpsig;
1728+ sigset_t allsigs;
1729+ int status;
1730+
1731+ if (count < sizeof(kparams))
1732+ return -EINVAL;
1733+
1734+ /* Has the lockspace been deleted */
1735+ if (test_bit(1, &fi->fi_ls->ls_flags))
1736+ return -ENOENT;
1737+
1738+ /* Get the command info */
1739+ if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1740+ return -EFAULT;
1741+
1742+ if (check_version(&kparams))
1743+ return -EINVAL;
1744+
1745+ /* Block signals while we are doing this */
1746+ sigfillset(&allsigs);
1747+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1748+
1749+ switch (kparams.cmd)
1750+ {
1751+ case DLM_USER_LOCK:
1752+ status = do_user_lock(fi, &kparams, buffer);
1753+ break;
1754+
1755+ case DLM_USER_UNLOCK:
1756+ status = do_user_unlock(fi, &kparams);
1757+ break;
1758+
1759+ case DLM_USER_QUERY:
1760+ status = do_user_query(fi, &kparams);
1761+ break;
1762+
1763+ default:
1764+ status = -EINVAL;
1765+ break;
1766+ }
1767+ /* Restore signals */
1768+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1769+ recalc_sigpending();
1770+
1771+ if (status == 0)
1772+ return count;
1773+ else
1774+ return status;
1775+}
1776+
1777+void dlm_device_free_devices()
1778+{
1779+ struct user_ls *tmp;
1780+ struct user_ls *lsinfo;
1781+
1782+ list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
1783+ misc_deregister(&lsinfo->ls_miscinfo);
1784+
1785+ /* Tidy up, but don't delete the lsinfo struct until
1786+ all the users have closed their devices */
1787+ list_del(&lsinfo->ls_list);
1788+ kfree(lsinfo->ls_miscinfo.name);
1789+ set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1790+ }
1791+}
1792+
1793+static struct file_operations _dlm_fops = {
1794+ .open = dlm_open,
1795+ .release = dlm_close,
1796+ .ioctl = dlm_ioctl,
1797+ .read = dlm_read,
1798+ .write = dlm_write,
1799+ .poll = dlm_poll,
1800+ .owner = THIS_MODULE,
1801+};
1802+
1803+static struct file_operations _dlm_ctl_fops = {
1804+ .open = dlm_ctl_open,
1805+ .release = dlm_ctl_close,
1806+ .ioctl = dlm_ctl_ioctl,
1807+ .owner = THIS_MODULE,
1808+};
1809+
1810+/*
1811+ * Create control device
1812+ */
1813+int dlm_device_init(void)
1814+{
1815+ int r;
1816+
1817+ INIT_LIST_HEAD(&user_ls_list);
1818+
1819+ ctl_device.name = "dlm-control";
1820+ ctl_device.fops = &_dlm_ctl_fops;
1821+ ctl_device.minor = MISC_DYNAMIC_MINOR;
1822+
1823+ r = misc_register(&ctl_device);
1824+ if (r) {
1825+ log_print("misc_register failed for DLM control device");
1826+ return r;
1827+ }
1828+
1829+ return 0;
1830+}
1831+
1832+void dlm_device_exit(void)
1833+{
1834+ misc_deregister(&ctl_device);
1835+}
1836+
1837+/*
1838+ * Overrides for Emacs so that we follow Linus's tabbing style.
1839+ * Emacs will notice this stuff at the end of the file and automatically
1840+ * adjust the settings for this buffer only. This must remain at the end
1841+ * of the file.
1842+ * ---------------------------------------------------------------------------
1843+ * Local variables:
1844+ * c-file-style: "linux"
1845+ * End:
1846+ */
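
For orientation, the device protocol implemented above works roughly as follows from userspace: lock, unlock and query requests are submitted by write()ing a struct dlm_lock_params with the appropriate cmd, completions come back one struct dlm_lock_result per read(), and FIONREAD reports the number of queued ASTs (messages, not bytes). The sketch below only illustrates the read side; real applications are expected to go through libdlm, the device node path is an example (the dynamic misc minor is listed in /proc/misc), and the result is read into a plain buffer so the DLM userspace header is not needed here.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char buf[1024];		/* comfortably larger than one result message */
	ssize_t n;
	int pending = 0;
	int fd = open("/dev/misc/dlm_mylockspace", O_RDWR | O_NONBLOCK);

	if (fd < 0) {
		perror("open lockspace device");
		return 1;
	}

	/* FIONREAD counts queued AST messages, not bytes */
	if (ioctl(fd, FIONREAD, &pending) == 0)
		printf("%d ASTs waiting\n", pending);

	/* each successful read returns exactly one result message;
	   O_NONBLOCK makes the loop stop when the queue is empty */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		printf("got an AST message of %zd bytes\n", n);

	close(fd);
	return 0;
}
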
1847diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
1848--- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 1849+++ linux-patched/cluster/dlm/device.h 2004-06-29 20:01:19.000000000 +0800
4bf12011 1850@@ -0,0 +1,19 @@
1851+/******************************************************************************
1852+*******************************************************************************
1853+**
1854+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1855+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1856+**
1857+** This copyrighted material is made available to anyone wishing to use,
1858+** modify, copy, or redistribute it subject to the terms and conditions
1859+** of the GNU General Public License v.2.
1860+**
1861+*******************************************************************************
1862+******************************************************************************/
1863+
1864+#ifndef __DEVICE_DOT_H__
1865+#define __DEVICE_DOT_H__
1866+
1867+extern void dlm_device_free_devices(void);
1868+
1869+#endif /* __DEVICE_DOT_H__ */
1870diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
1871--- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 1872+++ linux-patched/cluster/dlm/dir.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 1873@@ -0,0 +1,430 @@
1874+/******************************************************************************
1875+*******************************************************************************
1876+**
1877+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1878+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1879+**
1880+** This copyrighted material is made available to anyone wishing to use,
1881+** modify, copy, or redistribute it subject to the terms and conditions
1882+** of the GNU General Public License v.2.
1883+**
1884+*******************************************************************************
1885+******************************************************************************/
1886+
1887+#include "dlm_internal.h"
1888+#include "nodes.h"
1889+#include "lockspace.h"
1890+#include "lowcomms.h"
1891+#include "reccomms.h"
1892+#include "rsb.h"
1893+#include "config.h"
1894+#include "memory.h"
1895+#include "recover.h"
1896+#include "util.h"
1897+
1898+/*
1899+ * We use the upper 16 bits of the hash value to select the directory node.
1900+ * Low bits are used for distribution of rsb's among hash buckets on each node.
1901+ *
1902+ * From the hash value, we are interested in arriving at a final value between
1903+ * zero and the number of nodes minus one (num_nodes - 1).
1904+ *
1905+ * To accomplish this scaling, we take the nearest power of two larger than
1906+ * num_nodes and subtract one to create a bit mask. The mask is applied to the
1907+ * hash, reducing the range to nearer the final range.
1908+ *
1909+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
1910+ * num_nodes to the previously masked hash value.
1911+ *
1912+ * This value in the desired range is used as an offset into the sorted list of
1913+ * nodeid's to give the particular nodeid of the directory node.
1914+ */
1915+
1916+uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length)
1917+{
1918+ struct list_head *tmp;
1919+ gd_csb_t *csb = NULL;
1920+ uint32_t hash, node, n = 0, nodeid;
1921+
1922+ if (ls->ls_num_nodes == 1) {
1923+ nodeid = our_nodeid();
1924+ goto out;
1925+ }
1926+
1927+ hash = gdlm_hash(name, length);
1928+ node = (hash >> 16) & ls->ls_nodes_mask;
1929+ node %= ls->ls_num_nodes;
1930+
1931+ list_for_each(tmp, &ls->ls_nodes) {
1932+ if (n++ != node)
1933+ continue;
1934+ csb = list_entry(tmp, gd_csb_t, csb_list);
1935+ break;
1936+ }
1937+
1938+ GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
1939+ ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
1940+ nodeid = csb->csb_node->gn_nodeid;
1941+
1942+ out:
1943+ return nodeid;
1944+}
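As an aside (not part of the patch), here is a minimal user-space sketch of the scaling step described in the comment above. scale_hash_to_node() is an invented helper name, and nodes_mask is assumed to have been built as (next power of two >= num_nodes) - 1, the way ls_nodes_mask is:

#include <stdint.h>

/* Illustrative only: map a resource-name hash to a directory node index. */
static uint32_t scale_hash_to_node(uint32_t hash, uint32_t nodes_mask,
                                   uint32_t num_nodes)
{
	uint32_t node = (hash >> 16) & nodes_mask;	/* 0 .. nodes_mask */
	return node % num_nodes;			/* 0 .. num_nodes - 1 */
}

/* Example: with 5 nodes, nodes_mask = 7; a hash of 0x9c3a0000 gives
 * (0x9c3a & 7) = 2 and 2 % 5 = 2, so the third nodeid in the sorted
 * node list is the directory node for that name. */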
1945+
1946+uint32_t get_directory_nodeid(gd_res_t *rsb)
1947+{
1948+ return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
1949+ rsb->res_length);
1950+}
1951+
1952+static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len)
1953+{
1954+ uint32_t val;
1955+
1956+ val = gdlm_hash(name, len);
1957+ val &= RESDIRHASH_MASK;
1958+
1959+ return val;
1960+}
1961+
1962+static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd)
1963+{
1964+ gd_resdir_bucket_t *bucket;
1965+ uint32_t hashval;
1966+
1967+ hashval = rd_hash(ls, rd->rd_name, rd->rd_length);
1968+ bucket = &ls->ls_resdir_hash[hashval];
1969+
1970+ list_add_tail(&rd->rd_list, &bucket->rb_reslist);
1971+}
1972+
1973+static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen,
1974+ uint32_t bucket)
1975+{
1976+ struct list_head *head;
1977+ gd_resdata_t *rd;
1978+
1979+ head = &ls->ls_resdir_hash[bucket].rb_reslist;
1980+ list_for_each_entry(rd, head, rd_list) {
1981+ if (rd->rd_length == namelen &&
1982+ !memcmp(name, rd->rd_name, namelen))
1983+ goto out;
1984+ }
1985+ rd = NULL;
1986+ out:
1987+ return rd;
1988+}
1989+
1990+void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
1991+ uint8_t sequence)
1992+{
1993+ gd_resdata_t *rd;
1994+ uint32_t bucket;
1995+
1996+ bucket = rd_hash(ls, name, namelen);
1997+
1998+ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
1999+
2000+ rd = search_rdbucket(ls, name, namelen, bucket);
2001+
2002+ if (!rd) {
2003+ log_debug(ls, "remove_resdata not found nodeid=%u", nodeid);
2004+ goto out;
2005+ }
2006+
2007+ if (rd->rd_master_nodeid != nodeid) {
2008+ log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid);
2009+ goto out;
2010+ }
2011+
2012+ if (rd->rd_sequence == sequence) {
2013+ list_del(&rd->rd_list);
2014+ free_resdata(rd);
2015+ } else {
2016+ /*
2017+ log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u",
2018+ nodeid, rd->rd_sequence, sequence);
2019+ */
2020+ }
2021+
2022+ out:
2023+ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2024+}
2025+
2026+void resdir_clear(gd_ls_t *ls)
2027+{
2028+ struct list_head *head;
2029+ gd_resdata_t *rd;
2030+ int i;
2031+
2032+ for (i = 0; i < RESDIRHASH_SIZE; i++) {
2033+ head = &ls->ls_resdir_hash[i].rb_reslist;
2034+ while (!list_empty(head)) {
2035+ rd = list_entry(head->next, gd_resdata_t, rd_list);
2036+ list_del(&rd->rd_list);
2037+ free_resdata(rd);
2038+ }
2039+ }
2040+}
2041+
2042+static void gdlm_resmov_in(gd_resmov_t *rm, char *buf)
2043+{
2044+ gd_resmov_t tmp;
2045+
2046+ memcpy(&tmp, buf, sizeof(gd_resmov_t));
2047+
2048+ rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2049+ rm->rm_length = be16_to_cpu(tmp.rm_length);
2050+}
2051+
2052+int resdir_rebuild_local(gd_ls_t *ls)
2053+{
2054+ gd_csb_t *csb;
2055+ gd_resdata_t *rd;
2056+ gd_rcom_t *rc;
2057+ gd_resmov_t mov, last_mov;
2058+ char *b, *last_name;
2059+ int error = -ENOMEM, count = 0;
2060+
2061+ log_all(ls, "rebuild resource directory");
2062+
2063+ resdir_clear(ls);
2064+
2065+ rc = allocate_rcom_buffer(ls);
2066+ if (!rc)
2067+ goto out;
2068+
2069+ last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2070+ if (!last_name)
2071+ goto free_rc;
2072+
2073+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
2074+ last_mov.rm_length = 0;
2075+ for (;;) {
2076+ error = gdlm_recovery_stopped(ls);
2077+ if (error)
2078+ goto free_last;
2079+
2080+ memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2081+ rc->rc_datalen = last_mov.rm_length;
2082+
2083+ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
2084+ RECCOMM_RECOVERNAMES, rc, 1);
2085+ if (error)
2086+ goto free_last;
2087+
2088+ schedule();
2089+
2090+ /*
2091+ * pick each res out of buffer
2092+ */
2093+
2094+ b = rc->rc_buf;
2095+
2096+ for (;;) {
2097+ gdlm_resmov_in(&mov, b);
2098+ b += sizeof(gd_resmov_t);
2099+
2100+ /* Length of 0 with a non-zero nodeid marks the
2101+ * end of the list */
2102+ if (!mov.rm_length && mov.rm_nodeid)
2103+ goto done;
2104+
2105+ /* This is just the end of the block */
2106+ if (!mov.rm_length)
2107+ break;
2108+
2109+ error = -ENOMEM;
2110+ rd = allocate_resdata(ls, mov.rm_length);
2111+ if (!rd)
2112+ goto free_last;
2113+
2114+ rd->rd_master_nodeid = mov.rm_nodeid;
2115+ rd->rd_length = mov.rm_length;
2116+ rd->rd_sequence = 1;
2117+
2118+ memcpy(rd->rd_name, b, mov.rm_length);
2119+ b += mov.rm_length;
2120+
2121+ add_resdata_to_hash(ls, rd);
2122+ count++;
2123+
2124+ last_mov = mov;
2125+ memset(last_name, 0, DLM_RESNAME_MAXLEN);
2126+ memcpy(last_name, rd->rd_name, rd->rd_length);
2127+ }
2128+ }
2129+ done:
2130+ ;
2131+ }
2132+
2133+ set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2134+ error = 0;
2135+
2136+ log_all(ls, "rebuilt %d resources", count);
2137+
2138+ free_last:
2139+ kfree(last_name);
2140+
2141+ free_rc:
2142+ free_rcom_buffer(rc);
2143+
2144+ out:
2145+ return error;
2146+}
2147+
2148+/*
2149+ * The reply end of resdir_rebuild_local/RECOVERNAMES. Collect and send as
2150+ * many resource names as can fit in the buffer.
2151+ */
2152+
2153+int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf,
2154+ int outlen, uint32_t nodeid)
2155+{
2156+ struct list_head *list;
2157+ gd_res_t *start_rsb = NULL, *rsb;
2158+ int offset = 0, start_namelen, error;
2159+ char *start_name;
2160+ gd_resmov_t tmp;
2161+ uint32_t dir_nodeid;
2162+
2163+ /*
2164+ * Find the rsb where we left off (or start again)
2165+ */
2166+
2167+ start_namelen = inlen;
2168+ start_name = inbuf;
2169+
2170+ if (start_namelen > 1) {
2171+ error = find_or_create_rsb(ls, NULL, start_name,
2172+ start_namelen, 0, &start_rsb);
2173+ GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2174+ release_rsb(start_rsb);
2175+ }
2176+
2177+ /*
2178+ * Send rsb names for rsb's we're master of and whose directory node
2179+ * matches the requesting node.
2180+ */
2181+
2182+ down_read(&ls->ls_rec_rsblist);
2183+ if (start_rsb)
2184+ list = start_rsb->res_rootlist.next;
2185+ else
2186+ list = ls->ls_rootres.next;
2187+
2188+ for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2189+ rsb = list_entry(list, gd_res_t, res_rootlist);
2190+ if (rsb->res_nodeid)
2191+ continue;
2192+
2193+ dir_nodeid = get_directory_nodeid(rsb);
2194+ if (dir_nodeid != nodeid)
2195+ continue;
2196+
2197+ if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) {
2198+ /* Write end-of-block record */
2199+ memset(&tmp, 0, sizeof(gd_resmov_t));
2200+ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2201+ offset += sizeof(gd_resmov_t);
2202+ goto out;
2203+ }
2204+
2205+ memset(&tmp, 0, sizeof(gd_resmov_t));
2206+ tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2207+ tmp.rm_length = cpu_to_be16(rsb->res_length);
2208+
2209+ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2210+ offset += sizeof(gd_resmov_t);
2211+
2212+ memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2213+ offset += rsb->res_length;
2214+ }
2215+
2216+ /*
2217+ * If we've reached the end of the list (and there's room) write a
2218+ * terminating record.
2219+ */
2220+
2221+ if ((list == &ls->ls_rootres) &&
2222+ (offset + sizeof(gd_resmov_t) <= outlen)) {
2223+
2224+ memset(&tmp, 0, sizeof(gd_resmov_t));
2225+ /* This only needs to be non-zero */
2226+ tmp.rm_nodeid = cpu_to_be32(1);
2227+ /* and this must be zero */
2228+ tmp.rm_length = 0;
2229+ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2230+ offset += sizeof(gd_resmov_t);
2231+ }
2232+
2233+ out:
2234+ up_read(&ls->ls_rec_rsblist);
2235+ return offset;
2236+}
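As a reader's aid (not part of the patch), a small stand-alone sketch of walking one RECOVERNAMES reply buffer using the terminator conventions above; struct resmov and count_recovernames() are hypothetical stand-ins for gd_resmov_t, and the entries are assumed to be in host byte order already:

#include <stdint.h>
#include <string.h>

struct resmov {
	uint32_t nodeid;
	uint16_t length;
	uint16_t pad;
};

/* Count the names in one reply block.  Per the conventions above:
 * length == 0, nodeid != 0  -> no more names anywhere (*done = 1);
 * length == 0, nodeid == 0  -> end of this block only  (*done = 0). */
static int count_recovernames(const char *buf, int *done)
{
	struct resmov rm;
	int count = 0;

	for (;;) {
		memcpy(&rm, buf, sizeof(rm));
		buf += sizeof(rm);
		if (!rm.length) {
			*done = (rm.nodeid != 0);
			return count;
		}
		buf += rm.length;	/* skip the name bytes */
		count++;
	}
}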
2237+
2238+int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
2239+ gd_resdata_t **rdp, int recovery)
2240+{
2241+ gd_resdata_t *rd;
2242+ gd_resdata_t *tmp;
2243+ uint32_t bucket;
2244+
2245+ bucket = rd_hash(ls, name, namelen);
2246+
2247+ read_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2248+ rd = search_rdbucket(ls, name, namelen, bucket);
2249+ read_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2250+
2251+ if (rd)
2252+ goto out;
2253+
2254+ rd = allocate_resdata(ls, namelen);
2255+ if (!rd)
2256+ return -ENOMEM;
2257+
2258+ rd->rd_master_nodeid = nodeid;
2259+ rd->rd_length = namelen;
2260+ memcpy(rd->rd_name, name, namelen);
2261+
2262+ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2263+ tmp = search_rdbucket(ls, name, namelen, bucket);
2264+ if (!tmp)
2265+ list_add_tail(&rd->rd_list,
2266+ &ls->ls_resdir_hash[bucket].rb_reslist);
2267+ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2268+
2269+ if (tmp) {
2270+ free_resdata(rd);
2271+ rd = tmp;
2272+ }
2273+
2274+ out:
2275+ *rdp = rd;
2276+
2277+ if (!recovery) {
2278+ if (++rd->rd_sequence == 0)
2279+ rd->rd_sequence++;
2280+ } else
2281+ rd->rd_sequence = 1;
2282+
2283+ return 0;
2284+}
2285+
2286+/*
 2287+ * The node with the lowest nodeid queries all nodes to determine when all are done.
2288+ * All other nodes query the low nodeid for this.
2289+ */
2290+
2291+int resdir_rebuild_wait(gd_ls_t *ls)
2292+{
2293+ int error;
2294+
2295+ if (ls->ls_low_nodeid == our_nodeid()) {
2296+ error = gdlm_wait_status_all(ls, RESDIR_VALID);
2297+ if (!error)
2298+ set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2299+ } else
2300+ error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID);
2301+
2302+ return error;
2303+}
2304diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2305--- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 2306+++ linux-patched/cluster/dlm/dir.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 2307@@ -0,0 +1,30 @@
2308+/******************************************************************************
2309+*******************************************************************************
2310+**
2311+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2312+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2313+**
2314+** This copyrighted material is made available to anyone wishing to use,
2315+** modify, copy, or redistribute it subject to the terms and conditions
2316+** of the GNU General Public License v.2.
2317+**
2318+*******************************************************************************
2319+******************************************************************************/
2320+
2321+#ifndef __DIR_DOT_H__
2322+#define __DIR_DOT_H__
2323+
2324+uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length);
2325+uint32_t get_directory_nodeid(gd_res_t * rsb);
2326+void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2327+ uint8_t sequence);
2328+int resdir_rebuild_local(gd_ls_t * ls);
2329+int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf,
2330+ int outlen, uint32_t nodeid);
2331+int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2332+ gd_resdata_t ** rdp, int recovery);
2333+int resdir_rebuild_wait(gd_ls_t * ls);
2334+void resdir_clear(gd_ls_t * ls);
2335+void resdir_dump(gd_ls_t * ls);
2336+
2337+#endif /* __DIR_DOT_H__ */
2338diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2339--- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 2340+++ linux-patched/cluster/dlm/dlm_internal.h 2004-06-29 20:01:20.000000000 +0800
2341@@ -0,0 +1,626 @@
4bf12011 2342+/******************************************************************************
2343+*******************************************************************************
2344+**
2345+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2346+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2347+**
2348+** This copyrighted material is made available to anyone wishing to use,
2349+** modify, copy, or redistribute it subject to the terms and conditions
2350+** of the GNU General Public License v.2.
2351+**
2352+*******************************************************************************
2353+******************************************************************************/
2354+
2355+#ifndef __DLM_INTERNAL_DOT_H__
2356+#define __DLM_INTERNAL_DOT_H__
2357+
2358+/*
2359+ * This is the main header file to be included in each DLM source file.
2360+ */
2361+
2362+#define DLM_RELEASE_NAME "<CVS>"
2363+
2364+#include <linux/slab.h>
2365+#include <linux/sched.h>
2366+#include <asm/semaphore.h>
2367+#include <linux/types.h>
2368+#include <linux/spinlock.h>
2369+#include <linux/vmalloc.h>
2370+#include <asm/uaccess.h>
2371+#include <linux/list.h>
2372+#include <linux/errno.h>
2373+#include <linux/random.h>
2374+
2375+#include <cluster/dlm.h>
2376+#include <cluster/dlm_device.h>
2377+#include <cluster/service.h>
2378+
2379+#ifndef TRUE
2380+#define TRUE (1)
2381+#endif
2382+
2383+#ifndef FALSE
2384+#define FALSE (0)
2385+#endif
2386+
2387+#if (BITS_PER_LONG == 64)
2388+#define PRIu64 "lu"
2389+#define PRId64 "ld"
2390+#define PRIo64 "lo"
2391+#define PRIx64 "lx"
2392+#define PRIX64 "lX"
2393+#define SCNu64 "lu"
2394+#define SCNd64 "ld"
2395+#define SCNo64 "lo"
2396+#define SCNx64 "lx"
2397+#define SCNX64 "lX"
2398+#else
2399+#define PRIu64 "Lu"
2400+#define PRId64 "Ld"
2401+#define PRIo64 "Lo"
2402+#define PRIx64 "Lx"
2403+#define PRIX64 "LX"
2404+#define SCNu64 "Lu"
2405+#define SCNd64 "Ld"
2406+#define SCNo64 "Lo"
2407+#define SCNx64 "Lx"
2408+#define SCNX64 "LX"
2409+#endif
2410+
2411+#define wchan_cond_sleep_intr(chan, sleep_cond) \
2412+do \
2413+{ \
2414+ DECLARE_WAITQUEUE(__wait_chan, current); \
2415+ current->state = TASK_INTERRUPTIBLE; \
2416+ add_wait_queue(&chan, &__wait_chan); \
2417+ if ((sleep_cond)) \
2418+ schedule(); \
2419+ remove_wait_queue(&chan, &__wait_chan); \
2420+ current->state = TASK_RUNNING; \
2421+} \
2422+while (0)
2423+
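A hypothetical usage sketch of the macro above (the field names come from struct gd_ls further down in this header): sleep interruptibly on ls_wait_general while the run bit is still clear:

	/* Hypothetical usage, not from the patch: block until someone
	 * wakes ls_wait_general (or a signal arrives), but only if
	 * LSFL_LS_RUN has not been set yet. */
	wchan_cond_sleep_intr(ls->ls_wait_general,
			      !test_bit(LSFL_LS_RUN, &ls->ls_flags));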
2424+static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2425+{
2426+ return time_after(jiffies, stamp + seconds * HZ);
2427+}
2428+
2429+
2430+#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2431+
2432+#define log_all(ls, fmt, args...) \
2433+ do { \
2434+ printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2435+ dlm_debug_log(ls, fmt, ##args); \
2436+ } while (0)
2437+
2438+#define log_error log_all
2439+
2440+
2441+#define DLM_DEBUG
2442+#if defined(DLM_DEBUG)
2443+#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2444+#else
2445+#define log_debug(ls, fmt, args...)
2446+#endif
2447+
2448+#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2449+#undef log_debug
2450+#define log_debug log_all
2451+#endif
2452+
2453+
2454+#define GDLM_ASSERT(x, do) \
2455+{ \
2456+ if (!(x)) \
2457+ { \
2458+ dlm_debug_dump(); \
2459+ printk("\nDLM: Assertion failed on line %d of file %s\n" \
2460+ "DLM: assertion: \"%s\"\n" \
2461+ "DLM: time = %lu\n", \
2462+ __LINE__, __FILE__, #x, jiffies); \
2463+ {do} \
2464+ printk("\n"); \
2465+ BUG(); \
2466+ panic("DLM: Record message above and reboot.\n"); \
2467+ } \
2468+}
2469+
2470+
2471+struct gd_ls;
2472+struct gd_lkb;
2473+struct gd_res;
2474+struct gd_csb;
2475+struct gd_node;
2476+struct gd_resmov;
2477+struct gd_resdata;
2478+struct gd_recover;
2479+struct gd_recinfo;
2480+struct gd_resdir_bucket;
2481+struct gd_remlockreply;
2482+struct gd_remlockrequest;
2483+struct gd_rcom;
2484+
2485+typedef struct gd_ls gd_ls_t;
2486+typedef struct gd_lkb gd_lkb_t;
2487+typedef struct gd_res gd_res_t;
2488+typedef struct gd_csb gd_csb_t;
2489+typedef struct gd_node gd_node_t;
2490+typedef struct gd_resmov gd_resmov_t;
2491+typedef struct gd_resdata gd_resdata_t;
2492+typedef struct gd_recover gd_recover_t;
2493+typedef struct gd_resdir_bucket gd_resdir_bucket_t;
2494+typedef struct gd_rcom gd_rcom_t;
2495+
2496+/*
2497+ * Resource Data - an entry for a resource in the resdir hash table
2498+ */
2499+
2500+struct gd_resdata {
2501+ struct list_head rd_list;
2502+ uint32_t rd_master_nodeid;
2503+ uint16_t rd_length;
2504+ uint8_t rd_sequence;
2505+ char rd_name[1]; /* <rd_length> bytes */
2506+};
2507+
2508+/*
2509+ * Resource Directory Bucket - a hash bucket of resdata entries in the resdir
2510+ * hash table
2511+ */
2512+
2513+struct gd_resdir_bucket {
2514+ struct list_head rb_reslist;
2515+ rwlock_t rb_lock;
2516+};
2517+
2518+/*
2519+ * A resource description as moved between nodes
2520+ */
2521+
2522+struct gd_resmov {
2523+ uint32_t rm_nodeid;
2524+ uint16_t rm_length;
2525+ uint16_t rm_pad;
2526+};
2527+
2528+/*
 2529+ * An entry in the lock ID table. Locks for this bucket are kept on the list.
2530+ * Counter is used to assign an id to locks as they are added to this bucket.
2531+ */
2532+
2533+struct gd_lockidtbl_entry {
2534+ struct list_head list;
2535+ uint16_t counter;
2536+};
2537+
2538+/* Elements in the range array */
2539+
2540+#define GR_RANGE_START 0
2541+#define GR_RANGE_END 1
2542+#define RQ_RANGE_START 2
2543+#define RQ_RANGE_END 3
2544+
2545+/*
2546+ * Lockspace structure. The context for GDLM locks.
2547+ */
2548+
2549+#define RESHASHTBL_SIZE (256)
2550+
2551+#define RESDIRHASH_SHIFT (9)
2552+#define RESDIRHASH_SIZE (1 << RESDIRHASH_SHIFT)
2553+#define RESDIRHASH_MASK (RESDIRHASH_SIZE - 1)
2554+
2555+#define LSFL_WORK (0)
2556+#define LSFL_LS_RUN (1)
2557+#define LSFL_LS_STOP (2)
2558+#define LSFL_LS_START (3)
2559+#define LSFL_LS_FINISH (4)
2560+#define LSFL_RECCOMM_WAIT (5)
2561+#define LSFL_RECCOMM_READY (6)
2562+#define LSFL_NOTIMERS (7)
2563+#define LSFL_FINISH_RECOVERY (8)
2564+#define LSFL_RESDIR_VALID (9)
2565+#define LSFL_ALL_RESDIR_VALID (10)
2566+#define LSFL_NODES_VALID (11)
2567+#define LSFL_ALL_NODES_VALID (12)
2568+#define LSFL_REQUEST_WARN (13)
2569+
2570+#define LSST_NONE (0)
2571+#define LSST_INIT (1)
2572+#define LSST_INIT_DONE (2)
2573+#define LSST_CLEAR (3)
2574+#define LSST_WAIT_START (4)
2575+#define LSST_RECONFIG_DONE (5)
2576+
2577+struct gd_ls {
2578+ struct list_head ls_list; /* list of lockspaces */
2579+ uint32_t ls_local_id; /* local unique lockspace ID */
2580+ uint32_t ls_global_id; /* global unique lockspace ID */
2581+ int ls_allocation; /* Memory allocation policy */
2582+ unsigned long ls_flags; /* LSFL_ */
2583+
2584+ struct list_head ls_rootres; /* List of root resources */
2585+
2586+ int ls_hashsize;
2587+ int ls_hashmask;
2588+ struct list_head *ls_reshashtbl; /* Hash table for resources */
2589+ rwlock_t ls_reshash_lock; /* Lock for hash table */
2590+
2591+ struct gd_lockidtbl_entry *ls_lockidtbl;
2592+ uint32_t ls_lockidtbl_size; /* Size of lock id table */
2593+ rwlock_t ls_lockidtbl_lock;
2594+
2595+ struct list_head ls_nodes; /* current nodes in RC */
2596+ uint32_t ls_num_nodes; /* number of nodes in RC */
2597+ uint32_t ls_nodes_mask;
2598+ uint32_t ls_low_nodeid;
2599+
2600+ int ls_state; /* state changes for recovery */
2601+ struct list_head ls_recover; /* gr_recover_t structs */
2602+ int ls_last_stop; /* event ids from sm */
2603+ int ls_last_start;
2604+ int ls_last_finish;
2605+ spinlock_t ls_recover_lock;
2606+ struct list_head ls_nodes_gone; /* dead node list for recovery */
2607+
2608+ wait_queue_head_t ls_wait_general;
2609+
2610+ gd_rcom_t *ls_rcom;
2611+ uint32_t ls_rcom_msgid;
2612+ struct semaphore ls_rcom_lock;
2613+
2614+ struct list_head ls_recover_list;
2615+ int ls_recover_list_count;
2616+ spinlock_t ls_recover_list_lock;
2617+
2618+ struct rw_semaphore ls_in_recovery; /* held in write during
2619+ * recovery, read for normal
2620+ * locking ops */
2621+ struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2622+ * parent lock racing with a
2623+ * new child lock */
2624+
2625+ struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
2626+ * operations happening while
2627+ * we are purging */
2628+
2629+ struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
2630+ * in grant_after_purge() which
2631+ * runs outside recovery */
2632+
2633+ struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2634+ * we are deserialising
2635+ */
2636+
2637+ struct list_head ls_deadlockq; /* List of locks in conversion ordered
 2638+ * by duetime, for deadlock detection */
2639+
2640+ struct list_head ls_requestqueue; /* List of incoming requests
2641+ * held while we are in
2642+ * recovery */
2643+
2644+ gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE];
2645+
2646+ int ls_namelen;
2647+ char ls_name[1]; /* <namelen> bytes */
2648+};
2649+
2650+/*
2651+ * Cluster node (per node in cluster)
2652+ */
2653+
2654+struct gd_node {
2655+ struct list_head gn_list; /* global list of cluster nodes */
2656+ uint32_t gn_nodeid; /* cluster unique nodeid (cman) */
2657+ uint32_t gn_ipaddr; /* node's first IP address (cman) */
2658+ int gn_refcount; /* number of csb's referencing */
2659+};
2660+
2661+/*
2662+ * Cluster System Block (per node in a ls)
2663+ */
2664+
2665+struct gd_csb {
2666+ struct list_head csb_list; /* per-lockspace list of nodes */
2667+ gd_node_t *csb_node; /* global node structure */
2668+ int csb_gone_event; /* event id when node was removed */
2669+
2670+ uint32_t csb_names_send_count;
2671+ uint32_t csb_names_send_msgid;
2672+ uint32_t csb_names_recv_count;
2673+ uint32_t csb_names_recv_msgid;
2674+ uint32_t csb_locks_send_count;
2675+ uint32_t csb_locks_send_msgid;
2676+ uint32_t csb_locks_recv_count;
2677+ uint32_t csb_locks_recv_msgid;
2678+};
2679+
2680+/*
2681+ * Resource block
2682+ */
2683+
2684+/* status */
2685+
2686+#define GDLM_RESSTS_DIRENTRY 1 /* This is a directory entry */
2687+#define GDLM_RESSTS_LVBINVALID 2 /* The LVB is invalid */
2688+
2689+#define RESFL_NEW_MASTER (0)
2690+#define RESFL_RECOVER_LIST (1)
2691+
2692+struct gd_res {
2693+ struct list_head res_hashchain; /* Chain of resources in this hash
2694+ * bucket */
2695+
2696+ gd_ls_t *res_ls; /* The owning lockspace */
2697+
2698+ struct list_head res_rootlist; /* List of root resources in lockspace */
2699+
2700+ struct list_head res_subreslist; /* List of all sub-resources
2701+ * for this root res. */
2702+ /* This is a list head on the root res and holds the whole tree below
2703+ * it. */
2704+ uint8_t res_depth; /* Depth in resource tree */
2705+ uint16_t res_status;
2706+ unsigned long res_flags; /* Flags, RESFL_ */
2707+
2708+ struct list_head res_grantqueue;
2709+ struct list_head res_convertqueue;
2710+ struct list_head res_waitqueue;
2711+
2712+ uint32_t res_nodeid; /* nodeid of master node */
2713+
2714+ gd_res_t *res_root; /* If a subresource, this is our root */
2715+ gd_res_t *res_parent; /* Our parent resource (if any) */
2716+
2717+ atomic_t res_ref; /* No of lkb's */
2718+ uint16_t res_remasterid; /* ID used during remaster */
2719+ struct list_head res_recover_list; /* General list for use during
2720+ * recovery */
2721+ int res_recover_msgid;
2722+ int res_newlkid_expect;
2723+
2724+ struct rw_semaphore res_lock;
2725+
2726+ char *res_lvbptr; /* Lock value block */
2727+
2728+ uint8_t res_resdir_seq; /* Last directory sequence number */
2729+
2730+ uint8_t res_length;
2731+ char res_name[1]; /* <res_length> bytes */
2732+};
2733+
2734+/*
2735+ * Lock block. To avoid confusion, where flags mirror the
2736+ * public flags, they should have the same value.
2737+ */
2738+
2739+#define GDLM_LKSTS_NEW (0)
2740+#define GDLM_LKSTS_WAITING (1)
2741+#define GDLM_LKSTS_GRANTED (2)
2742+#define GDLM_LKSTS_CONVERT (3)
2743+
2744+#define GDLM_LKFLG_VALBLK (0x00000008)
2745+#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
5cdbd17b 2746+#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
2747+#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
4bf12011 2748+
2749+/* Internal flags */
5cdbd17b 2750+#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present
2751+ (remote protocol only) */
4bf12011 2752+#define GDLM_LKFLG_MSTCPY (0x00002000)
2753+#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
5cdbd17b 2754+#define GDLM_LKFLG_LQCONVERT (0x00008000)
4bf12011 2755+#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
2756+#define GDLM_LKFLG_DEMOTED (0x00020000)
2757+#define GDLM_LKFLG_RESENT (0x00040000)
2758+#define GDLM_LKFLG_NOREBUILD (0x00080000)
4bf12011 2759+
5cdbd17b 2760+#define AST_COMP (1)
2761+#define AST_BAST (2)
2762+#define AST_DEL (4)
4bf12011 2763+
5cdbd17b 2764+struct gd_lkb {
2765+ uint32_t lkb_flags;
2766+ uint16_t lkb_status; /* grant, wait, convert */
2767+ int8_t lkb_rqmode; /* requested lock mode */
2768+ int8_t lkb_grmode; /* granted lock mode */
2769+ uint32_t lkb_retstatus; /* status to return in lksb */
2770+ uint32_t lkb_id; /* our lock ID */
2771+ struct dlm_lksb * lkb_lksb; /* status block of caller */
2772+ struct list_head lkb_idtbl_list; /* lockidtbl */
2773+ struct list_head lkb_statequeue; /* rsb's g/c/w queue */
2774+ gd_res_t * lkb_resource;
2775+ struct list_head lkb_ownerqueue; /* list of locks owned by a
2776+ process */
2777+ gd_lkb_t * lkb_parent; /* parent lock if any */
2778+ atomic_t lkb_childcnt; /* number of children */
2779+
2780+ struct list_head lkb_lockqueue; /* queue of locks waiting
2781+ for remote reply */
2782+ int lkb_lockqueue_state; /* reason on lockqueue */
2783+ int lkb_lockqueue_flags; /* as passed into
2784+ lock/unlock */
2785+ unsigned long lkb_lockqueue_time; /* time lkb went on the
2786+ lockqueue */
2787+ unsigned long lkb_duetime; /* for deadlock detection */
2788+
2789+ uint32_t lkb_remid; /* id on remote partner */
2790+ uint32_t lkb_nodeid; /* id of remote partner */
2791+
2792+ void * lkb_astaddr;
2793+ void * lkb_bastaddr;
2794+ long lkb_astparam;
2795+ struct list_head lkb_astqueue; /* locks with asts to deliver */
2796+ uint16_t lkb_astflags; /* COMP, BAST, DEL */
2797+ uint8_t lkb_bastmode; /* requested mode */
2798+ uint8_t lkb_highbast; /* highest mode bast sent for */
4bf12011 2799+
2800+ struct gd_remlockrequest *lkb_request;
2801+
5cdbd17b 2802+ struct list_head lkb_deadlockq; /* ls_deadlockq list */
4bf12011 2803+
5cdbd17b 2804+ char * lkb_lvbptr; /* points to lksb lvb on local
2805+ lock, allocated lvb on
2806+ on remote lock */
2807+ uint64_t * lkb_range; /* Points to an array of 64 bit
2808+ numbers that represent the
2809+ requested and granted ranges
2810+ of the lock. NULL implies
2811+ 0-ffffffffffffffff */
4bf12011 2812+};
2813+
2814+/*
2815+ * Used to save and manage recovery state for a lockspace.
2816+ */
2817+
2818+struct gd_recover {
2819+ struct list_head gr_list;
2820+ uint32_t *gr_nodeids;
2821+ int gr_node_count;
2822+ int gr_event_id;
2823+};
2824+
2825+/*
2826+ * Header part of the mid-level comms system. All packets start with
2827+ * this header so we can identify them. The comms packet can
 2828+ * contain many of these structs but they are split into individual
 2829+ * work units before being passed to the lockqueue routines.
 2830+ * Below this are the structs that this is a header for.
2831+ */
2832+
2833+struct gd_req_header {
2834+ uint8_t rh_cmd; /* What we are */
2835+ uint8_t rh_flags; /* maybe just a pad */
2836+ uint16_t rh_length; /* Length of struct (so we can send several in
2837+ * one message) */
2838+ uint32_t rh_lkid; /* Lock ID tag: ie the local (requesting) lock
2839+ * ID */
2840+ uint32_t rh_lockspace; /* Lockspace ID */
2841+};
2842+
2843+/*
2844+ * This is the struct used in a remote lock/unlock/convert request
2845+ * The mid-level comms API should turn this into native byte order.
2846+ * Most "normal" lock operations will use these two structs for
2847+ * communications. Recovery operations use their own structs
2848+ * but still with the gd_req_header on the front.
2849+ */
2850+
2851+struct gd_remlockrequest {
2852+ struct gd_req_header rr_header;
2853+
2854+ uint32_t rr_remlkid; /* Remote lock ID */
2855+ uint32_t rr_remparid; /* Parent's remote lock ID or 0 */
2856+ uint32_t rr_flags; /* Flags from lock/convert request */
2857+ uint64_t rr_range_start;/* Yes, these are in the right place... */
2858+ uint64_t rr_range_end;
2859+ uint32_t rr_status; /* Status to return if this is an AST request */
2860+ uint8_t rr_rqmode; /* Requested lock mode */
2861+ uint8_t rr_asts; /* Whether the LKB has ASTs or not */
2862+ uint8_t rr_resdir_seq; /* Directory sequence number */
2863+ char rr_lvb[DLM_LVB_LEN]; /* Value block */
2864+ char rr_name[1]; /* As long as needs be. Only used for directory
2865+ * lookups. The length of this can be worked
2866+ * out from the packet length */
2867+};
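As an aside (not part of the patch), the name length implied by rh_length can be recovered as below, which is exactly what remote_stage2() in locking.c does; remreq_namelen() is an invented helper name:

/* Illustrative only: rr_name[1] contributes one byte to sizeof(*req),
 * hence the +1 when recovering the real name length. */
static inline int remreq_namelen(struct gd_remlockrequest *req)
{
	return req->rr_header.rh_length - sizeof(*req) + 1;
}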
2868+
2869+/*
2870+ * This is the struct returned by a remote lock/unlock/convert request
2871+ * The mid-level comms API should turn this into native byte order.
2872+ */
2873+
2874+struct gd_remlockreply {
2875+ struct gd_req_header rl_header;
2876+
2877+ uint32_t rl_lockstate; /* Whether request was queued/granted/waiting */
2878+ uint32_t rl_nodeid; /* nodeid of lock master */
2879+ uint32_t rl_status; /* Status to return to caller */
2880+ uint32_t rl_lkid; /* Remote lkid */
2881+ uint8_t rl_resdir_seq; /* Returned directory sequence number */
2882+ char rl_lvb[DLM_LVB_LEN]; /* LVB itself */
2883+};
2884+
2885+/*
2886+ * Recovery comms message
2887+ */
2888+
2889+struct gd_rcom {
2890+ struct gd_req_header rc_header; /* 32 byte aligned */
2891+ uint32_t rc_msgid;
2892+ uint16_t rc_datalen;
2893+ uint8_t rc_expanded;
2894+ uint8_t rc_subcmd; /* secondary command */
2895+ char rc_buf[1]; /* first byte of data goes here and extends
2896+ * beyond here for another datalen - 1 bytes.
2897+ * rh_length is set to sizeof(gd_rcom_t) +
2898+ * datalen - 1 */
2899+};
2900+
2901+
2902+/* A remote query: GDLM_REMCMD_QUERY */
2903+struct gd_remquery {
2904+ struct gd_req_header rq_header;
2905+
2906+ uint32_t rq_mstlkid; /* LockID on master node */
2907+ uint32_t rq_query; /* query from the user */
2908+ uint32_t rq_maxlocks; /* max number of locks we can cope with */
2909+};
2910+
2911+/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
2912+/* There may be subsequent blocks of
2913+ lock info in GDLM_REMCMD_QUERYCONT messages which just have
2914+ a normal header. The last of these will have rh_flags set to
2915+ GDLM_REMFLAG_ENDQUERY
2916+ */
2917+struct gd_remqueryreply {
2918+ struct gd_req_header rq_header;
2919+
2920+ uint32_t rq_numlocks; /* Number of locks in reply */
2921+ uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */
2922+ uint32_t rq_status;
2923+
2924+ /* Resource information */
2925+ uint32_t rq_grantcount; /* No. of nodes on grant queue */
2926+ uint32_t rq_convcount; /* No. of nodes on convert queue */
2927+ uint32_t rq_waitcount; /* No. of nodes on wait queue */
2928+ char rq_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
2929+};
2930+
2931+/*
2932+ * Lockqueue wait lock states
2933+ */
2934+
2935+#define GDLM_LQSTATE_WAIT_RSB 1
2936+#define GDLM_LQSTATE_WAIT_CONVERT 2
2937+#define GDLM_LQSTATE_WAIT_CONDGRANT 3
2938+#define GDLM_LQSTATE_WAIT_UNLOCK 4
2939+
2940+/* Commands sent across the comms link */
2941+#define GDLM_REMCMD_LOOKUP 1
2942+#define GDLM_REMCMD_LOCKREQUEST 2
2943+#define GDLM_REMCMD_UNLOCKREQUEST 3
2944+#define GDLM_REMCMD_CONVREQUEST 4
2945+#define GDLM_REMCMD_LOCKREPLY 5
2946+#define GDLM_REMCMD_LOCKGRANT 6
2947+#define GDLM_REMCMD_SENDBAST 7
2948+#define GDLM_REMCMD_SENDCAST 8
2949+#define GDLM_REMCMD_REM_RESDATA 9
2950+#define GDLM_REMCMD_RECOVERMESSAGE 20
2951+#define GDLM_REMCMD_RECOVERREPLY 21
2952+#define GDLM_REMCMD_QUERY 30
2953+#define GDLM_REMCMD_QUERYREPLY 31
2954+
2955+/* Set in rh_flags when this is the last block of
2956+ query information. Note this could also be the first
2957+ block */
2958+#define GDLM_REMFLAG_ENDQUERY 1
2959+
4bf12011 2960+#ifndef BUG_ON
2961+#define BUG_ON(x)
2962+#endif
2963+
2964+void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...);
2965+void dlm_debug_dump(void);
2966+
2967+#endif /* __DLM_INTERNAL_DOT_H__ */
2968diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
2969--- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 2970+++ linux-patched/cluster/dlm/lkb.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 2971@@ -0,0 +1,225 @@
2972+/******************************************************************************
2973+*******************************************************************************
2974+**
2975+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2976+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2977+**
2978+** This copyrighted material is made available to anyone wishing to use,
2979+** modify, copy, or redistribute it subject to the terms and conditions
2980+** of the GNU General Public License v.2.
2981+**
2982+*******************************************************************************
2983+******************************************************************************/
2984+
2985+/*
2986+ * lkb.c
2987+ *
2988+ * Allocate and free locks on the lock ID table.
2989+ *
2990+ * This is slightly naff but I don't really like the
2991+ * VMS lockidtbl stuff as it uses a realloced array
2992+ * to hold the locks in. I think this is slightly better
2993+ * in some ways.
2994+ *
2995+ * Any better suggestions gratefully received. Patrick
2996+ *
2997+ */
2998+
2999+#include "dlm_internal.h"
3000+#include "lockqueue.h"
3001+#include "lkb.h"
3002+#include "config.h"
3003+#include "rsb.h"
3004+#include "memory.h"
3005+#include "lockspace.h"
3006+#include "util.h"
3007+
3008+/*
3009+ * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3010+ */
3011+
3012+static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3013+{
3014+ uint16_t entry = lkid & 0xFFFF;
3015+ gd_lkb_t *lkb;
3016+
3017+ if (entry >= ls->ls_lockidtbl_size)
3018+ goto out;
3019+
3020+ list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){
3021+ if (lkb->lkb_id == lkid)
3022+ return lkb;
3023+ }
3024+
3025+ out:
3026+ return NULL;
3027+}
3028+
3029+/*
3030+ * Should be called at lockspace initialisation time.
3031+ */
3032+
3033+int init_lockidtbl(gd_ls_t *ls, int entries)
3034+{
3035+ int i;
3036+
3037+ /* Make sure it's a power of two */
3038+ GDLM_ASSERT(!(entries & (entries - 1)),);
3039+
3040+ ls->ls_lockidtbl_size = entries;
3041+ rwlock_init(&ls->ls_lockidtbl_lock);
3042+
3043+ ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry),
3044+ GFP_KERNEL);
3045+ if (!ls->ls_lockidtbl)
3046+ return -ENOMEM;
3047+
3048+ for (i = 0; i < entries; i++) {
3049+ INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list);
3050+ ls->ls_lockidtbl[i].counter = 1;
3051+ }
3052+
3053+ return 0;
3054+}
3055+
3056+/*
3057+ * Free up the space - returns an error if there are still locks hanging around
3058+ */
3059+
3060+int free_lockidtbl(gd_ls_t *ls)
3061+{
3062+ int i;
3063+
3064+ write_lock(&ls->ls_lockidtbl_lock);
3065+
3066+ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
3067+ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
3068+ write_unlock(&ls->ls_lockidtbl_lock);
3069+ return -1;
3070+ }
3071+ }
3072+ kfree(ls->ls_lockidtbl);
3073+
3074+ write_unlock(&ls->ls_lockidtbl_lock);
3075+
3076+ return 0;
3077+}
3078+
3079+/*
3080+ * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3081+ * random number between 0 and lockidtbl_size-1. This random number specifies
3082+ * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3083+ * assigned per-bucket id.
3084+ *
3085+ * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3086+ * against the lkid of all lkb's in the bucket to avoid duplication.
3087+ *
3088+ */
3089+
3090+gd_lkb_t *create_lkb(gd_ls_t *ls)
3091+{
3092+ gd_lkb_t *lkb;
3093+ uint32_t lkid;
3094+ uint16_t bucket;
3095+
3096+ lkb = allocate_lkb(ls);
3097+ if (!lkb)
3098+ goto out;
3099+
3100+ write_lock(&ls->ls_lockidtbl_lock);
3101+ do {
3102+ get_random_bytes(&bucket, sizeof(bucket));
3103+ bucket &= (ls->ls_lockidtbl_size - 1);
3104+ lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16);
3105+ }
3106+ while (__find_lock_by_id(ls, lkid));
3107+
3108+ lkb->lkb_id = (uint32_t) lkid;
3109+ list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list);
3110+ write_unlock(&ls->ls_lockidtbl_lock);
3111+
3112+ out:
3113+ return lkb;
3114+}
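A stand-alone sketch (not part of the patch) of how a lock id produced by create_lkb() above decomposes; the example value is arbitrary:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t lkid = 0x002a01f3;		/* arbitrary example id */
	uint16_t bucket = lkid & 0xFFFF;	/* 0x01f3: ls_lockidtbl bucket */
	uint16_t seq = lkid >> 16;		/* 0x002a: bucket counter at creation */

	printf("bucket=%u seq=%u\n", bucket, seq);
	return 0;
}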
3115+
3116+/*
3117+ * Free LKB and remove it from the lockidtbl.
3118+ * NB - this always frees the lkb whereas release_rsb doesn't free an
3119+ * rsb unless its reference count is zero.
3120+ */
3121+
3122+void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb)
3123+{
3124+ if (lkb->lkb_status) {
3125+ log_error(ls, "release lkb with status %u", lkb->lkb_status);
3126+ print_lkb(lkb);
3127+ return;
3128+ }
3129+
3130+ if (lkb->lkb_parent)
3131+ atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3132+
3133+ write_lock(&ls->ls_lockidtbl_lock);
3134+ list_del(&lkb->lkb_idtbl_list);
3135+ write_unlock(&ls->ls_lockidtbl_lock);
3136+
3137+ /* if this is not a master copy then lvbptr points into the user's
3138+ * lksb, so don't free it */
3139+ if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3140+ free_lvb(lkb->lkb_lvbptr);
3141+
3142+ if (lkb->lkb_range)
3143+ free_range(lkb->lkb_range);
3144+
3145+ free_lkb(lkb);
3146+}
3147+
3148+gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3149+{
3150+ gd_lkb_t *lkb;
3151+
3152+ read_lock(&ls->ls_lockidtbl_lock);
3153+ lkb = __find_lock_by_id(ls, lkid);
3154+ read_unlock(&ls->ls_lockidtbl_lock);
3155+
3156+ return lkb;
3157+}
3158+
3159+gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid)
3160+{
3161+ gd_ls_t *lspace = find_lockspace_by_local_id(ls);
3162+ return find_lock_by_id(lspace, lkid);
3163+}
3164+
3165+/*
3166+ * Initialise the range parts of an LKB.
3167+ */
3168+
3169+int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end)
3170+{
3171+ int ret = -ENOMEM;
3172+
3173+ /*
3174+ * if this wasn't already a range lock, make it one
3175+ */
3176+ if (!lkb->lkb_range) {
3177+ lkb->lkb_range = allocate_range(lspace);
3178+ if (!lkb->lkb_range)
3179+ goto out;
3180+
3181+ /*
3182+ * This is needed for conversions that contain ranges where the
3183+ * original lock didn't but it's harmless for new locks too.
3184+ */
3185+ lkb->lkb_range[GR_RANGE_START] = 0LL;
3186+ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3187+ }
3188+
3189+ lkb->lkb_range[RQ_RANGE_START] = start;
3190+ lkb->lkb_range[RQ_RANGE_END] = end;
3191+
3192+ ret = 0;
3193+
3194+ out:
3195+ return ret;
3196+}
3197diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3198--- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 3199+++ linux-patched/cluster/dlm/lkb.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 3200@@ -0,0 +1,27 @@
3201+/******************************************************************************
3202+*******************************************************************************
3203+**
3204+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3205+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3206+**
3207+** This copyrighted material is made available to anyone wishing to use,
3208+** modify, copy, or redistribute it subject to the terms and conditions
3209+** of the GNU General Public License v.2.
3210+**
3211+*******************************************************************************
3212+******************************************************************************/
3213+
3214+#ifndef __LKB_DOT_H__
3215+#define __LKB_DOT_H__
3216+
3217+int free_lockidtbl(gd_ls_t * lspace);
3218+int init_lockidtbl(gd_ls_t * lspace, int entries);
3219+
3220+gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid);
3221+gd_lkb_t *create_lkb(gd_ls_t *ls);
3222+void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb);
3223+gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid);
3224+int verify_lkb_nodeids(gd_ls_t *ls);
3225+int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end);
3226+
3227+#endif /* __LKB_DOT_H__ */
3228diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3229--- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 3230+++ linux-patched/cluster/dlm/locking.c 2004-06-29 20:01:20.000000000 +0800
3231@@ -0,0 +1,1223 @@
4bf12011 3232+/******************************************************************************
3233+*******************************************************************************
3234+**
3235+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3236+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3237+**
3238+** This copyrighted material is made available to anyone wishing to use,
3239+** modify, copy, or redistribute it subject to the terms and conditions
3240+** of the GNU General Public License v.2.
3241+**
3242+*******************************************************************************
3243+******************************************************************************/
3244+
3245+/*
3246+ * locking.c
3247+ *
3248+ * This is where the main work of the DLM goes on
3249+ *
3250+ */
3251+
3252+#include "dlm_internal.h"
3253+#include "lockqueue.h"
3254+#include "locking.h"
3255+#include "lockspace.h"
3256+#include "lkb.h"
3257+#include "nodes.h"
3258+#include "dir.h"
3259+#include "ast.h"
3260+#include "memory.h"
3261+#include "rsb.h"
3262+
3263+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3264+
3265+/*
 3266+ * Lock compatibility matrix - thanks Steve
3267+ * UN = Unlocked state. Not really a state, used as a flag
3268+ * PD = Padding. Used to make the matrix a nice power of two in size
3269+ * Other states are the same as the VMS DLM.
3270+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3271+ */
3272+
3273+#define modes_compat(gr, rq) \
3274+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3275+
3276+const int __dlm_compat_matrix[8][8] = {
3277+ /* UN NL CR CW PR PW EX PD */
3278+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3279+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3280+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3281+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3282+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3283+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3284+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3285+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3286+};
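For illustration (not part of the patch), two direct lookups in the matrix above, assuming the usual DLM mode numbering (IV = -1, NL = 0, CR = 1, CW = 2, PR = 3, PW = 4, EX = 5) that the +1 indexing implies:

/* Illustrative lookups, not part of the patch:
 * a granted PR lock blocks a requested CW lock,
 * a granted CR lock is compatible with a requested PW lock. */
int pr_blocks_cw = !__dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_CW + 1];	/* 1 */
int cr_allows_pw =  __dlm_compat_matrix[DLM_LOCK_CR + 1][DLM_LOCK_PW + 1];	/* 1 */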
3287+
3288+/*
3289+ * Compatibility matrix for conversions with QUECVT set.
3290+ * Granted mode is the row; requested mode is the column.
3291+ * Usage: matrix[grmode+1][rqmode+1]
3292+ */
3293+
3294+const int __quecvt_compat_matrix[8][8] = {
3295+ /* UN NL CR CW PR PW EX PD */
3296+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3297+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3298+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3299+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3300+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3301+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3302+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3303+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3304+};
3305+
3306+/*
3307+ * This defines the direction of transfer of LVB data.
3308+ * Granted mode is the row; requested mode is the column.
3309+ * Usage: matrix[grmode+1][rqmode+1]
3310+ * 1 = LVB is returned to the caller
3311+ * 0 = LVB is written to the resource
3312+ * -1 = nothing happens to the LVB
3313+ */
3314+
3315+const int __lvb_operations[8][8] = {
3316+ /* UN NL CR CW PR PW EX PD*/
3317+ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3318+ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3319+ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3320+ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3321+ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3322+ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3323+ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3324+ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3325+};
3326+
3327+static void grant_lock(gd_lkb_t * lkb, int send_remote);
3328+static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb);
3329+static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb);
3330+static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb,
3331+ int flags, void *ast, void *astarg, void *bast,
3332+ struct dlm_range *range);
3333+static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags,
3334+ char *name, int namelen);
3335+
3336+
3337+static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head)
3338+{
3339+ gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue);
3340+
3341+ if (lkb->lkb_id == first->lkb_id)
3342+ return 1;
3343+
3344+ return 0;
3345+}
3346+
3347+/*
3348+ * Return 1 if the locks' ranges overlap
3349+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3350+ */
3351+
3352+static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
3353+{
3354+ if (!lkb1->lkb_range || !lkb2->lkb_range)
3355+ return 1;
3356+
3357+ if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3358+ lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3359+ return 0;
3360+
3361+ return 1;
3362+}
3363+
3364+/*
3365+ * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
3366+ * locks on the convert queue. One of the deadlocked locks is allowed to
3367+ * retain its original granted state (we choose the lkb provided although it
3368+ * shouldn't matter which.) We do not change the granted mode on locks without
3369+ * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
3370+ * the flag consistently) the false return value is used.
3371+ */
3372+
3373+static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
3374+{
3375+ gd_lkb_t *this;
3376+ int rv = TRUE;
3377+
3378+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3379+ if (this == lkb)
3380+ continue;
3381+
3382+ if (!ranges_overlap(lkb, this))
3383+ continue;
3384+
3385+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
3386+
3387+ if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
3388+ rv = FALSE;
3389+ continue;
3390+ }
3391+ this->lkb_grmode = DLM_LOCK_NL;
3392+ this->lkb_flags |= GDLM_LKFLG_DEMOTED;
3393+ }
3394+ }
3395+ return rv;
3396+}
3397+
3398+/*
3399+ * "A conversion deadlock arises with a pair of lock requests in the converting
3400+ * queue for one resource. The granted mode of each lock blocks the requested
3401+ * mode of the other lock."
3402+ */
3403+
3404+static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3405+{
3406+ gd_lkb_t *this;
3407+
3408+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3409+ if (this == lkb)
3410+ continue;
3411+
3412+ if (!ranges_overlap(lkb, this))
3413+ continue;
3414+
3415+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3416+ return TRUE;
3417+ }
3418+ return FALSE;
3419+}
3420+
3421+/*
3422+ * Check if the given lkb conflicts with another lkb on the queue.
3423+ */
3424+
3425+static int queue_conflict(struct list_head *head, gd_lkb_t *lkb)
3426+{
3427+ gd_lkb_t *this;
3428+
3429+ list_for_each_entry(this, head, lkb_statequeue) {
3430+ if (this == lkb)
3431+ continue;
3432+ if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3433+ return TRUE;
3434+ }
3435+ return FALSE;
3436+}
3437+
3438+/*
3439+ * Deadlock can arise when using the QUECVT flag if the requested mode of the
3440+ * first converting lock is incompatible with the granted mode of another
3441+ * converting lock further down the queue. To prevent this deadlock, a
3442+ * requested QUEUECVT lock is granted immediately if adding it to the end of
3443+ * the queue would prevent a lock ahead of it from being granted.
3444+ */
3445+
3446+static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3447+{
3448+ gd_lkb_t *this;
3449+
3450+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3451+ if (this == lkb)
3452+ break;
3453+
3454+ if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
3455+ return TRUE;
3456+ }
3457+ return FALSE;
3458+}
3459+
3460+/*
3461+ * Return 1 if the lock can be granted, 0 otherwise.
3462+ * Also detect and resolve conversion deadlocks.
3463+ */
3464+
3465+static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
3466+{
3467+ if (lkb->lkb_rqmode == DLM_LOCK_NL)
3468+ return TRUE;
3469+
3470+ if (lkb->lkb_rqmode == lkb->lkb_grmode)
3471+ return TRUE;
3472+
3473+ if (queue_conflict(&rsb->res_grantqueue, lkb))
3474+ return FALSE;
3475+
3476+ if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
3477+ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3478+ return TRUE;
3479+
3480+ if (list_empty(&rsb->res_convertqueue) ||
3481+ first_in_list(lkb, &rsb->res_convertqueue) ||
3482+ queuecvt_deadlock_detect(rsb, lkb))
3483+ return TRUE;
3484+ else
3485+ return FALSE;
3486+ }
3487+
3488+ /* there *is* a conflict between this lkb and a converting lock so
3489+ we return false unless conversion deadlock resolution is permitted
3490+ (only conversion requests will have the CONVDEADLK flag set) */
3491+
3492+ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
3493+ return FALSE;
3494+
3495+ if (!conversion_deadlock_detect(rsb, lkb))
3496+ return FALSE;
3497+
3498+ if (conversion_deadlock_resolve(rsb, lkb))
3499+ return TRUE;
3500+
3501+ return FALSE;
3502+}
3503+
3504+int dlm_lock(void *lockspace,
3505+ uint32_t mode,
3506+ struct dlm_lksb *lksb,
3507+ uint32_t flags,
3508+ void *name,
3509+ unsigned int namelen,
3510+ uint32_t parent,
3511+ void (*ast) (void *astarg),
3512+ void *astarg,
3513+ void (*bast) (void *astarg, int mode),
3514+ struct dlm_range *range)
3515+{
3516+ gd_ls_t *lspace;
3517+ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3518+ int ret = -EINVAL;
3519+
3520+ lspace = find_lockspace_by_local_id(lockspace);
3521+ if (!lspace)
3522+ goto out;
3523+
3524+ if (mode < 0 || mode > DLM_LOCK_EX)
3525+ goto out;
3526+
3527+ if (namelen > DLM_RESNAME_MAXLEN)
3528+ goto out;
3529+
3530+ if (flags & DLM_LKF_CANCEL)
3531+ goto out;
3532+
3533+ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3534+ goto out;
3535+
3536+ if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
3537+ goto out;
3538+
3539+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3540+ goto out;
3541+
3542+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3543+ goto out;
3544+
3545+ if (!ast || !lksb)
3546+ goto out;
3547+
3548+ if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK))
3549+ goto out;
3550+
3551+ if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3552+ goto out;
3553+
3554+ /*
3555+ * Take conversion path.
3556+ */
3557+
3558+ if (flags & DLM_LKF_CONVERT) {
3559+ ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3560+ bast, range);
3561+ goto out;
3562+ }
3563+
3564+ /*
3565+ * Take new lock path.
3566+ */
3567+
3568+ if (parent) {
3569+ down_read(&lspace->ls_unlock_sem);
3570+
3571+ parent_lkb = find_lock_by_id(lspace, parent);
3572+
3573+ if (!parent_lkb ||
3574+ parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3575+ parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3576+ parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3577+ up_read(&lspace->ls_unlock_sem);
3578+ goto out;
3579+ }
3580+
3581+ atomic_inc(&parent_lkb->lkb_childcnt);
3582+ up_read(&lspace->ls_unlock_sem);
3583+ }
3584+
3585+ down_read(&lspace->ls_in_recovery);
3586+
3587+ ret = -ENOMEM;
3588+
3589+ lkb = create_lkb(lspace);
3590+ if (!lkb)
3591+ goto fail_dec;
3592+ lkb->lkb_astaddr = ast;
3593+ lkb->lkb_astparam = (long) astarg;
3594+ lkb->lkb_bastaddr = bast;
3595+ lkb->lkb_rqmode = mode;
3596+ lkb->lkb_grmode = DLM_LOCK_IV;
3597+ lkb->lkb_lksb = lksb;
3598+ lkb->lkb_parent = parent_lkb;
3599+ lkb->lkb_lockqueue_flags = flags;
3600+ lkb->lkb_lvbptr = lksb->sb_lvbptr;
3601+
3602+ /* Copy the range if appropriate */
3603+ if (range) {
3604+ if (range->ra_start > range->ra_end) {
3605+ ret = -EINVAL;
3606+ goto fail_free;
3607+ }
3608+
3609+ if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3610+ goto fail_free;
3611+ }
3612+
3613+ /* Convert relevant flags to internal numbers */
3614+ if (flags & DLM_LKF_VALBLK)
3615+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3616+ if (flags & DLM_LKF_PERSISTENT)
3617+ lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3618+ if (flags & DLM_LKF_NODLCKWT)
3619+ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3620+
3621+ lksb->sb_lkid = lkb->lkb_id;
3622+
3623+ ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3624+ if (ret)
3625+ goto fail_free;
3626+
3627+ up_read(&lspace->ls_in_recovery);
3628+
3629+ wake_astd();
3630+
3631+ return 0;
3632+
3633+ fail_free:
3634+ release_lkb(lspace, lkb);
3635+ goto fail_unlock;
3636+
3637+ fail_dec:
3638+ if (parent_lkb)
3639+ atomic_dec(&parent_lkb->lkb_childcnt);
3640+
3641+ fail_unlock:
3642+ up_read(&lspace->ls_in_recovery);
3643+
3644+ out:
3645+ return ret;
3646+}
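A hypothetical in-kernel caller sketch of the dlm_lock() entry point above (my_ast/take_ex_lock and the resource name are invented, error handling omitted): request an EX lock on the five-byte name "myres" with no special flags and pick up the result in the lksb from the completion ast:

/* Hypothetical caller, not from the patch. */
static struct dlm_lksb my_lksb;

static void my_ast(void *astarg)
{
	/* my_lksb.sb_status holds the outcome, my_lksb.sb_lkid the lock id */
}

static int take_ex_lock(void *lockspace)
{
	return dlm_lock(lockspace, DLM_LOCK_EX, &my_lksb, 0,
			"myres", 5, 0, my_ast, NULL, NULL, NULL);
}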
3647+
3648+int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name,
3649+ int namelen)
3650+{
3651+ gd_res_t *rsb, *parent_rsb = NULL;
3652+ gd_lkb_t *parent_lkb = lkb->lkb_parent;
3653+ gd_resdata_t *rd;
3654+ uint32_t nodeid;
3655+ int error;
3656+
3657+ if (parent_lkb)
3658+ parent_rsb = parent_lkb->lkb_resource;
3659+
3660+ error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
3661+ if (error)
3662+ goto out;
3663+
3664+ lkb->lkb_resource = rsb;
3665+ lkb->lkb_nodeid = rsb->res_nodeid;
3666+
3667+ /*
3668+ * Next stage, do we need to find the master or can
3669+ * we get on with the real locking work ?
3670+ */
3671+
3672+ if (rsb->res_nodeid == -1) {
3673+ if (get_directory_nodeid(rsb) != our_nodeid()) {
3674+ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3675+ goto out;
3676+ }
3677+
3678+ error = get_resdata(ls, our_nodeid(), rsb->res_name,
3679+ rsb->res_length, &rd, 0);
3680+ if (error)
3681+ goto out;
3682+
3683+ nodeid = rd->rd_master_nodeid;
3684+ if (nodeid == our_nodeid())
3685+ nodeid = 0;
3686+ rsb->res_nodeid = nodeid;
3687+ lkb->lkb_nodeid = nodeid;
3688+ rsb->res_resdir_seq = rd->rd_sequence;
3689+ }
3690+
3691+ error = dlm_lock_stage2(ls, lkb, rsb, flags);
3692+
3693+ out:
3694+ if (error)
3695+ release_rsb(rsb);
3696+
3697+ return error;
3698+}
3699+
3700+/*
3701+ * Locking routine called after we have an RSB, either a copy of a remote one
3702+ * or a local one, or perhaps a shiny new one all of our very own
3703+ */
3704+
3705+int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags)
3706+{
3707+ int error = 0;
3708+
3709+ if (rsb->res_nodeid) {
3710+ res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3711+ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
3712+ } else {
3713+ dlm_lock_stage3(lkb);
3714+ }
3715+
3716+ return error;
3717+}
3718+
3719+/*
3720+ * Called on an RSB's master node to do stage2 locking for a remote lock
3721+ * request. Returns a proper lkb with rsb ready for lock processing.
 3722+ * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
3723+ */
3724+
3725+gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
3726+ struct gd_remlockrequest *freq)
3727+{
3728+ gd_res_t *rsb = NULL, *parent_rsb = NULL;
3729+ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3730+ int error, namelen;
3731+
3732+ if (freq->rr_remparid) {
3733+ parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
3734+ if (!parent_lkb)
3735+ goto fail;
3736+
3737+ atomic_inc(&parent_lkb->lkb_childcnt);
3738+ parent_rsb = parent_lkb->lkb_resource;
3739+ }
3740+
3741+ /*
3742+ * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
3743+ * node actually holding the (non-MSTCPY) lkb. AST address are just
3744+ * flags in the master copy.
3745+ */
3746+
3747+ lkb = create_lkb(ls);
3748+ if (!lkb)
3749+ goto fail_dec;
3750+ lkb->lkb_grmode = DLM_LOCK_IV;
3751+ lkb->lkb_rqmode = freq->rr_rqmode;
3752+ lkb->lkb_parent = parent_lkb;
3753+ lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
3754+ lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
4bf12011 3755+ lkb->lkb_nodeid = remote_nodeid;
3756+ lkb->lkb_remid = freq->rr_header.rh_lkid;
3757+ lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
3758+ lkb->lkb_lockqueue_flags = freq->rr_flags;
3759+
3760+ if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
3761+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3762+ allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
3763+ if (!lkb->lkb_lvbptr)
3764+ goto fail_free;
3765+ }
3766+
3767+ if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
3768+ error = lkb_set_range(ls, lkb, freq->rr_range_start,
3769+ freq->rr_range_end);
3770+ if (error)
3771+ goto fail_free;
3772+ }
3773+
3774+ /*
3775+ * Get the RSB which this lock is for. Create a new RSB if this is a
3776+ * new lock on a new resource. We must be the master of any new rsb.
3777+ */
3778+
3779+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
3780+
3781+ error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
3782+ &rsb);
3783+ if (error)
3784+ goto fail_free;
3785+
3786+ lkb->lkb_resource = rsb;
3787+ if (rsb->res_nodeid == -1)
3788+ rsb->res_nodeid = 0;
3789+ if (freq->rr_resdir_seq)
3790+ rsb->res_resdir_seq = freq->rr_resdir_seq;
3791+
3792+ return lkb;
3793+
3794+
3795+ fail_free:
3796+ /* release_lkb handles parent */
3797+ release_lkb(ls, lkb);
3798+ parent_lkb = NULL;
3799+
3800+ fail_dec:
3801+ if (parent_lkb)
3802+ atomic_dec(&parent_lkb->lkb_childcnt);
3803+ fail:
3804+ return NULL;
3805+}
3806+
3807+/*
3808+ * The final bit of lock request processing on the master node. Here the lock
3809+ * is granted and the completion ast is queued, or the lock is put on the
3810+ * waitqueue and blocking asts are sent.
3811+ */
3812+
3813+void dlm_lock_stage3(gd_lkb_t *lkb)
3814+{
3815+ gd_res_t *rsb = lkb->lkb_resource;
3816+
3817+ /*
3818+ * This is a locally mastered lock on a resource that already exists,
3819+ * see if it can be granted or if it must wait. When this function is
3820+ * called for a remote lock request (process_cluster_request,
3821+ * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
3822+ * requesting node at the end of process_cluster_request, not at the
3823+ * end of grant_lock.
3824+ */
3825+
3826+ down_write(&rsb->res_lock);
3827+
3828+ if (can_be_granted(rsb, lkb)) {
3829+ grant_lock(lkb, 0);
3830+ goto out;
3831+ }
3832+
3833+ /*
3834+ * This request is not a conversion, so the lkb didn't exist other than
3835+ * for this request and should be freed after EAGAIN is returned in the
3836+ * ast.
3837+ */
3838+
3839+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4bf12011 3840+ lkb->lkb_retstatus = -EAGAIN;
4bf12011 3841+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
3842+ send_blocking_asts_all(rsb, lkb);
5cdbd17b 3843+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4bf12011 3844+ goto out;
3845+ }
3846+
3847+ /*
3848+ * The requested lkb must wait. Because the rsb of the requested lkb
3849+ * is mastered here, send blocking asts for the lkb's blocking the
3850+ * request.
3851+ */
3852+
3853+ lkb->lkb_retstatus = 0;
3854+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3855+
3856+ send_blocking_asts(rsb, lkb);
3857+
3858+ out:
3859+ up_write(&rsb->res_lock);
3860+}
3861+
3862+int dlm_unlock(void *lockspace,
3863+ uint32_t lkid,
3864+ uint32_t flags,
3865+ struct dlm_lksb *lksb,
3866+ void *astarg)
3867+{
3868+ gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
3869+ gd_lkb_t *lkb;
3870+ gd_res_t *rsb;
3871+ int ret = -EINVAL;
3872+
3873+ if (!ls)
3874+ goto out;
3875+
3876+ lkb = find_lock_by_id(ls, lkid);
3877+ if (!lkb)
3878+ goto out;
3879+
3880+ /* Can't dequeue a master copy (a remote node's mastered lock) */
3881+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3882+ goto out;
3883+
3884+ /* Already waiting for a remote lock operation */
3885+ if (lkb->lkb_lockqueue_state) {
3886+ ret = -EBUSY;
3887+ goto out;
3888+ }
3889+
3890+ /* Can only cancel WAITING or CONVERTing locks.
3891+ * This is just a quick check - it is also checked in unlock_stage2()
3892+ * (which may be on the master) under the semaphore.
3893+ */
3894+ if ((flags & DLM_LKF_CANCEL) &&
3895+ (lkb->lkb_status == GDLM_LKSTS_GRANTED))
3896+ goto out;
3897+
3898+ /* "Normal" unlocks must operate on a granted lock */
3899+ if (!(flags & DLM_LKF_CANCEL) &&
3900+ (lkb->lkb_status != GDLM_LKSTS_GRANTED))
3901+ goto out;
3902+
3903+ down_write(&ls->ls_unlock_sem);
3904+
3905+ /* Can't dequeue a lock with sublocks */
3906+ if (atomic_read(&lkb->lkb_childcnt)) {
3907+ up_write(&ls->ls_unlock_sem);
3908+ ret = -ENOTEMPTY;
3909+ goto out;
3910+ }
3911+
3912+ /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
3913+ if (!(flags & DLM_LKF_CANCEL))
3914+ lkb->lkb_flags |= GDLM_LKFLG_DELETED;
3915+ up_write(&ls->ls_unlock_sem);
3916+
3917+ /* Save any new params */
3918+ if (lksb)
3919+ lkb->lkb_lksb = lksb;
3920+ if (astarg)
3921+ lkb->lkb_astparam = (long) astarg;
3922+
3923+ lkb->lkb_lockqueue_flags = flags;
3924+
3925+ rsb = lkb->lkb_resource;
3926+
3927+ down_read(&ls->ls_in_recovery);
3928+
3929+ if (rsb->res_nodeid)
3930+ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
3931+ else
3932+ ret = dlm_unlock_stage2(lkb, flags);
3933+
3934+ up_read(&ls->ls_in_recovery);
3935+
3936+ wake_astd();
3937+
3938+ out:
3939+ return ret;
3940+}
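As a usage illustration only (not part of the patch): a kernel caller drives the two dlm_unlock() paths above roughly as below. The lockspace handle and the dlm_lksb filled in by an earlier dlm_lock() call are assumed, the helper names are invented, and the completion-callback details are not taken from this hunk.

static int example_drop_lock(void *ls, struct dlm_lksb *lksb, void *astarg)
{
	/*
	 * Normal unlock of a granted lock; sb_lkid was filled in by the
	 * earlier dlm_lock().  The completion AST later reports -DLM_EUNLOCK,
	 * as set in dlm_unlock_stage2() below.
	 */
	return dlm_unlock(ls, lksb->sb_lkid, 0, lksb, astarg);
}

static int example_cancel_request(void *ls, struct dlm_lksb *lksb, void *astarg)
{
	/*
	 * Cancel a request still on the wait or convert queue; the completion
	 * AST reports -DLM_ECANCEL and a cancelled conversion is put back on
	 * the granted queue (see dlm_unlock_stage2() below).
	 */
	return dlm_unlock(ls, lksb->sb_lkid, DLM_LKF_CANCEL, lksb, astarg);
}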
3941+
3942+int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
3943+{
3944+ gd_res_t *rsb = lkb->lkb_resource;
3945+ int old_status;
3946+ int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
3947+
3948+ down_write(&rsb->res_lock);
3949+
3950+ /* Can only cancel WAITING or CONVERTing locks */
3951+ if ((flags & DLM_LKF_CANCEL) &&
3952+ (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
3953+ lkb->lkb_retstatus = -EINVAL;
5cdbd17b 3954+ queue_ast(lkb, AST_COMP, 0);
4bf12011 3955+ goto out;
3956+ }
3957+
3958+ old_status = lkb_dequeue(lkb);
3959+
3960+ /*
3961+	 * If it was granted, grant any converting or waiting locks.
3962+ */
3963+
3964+ if (old_status == GDLM_LKSTS_GRANTED)
3965+ grant_pending_locks(rsb);
3966+
3967+ /*
3968+ * Cancelling a conversion
3969+ */
3970+
3971+ if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
3972+ /* VMS semantics say we should send blocking ASTs again here */
3973+ send_blocking_asts(rsb, lkb);
3974+
3975+ /* Remove from deadlock detection */
3976+ if (lkb->lkb_duetime)
3977+ remove_from_deadlockqueue(lkb);
3978+
3979+ /* Stick it back on the granted queue */
3980+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
3981+ lkb->lkb_rqmode = lkb->lkb_grmode;
3982+
3983+ /* Was it blocking any other locks? */
3984+ if (first_in_list(lkb, &rsb->res_convertqueue))
3985+ grant_pending_locks(rsb);
3986+
3987+ lkb->lkb_retstatus = -DLM_ECANCEL;
5cdbd17b 3988+ queue_ast(lkb, AST_COMP, 0);
4bf12011 3989+ goto out;
3990+ }
3991+
3992+ /*
3993+ * The lvb can be saved or cleared on unlock.
3994+ */
3995+
3996+ if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
3997+ if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
3998+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
3999+ if (flags & DLM_LKF_IVVALBLK)
4000+ memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4001+ }
4002+
4003+ lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
4004+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4bf12011 4005+
4006+ /*
4007+ * Only free the LKB if we are the master copy. Otherwise the AST
4008+ * delivery routine will free it after delivery. queue_ast for MSTCPY
4009+ * lkb just sends a message.
4010+ */
4011+
4012+ if (remote) {
4013+ up_write(&rsb->res_lock);
4014+ release_lkb(rsb->res_ls, lkb);
4015+ release_rsb(rsb);
4016+ goto out2;
4017+ }
4018+
4019+ out:
4020+ up_write(&rsb->res_lock);
4021+ out2:
4022+ wake_astd();
4023+ return 0;
4024+}
4025+
4026+/*
4027+ * Lock conversion
4028+ */
4029+
4030+static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb,
4031+ int flags, void *ast, void *astarg, void *bast,
4032+ struct dlm_range *range)
4033+{
4034+ gd_lkb_t *lkb;
4035+ gd_res_t *rsb;
4036+ int ret = -EINVAL;
4037+
4038+ lkb = find_lock_by_id(ls, lksb->sb_lkid);
4039+ if (!lkb) {
4040+ goto out;
4041+ }
4042+
4043+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4044+ ret = -EBUSY;
4045+ goto out;
4046+ }
4047+
4048+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4049+ goto out;
4050+ }
4051+
4052+ if ((flags & DLM_LKF_QUECVT) &&
4053+ !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4054+ goto out;
4055+ }
4056+
4057+ if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4058+ goto out;
4059+ }
4060+
4061+ if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
4062+ goto out;
4063+ }
4064+
4065+ /* Set up the ranges as appropriate */
4066+ if (range) {
4067+ if (range->ra_start > range->ra_end)
4068+ goto out;
4069+
4070+ if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4071+ ret = -ENOMEM;
4072+ goto out;
4073+ }
4074+ }
4075+
4076+ rsb = lkb->lkb_resource;
4077+ down_read(&rsb->res_ls->ls_in_recovery);
4078+
4079+ lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4080+ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4081+
4082+ if (flags & DLM_LKF_NODLCKWT)
4083+ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4084+ if (ast)
4085+ lkb->lkb_astaddr = ast;
4086+ if (astarg)
4087+ lkb->lkb_astparam = (long) astarg;
4088+ if (bast)
4089+ lkb->lkb_bastaddr = bast;
4090+ lkb->lkb_rqmode = mode;
4091+ lkb->lkb_lockqueue_flags = flags;
4092+ lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4093+ lkb->lkb_lvbptr = lksb->sb_lvbptr;
4094+
4095+ if (rsb->res_nodeid) {
4096+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4097+ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4098+ } else {
4099+ ret = dlm_convert_stage2(lkb, FALSE);
4100+ }
4101+
4102+ up_read(&rsb->res_ls->ls_in_recovery);
4103+
4104+ wake_astd();
4105+
4106+ out:
4107+ return ret;
4108+}
4109+
4110+/*
4111+ * For local conversion requests on locally mastered locks this is called
4112+ * directly from dlm_lock/convert_lock. This function is also called for
4113+ * remote conversion requests of MSTCPY locks (from process_cluster_request).
4114+ */
4115+
4116+int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
4117+{
4118+ gd_res_t *rsb = lkb->lkb_resource;
4119+ int ret = 0;
4120+
4121+ down_write(&rsb->res_lock);
4122+
4123+ if (can_be_granted(rsb, lkb)) {
4124+ grant_lock(lkb, 0);
4125+ grant_pending_locks(rsb);
4126+ goto out;
4127+ }
4128+
4129+ /*
4130+ * Remove lkb from granted queue.
4131+ */
4132+
4133+ lkb_dequeue(lkb);
4134+
4135+ /*
4136+	 * The user won't wait, so stick it back on the granted queue.
4137+ */
4138+
4139+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4140+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4141+ ret = lkb->lkb_retstatus = -EAGAIN;
4142+ if (do_ast)
5cdbd17b 4143+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4144+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4145+ send_blocking_asts_all(rsb, lkb);
4146+ goto out;
4147+ }
4148+
4149+ /*
4150+ * The lkb's status tells which queue it's on. Put back on convert
4151+ * queue. (QUECVT requests added at end of the queue, all others in
4152+ * order.)
4153+ */
4154+
4155+ lkb->lkb_retstatus = 0;
4156+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4157+
4158+ /*
4159+	 * The request can't be granted now, so send blocking ASTs to the granted locks blocking it.
4160+ */
4161+
4162+ send_blocking_asts(rsb, lkb);
4163+
4164+ if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4165+ add_to_deadlockqueue(lkb);
4166+
4167+ out:
4168+ up_write(&rsb->res_lock);
4169+ return ret;
4170+}
4171+
4172+/*
4173+ * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4174+ * completion ast. rsb res_lock must be held in write when this is called.
4175+ */
4176+
4177+static void grant_lock(gd_lkb_t *lkb, int send_remote)
4178+{
4179+ gd_res_t *rsb = lkb->lkb_resource;
4180+
4181+ if (lkb->lkb_duetime)
4182+ remove_from_deadlockqueue(lkb);
4183+
4184+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4185+ int b;
4186+ GDLM_ASSERT(lkb->lkb_lvbptr,);
4187+
4188+ if (!rsb->res_lvbptr)
4189+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4190+
4191+ b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4192+ if (b)
4193+ memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4194+ else
4195+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4196+ }
4197+
4198+ if (lkb->lkb_range) {
4199+ lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4200+ lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4201+ }
4202+
4203+ lkb->lkb_grmode = lkb->lkb_rqmode;
4204+ lkb->lkb_rqmode = DLM_LOCK_IV;
4205+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4206+
4207+ lkb->lkb_highbast = 0;
4208+ lkb->lkb_retstatus = 0;
5cdbd17b 4209+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4210+
4211+ /*
4212+ * A remote conversion request has been granted, either immediately
4213+ * upon being requested or after waiting a bit. In the former case,
4214+ * reply_and_grant() is called. In the later case send_remote is 1 and
4215+	 * reply_and_grant() is called. In the latter case send_remote is 1 and
4216+ *
4217+ * The "send_remote" flag is set only for locks which are granted "out
4218+	 * of band" - i.e. by another lock being converted or unlocked.
4219+ *
4220+ * The second case occurs when this lkb is granted right away as part
4221+ * of processing the initial request. In that case, we send a single
4222+ * message in reply_and_grant which combines the request reply with the
4223+ * grant message.
4224+ */
4225+
4226+ if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4227+ if (send_remote)
4228+ remote_grant(lkb);
4229+ else if (lkb->lkb_request)
4230+ reply_and_grant(lkb);
4231+ }
4232+
4233+}
4234+
4235+static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb)
4236+{
4237+ gd_lkb_t *gr;
4238+
4239+ list_for_each_entry(gr, head, lkb_statequeue) {
4240+ if (gr->lkb_bastaddr &&
4241+ gr->lkb_highbast < lkb->lkb_rqmode &&
4242+ ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
5cdbd17b 4243+ queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4bf12011 4244+ gr->lkb_highbast = lkb->lkb_rqmode;
4245+ }
4246+ }
4247+}
4248+
4249+/*
4250+ * Notify granted locks if they are blocking a newly forced-to-wait lock.
4251+ */
4252+
4253+static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
4254+{
4255+ send_bast_queue(&rsb->res_grantqueue, lkb);
4256+ /* check if the following improves performance */
4257+ /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4258+}
4259+
4260+static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
4261+{
4262+ send_bast_queue(&rsb->res_grantqueue, lkb);
4263+ send_bast_queue(&rsb->res_convertqueue, lkb);
4264+}
4265+
4266+/*
4267+ * Called when a lock has been dequeued. Look for any locks to grant that are
4268+ * waiting for conversion or waiting to be granted.
4269+ * The rsb res_lock must be held in write when this function is called.
4270+ */
4271+
4272+int grant_pending_locks(gd_res_t *rsb)
4273+{
4274+ gd_lkb_t *lkb;
4275+ struct list_head *list;
4276+ struct list_head *temp;
4277+ int8_t high = DLM_LOCK_IV;
4278+
4279+ list_for_each_safe(list, temp, &rsb->res_convertqueue) {
4280+ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4281+
4282+ if (can_be_granted(rsb, lkb))
4283+ grant_lock(lkb, 1);
4284+ else
4285+ high = MAX(lkb->lkb_rqmode, high);
4286+ }
4287+
4288+ list_for_each_safe(list, temp, &rsb->res_waitqueue) {
4289+ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4290+
4291+ if (can_be_granted(rsb, lkb))
4292+ grant_lock(lkb, 1);
4293+ else
4294+ high = MAX(lkb->lkb_rqmode, high);
4295+ }
4296+
4297+ /*
4298+ * If there are locks left on the wait/convert queue then send blocking
4299+ * ASTs to granted locks that are blocking
4300+ *
4301+ * FIXME: This might generate some spurious blocking ASTs for range
4302+ * locks.
4303+ */
4304+
4305+ if (high > DLM_LOCK_IV) {
4306+ list_for_each_safe(list, temp, &rsb->res_grantqueue) {
4307+ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4308+
4309+ if (lkb->lkb_bastaddr &&
4310+ (lkb->lkb_highbast < high) &&
4311+ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4312+
5cdbd17b 4313+ queue_ast(lkb, AST_BAST, high);
4bf12011 4314+ lkb->lkb_highbast = high;
4315+ }
4316+ }
4317+ }
4318+
4319+ return 0;
4320+}
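grant_pending_locks() and send_bast_queue() decide what blocks what through the mode compatibility matrix (__dlm_compat_matrix / modes_compat()), which is defined elsewhere in the patch. As a worked illustration only, the conventional VMS-style compatibility table this code relies on looks like the sketch below, indexed 0..5 for NL..EX rather than the grmode+1 offsets used above.

static const int example_mode_compat[6][6] = {
	/*          NL CR CW PR PW EX */
	/* NL */  { 1, 1, 1, 1, 1, 1 },
	/* CR */  { 1, 1, 1, 1, 1, 0 },
	/* CW */  { 1, 1, 1, 0, 0, 0 },
	/* PR */  { 1, 1, 0, 1, 0, 0 },
	/* PW */  { 1, 1, 0, 0, 0, 0 },
	/* EX */  { 1, 0, 0, 0, 0, 0 },
};

/*
 * Example: a granted DLM_LOCK_PR does not block another PR request, but it
 * does block PW and EX, so grant_pending_locks() leaves those lkbs queued
 * and queue_ast(..., AST_BAST, ...) notifies the PR holder.
 */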
4321+
4322+/*
4323+ * Called to cancel a locking operation that failed due to some internal
4324+ * reason.
4325+ *
4326+ * Waiting locks will be removed, converting locks will be reverted to their
4327+ * granted status, unlocks will be left where they are.
4328+ *
4329+ * A completion AST will be delivered to the caller.
4330+ */
4331+
4332+int cancel_lockop(gd_lkb_t *lkb, int status)
4333+{
4334+ int state = lkb->lkb_lockqueue_state;
5cdbd17b 4335+ uint16_t astflags = AST_COMP;
4bf12011 4336+
4337+ lkb->lkb_lockqueue_state = 0;
4338+
4339+ switch (state) {
4340+ case GDLM_LQSTATE_WAIT_RSB:
5cdbd17b 4341+ astflags |= AST_DEL;
4bf12011 4342+ break;
4343+
4344+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4345+ res_lkb_dequeue(lkb);
5cdbd17b 4346+ astflags |= AST_DEL;
4bf12011 4347+ break;
4348+
4349+ case GDLM_LQSTATE_WAIT_CONVERT:
4350+ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4351+
4352+ /* Remove from deadlock detection */
4353+ if (lkb->lkb_duetime) {
4354+ remove_from_deadlockqueue(lkb);
4355+ }
4356+ break;
4357+
4358+ case GDLM_LQSTATE_WAIT_UNLOCK:
4359+ /* We can leave this. I think.... */
4360+ break;
4361+ }
4362+
4363+ lkb->lkb_retstatus = status;
5cdbd17b 4364+ queue_ast(lkb, astflags, 0);
4bf12011 4365+
4366+ return 0;
4367+}
4368+
4369+/*
4370+ * Check for conversion deadlock. If a deadlock was found
4371+ * return lkb to kill, else return NULL
4372+ */
4373+
4374+gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb)
4375+{
4376+ gd_res_t *rsb = lkb->lkb_resource;
4377+ struct list_head *entry;
4378+
4379+ GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4380+
4381+ /* Work our way up to the head of the queue looking for locks that
4382+ * conflict with us */
4383+
4384+ down_read(&rsb->res_lock);
4385+
4386+ entry = lkb->lkb_statequeue.prev;
4387+ while (entry != &rsb->res_convertqueue) {
4388+ gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue);
4389+
4390+ if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4391+ up_read(&rsb->res_lock);
4392+ return lkb;
4393+ }
4394+ entry = entry->prev;
4395+ }
4396+ up_read(&rsb->res_lock);
4397+
4398+ return 0;
4399+}
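A worked example of the situation this walk detects (illustration only, lock names invented):

/*
 *	lkb B: lkb_grmode = DLM_LOCK_PR, lkb_rqmode = DLM_LOCK_EX   (nearer the queue head)
 *	lkb A: lkb_grmode = DLM_LOCK_PR, lkb_rqmode = DLM_LOCK_EX   (behind B on res_convertqueue)
 */

Neither conversion can be granted while the other still holds PR. Running conversion_deadlock_check() for A walks towards the queue head, finds B, sees that B's granted mode conflicts with A's requested mode (ranges permitting), and returns A as the lock to kill; the deadlock scan can then cancel A's conversion through cancel_conversion() below, with the caller-supplied status passed back to the user.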
4400+
4401+/*
4402+ * Conversion operation was cancelled by us (not the user).
4403+ * ret contains the return code to pass on to the user.
4404+ */
4405+
4406+void cancel_conversion(gd_lkb_t *lkb, int ret)
4407+{
4408+ gd_res_t *rsb = lkb->lkb_resource;
4409+
4410+ /* Stick it back on the granted queue */
4411+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4412+ lkb->lkb_rqmode = lkb->lkb_grmode;
4413+
4414+ remove_from_deadlockqueue(lkb);
4415+
4416+ lkb->lkb_retstatus = ret;
5cdbd17b 4417+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4418+ wake_astd();
4419+}
4420+
4421+/*
4422+ * As new master of the rsb for this lkb, we need to handle these requests
4423+ * removed from the lockqueue and originating from local processes:
4424+ * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4425+ * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4426+ */
4427+
4428+void process_remastered_lkb(gd_lkb_t *lkb, int state)
4429+{
4430+ switch (state) {
4431+ case GDLM_LQSTATE_WAIT_RSB:
4432+ dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4433+ lkb->lkb_lockqueue_flags,
4434+ lkb->lkb_resource->res_name,
4435+ lkb->lkb_resource->res_length);
4436+ break;
4437+
4438+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4439+ res_lkb_dequeue(lkb);
4440+ dlm_lock_stage3(lkb);
4441+ break;
4442+
4443+ case GDLM_LQSTATE_WAIT_UNLOCK:
4444+ dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags);
4445+ break;
4446+
4447+ case GDLM_LQSTATE_WAIT_CONVERT:
4448+ dlm_convert_stage2(lkb, TRUE);
4449+ break;
4450+
4451+ default:
4452+ GDLM_ASSERT(0,);
4453+ }
4454+}
4455diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4456--- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4457+++ linux-patched/cluster/dlm/locking.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 4458@@ -0,0 +1,33 @@
4459+/******************************************************************************
4460+*******************************************************************************
4461+**
4462+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4463+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4464+**
4465+** This copyrighted material is made available to anyone wishing to use,
4466+** modify, copy, or redistribute it subject to the terms and conditions
4467+** of the GNU General Public License v.2.
4468+**
4469+*******************************************************************************
4470+******************************************************************************/
4471+
4472+#ifndef __LOCKING_DOT_H__
4473+#define __LOCKING_DOT_H__
4474+
4475+void process_remastered_lkb(gd_lkb_t * lkb, int state);
4476+void dlm_lock_stage3(gd_lkb_t * lkb);
4477+int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast);
4478+int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags);
4479+int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb,
4480+ int flags);
4481+gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen);
4482+int free_rsb_if_unused(gd_res_t * rsb);
4483+gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace,
4484+ struct gd_remlockrequest *freq);
4485+int cancel_lockop(gd_lkb_t * lkb, int status);
4486+int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags);
4487+int grant_pending_locks(gd_res_t * rsb);
4488+void cancel_conversion(gd_lkb_t * lkb, int ret);
4489+gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb);
4490+
4491+#endif /* __LOCKING_DOT_H__ */
4492diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4493--- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4494+++ linux-patched/cluster/dlm/lockqueue.c 2004-06-29 20:01:20.000000000 +0800
4495@@ -0,0 +1,957 @@
4bf12011 4496+/******************************************************************************
4497+*******************************************************************************
4498+**
4499+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4500+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4501+**
4502+** This copyrighted material is made available to anyone wishing to use,
4503+** modify, copy, or redistribute it subject to the terms and conditions
4504+** of the GNU General Public License v.2.
4505+**
4506+*******************************************************************************
4507+******************************************************************************/
4508+
4509+/*
4510+ * lockqueue.c
4511+ *
4512+ * This controls the lock queue, which is where locks
4513+ * come when they need to wait for a remote operation
4514+ * to complete.
4515+ *
4516+ * This could also be thought of as the "high-level" comms
4517+ * layer.
4518+ *
4519+ */
4520+
4521+#include "dlm_internal.h"
4522+#include "lockqueue.h"
4523+#include "dir.h"
4524+#include "locking.h"
4525+#include "lkb.h"
4526+#include "lowcomms.h"
4527+#include "midcomms.h"
4528+#include "reccomms.h"
4529+#include "nodes.h"
4530+#include "lockspace.h"
4531+#include "ast.h"
4532+#include "memory.h"
4533+#include "rsb.h"
4534+#include "queries.h"
4535+
4536+static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply);
4537+static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req);
4538+
4539+/*
4540+ * format of an entry on the request queue
4541+ */
4542+struct rq_entry {
4543+ struct list_head rqe_list;
4544+ uint32_t rqe_nodeid;
4545+ char rqe_request[1];
4546+};
4547+
4548+/*
4549+ * Add a new request (if appropriate) to the request queue and send the remote
4550+ * request out.  Runs in the context of the locking caller.
4551+ *
4552+ * Recovery of a remote_stage request if the remote end fails while the lkb
4553+ * is still on the lockqueue:
4554+ *
4555+ * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4556+ * lockqueue_lkb_mark() at the start of recovery.
4557+ *
4558+ * o Some lkb's will be rebuilt on new master rsb's during recovery.
4559+ * (depends on the type of request, see below).
4560+ *
4561+ * o At the end of recovery, resend_cluster_requests() looks at these
4562+ * LQRESEND lkb's and either:
4563+ *
4564+ * i) resends the request to the new master for the rsb where the
4565+ * request is processed as usual. The lkb remains on the lockqueue until
4566+ * the new master replies and we run process_lockqueue_reply().
4567+ *
4568+ *   ii) if we've become the rsb master, removes the lkb from the lockqueue
4569+ * and processes the request locally via process_remastered_lkb().
4570+ *
4571+ * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4572+ * and the request should be resent if dest node is failed.
4573+ *
4574+ * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4575+ * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4576+ * makes send_lkb_queue() skip it). Resend this request to the new master.
4577+ *
4578+ * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4579+ * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4580+ * Resend this request to the new master.
4581+ *
4582+ * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4583+ * It will be rebuilt on the new master rsb's granted queue. Resend this
4584+ * request to the new master.
4585+ */
4586+
4587+int remote_stage(gd_lkb_t *lkb, int state)
4588+{
4589+ int error;
4590+
4591+ lkb->lkb_lockqueue_state = state;
4592+ add_to_lockqueue(lkb);
4593+
4594+ error = send_cluster_request(lkb, state);
4595+ if (error < 0) {
4596+ log_print("remote_stage error sending request %d", error);
4597+
4598+ /* Leave on lockqueue, it will be resent to correct node during
4599+ * recovery. */
4600+
4601+ /*
4602+ lkb->lkb_lockqueue_state = 0;
4603+ remove_from_lockqueue(lkb);
4604+ return -ENOTCONN;
4605+ */
4606+ }
4607+ return 0;
4608+}
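For reference while reading the recovery comment above, the lockqueue states map onto the wire commands built in send_cluster_request() further down; the numeric values are those given in the comment, and the authoritative definitions live in a header outside this hunk:

/*
 *	GDLM_LQSTATE_WAIT_RSB        (1)  resource directory lookup        -> GDLM_REMCMD_LOOKUP
 *	GDLM_LQSTATE_WAIT_CONVERT    (2)  conversion sent to the master    -> GDLM_REMCMD_CONVREQUEST
 *	GDLM_LQSTATE_WAIT_CONDGRANT  (3)  lock request sent to the master  -> GDLM_REMCMD_LOCKREQUEST
 *	GDLM_LQSTATE_WAIT_UNLOCK     (4)  unlock/cancel sent to the master -> GDLM_REMCMD_UNLOCKREQUEST
 */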
4609+
4610+/*
4611+ * Requests received while the lockspace is in recovery get added to the
4612+ * request queue and processed when recovery is complete.
4613+ */
4614+
4615+void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length)
4616+{
4617+ struct rq_entry *entry;
4618+
4619+ if (in_nodes_gone(ls, nodeid))
4620+ return;
4621+
4622+ entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
4623+ if (!entry) {
4624+ // TODO something better
4625+ printk("dlm: add_to_requestqueue: out of memory\n");
4626+ return;
4627+ }
4628+
4629+ log_debug(ls, "add_to_requestqueue %d", nodeid);
4630+ entry->rqe_nodeid = nodeid;
4631+ memcpy(entry->rqe_request, request, length);
4632+ list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
4633+}
4634+
4635+int process_requestqueue(gd_ls_t *ls)
4636+{
4637+ int error = 0, count = 0;
4638+ struct rq_entry *entry, *safe;
4639+ struct gd_req_header *req;
4640+
4641+ log_all(ls, "process held requests");
4642+
4643+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4644+ req = (struct gd_req_header *) entry->rqe_request;
4645+ log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
4646+
4647+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4648+ log_debug(ls, "process_requestqueue aborted");
4649+ error = -EINTR;
4650+ break;
4651+ }
4652+
4653+ error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
4654+ if (error == -EINTR) {
4655+ log_debug(ls, "process_requestqueue interrupted");
4656+ break;
4657+ }
4658+
4659+ list_del(&entry->rqe_list);
4660+ kfree(entry);
4661+ count++;
4662+ error = 0;
4663+ }
4664+
4665+ log_all(ls, "processed %d requests", count);
4666+ return error;
4667+}
4668+
4669+void wait_requestqueue(gd_ls_t *ls)
4670+{
4671+ while (!list_empty(&ls->ls_requestqueue) &&
4672+ test_bit(LSFL_LS_RUN, &ls->ls_flags))
4673+ schedule();
4674+}
4675+
4676+/*
4677+ * Resdir requests (lookup or remove) and replies from before recovery are
4678+ * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
4679+ * gone are also invalid.
4680+ */
4681+
4682+void purge_requestqueue(gd_ls_t *ls)
4683+{
4684+ int count = 0;
4685+ struct rq_entry *entry, *safe;
4686+ struct gd_req_header *req;
4687+ struct gd_remlockrequest *freq;
4688+ gd_lkb_t *lkb;
4689+
4690+ log_all(ls, "purge requests");
4691+
4692+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4693+ req = (struct gd_req_header *) entry->rqe_request;
4694+ freq = (struct gd_remlockrequest *) req;
4695+
4696+ if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
4697+ req->rh_cmd == GDLM_REMCMD_LOOKUP ||
4698+ in_nodes_gone(ls, entry->rqe_nodeid)) {
4699+
4700+ list_del(&entry->rqe_list);
4701+ kfree(entry);
4702+ count++;
4703+
4704+ } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
4705+
4706+ /*
4707+ * Replies to resdir lookups are invalid and must be
4708+ * purged. The lookup requests are marked in
4709+ * lockqueue_lkb_mark and will be resent in
4710+ * resend_cluster_requests. The only way to check if
4711+ * this is a lookup reply is to look at the
4712+ * lockqueue_state of the lkb.
4713+ */
4714+
4715+ lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
4716+ GDLM_ASSERT(lkb,);
4717+ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
4718+ list_del(&entry->rqe_list);
4719+ kfree(entry);
4720+ count++;
4721+ }
4722+ }
4723+ }
4724+
4725+ log_all(ls, "purged %d requests", count);
4726+}
4727+
4728+/*
4729+ * Check if there's a reply for the given lkid in the requestqueue.
4730+ */
4731+
4732+int reply_in_requestqueue(gd_ls_t *ls, int lkid)
4733+{
4734+ int rv = FALSE;
4735+ struct rq_entry *entry, *safe;
4736+ struct gd_req_header *req;
4737+ struct gd_remlockrequest *freq;
4738+
4739+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4740+ req = (struct gd_req_header *) entry->rqe_request;
4741+ freq = (struct gd_remlockrequest *) req;
4742+
4743+ if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
4744+ freq->rr_header.rh_lkid == lkid) {
4745+ rv = TRUE;
4746+ break;
4747+ }
4748+ }
4749+
4750+ return rv;
4751+}
4752+
4753+void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src)
4754+{
4755+ if (!*lvbptr)
4756+ *lvbptr = allocate_lvb(ls);
4757+ if (*lvbptr)
4758+ memcpy(*lvbptr, src, DLM_LVB_LEN);
4759+}
4760+
4761+/*
4762+ * Process a lockqueue LKB after its remote processing has completed and it has
4763+ * been pulled from the lockqueue. Runs in the context of the DLM recvd thread on
4764+ * the machine that requested the lock.
4765+ */
4766+
4767+static void process_lockqueue_reply(gd_lkb_t *lkb,
4768+ struct gd_remlockreply *reply)
4769+{
4770+ int state = lkb->lkb_lockqueue_state;
4771+ int oldstate;
4772+ gd_res_t *rsb = lkb->lkb_resource;
4773+ gd_ls_t *ls = rsb->res_ls;
4774+
4775+ lkb->lkb_lockqueue_state = 0;
4776+ if (state)
4777+ remove_from_lockqueue(lkb);
4778+
4779+ switch (state) {
4780+ case GDLM_LQSTATE_WAIT_RSB:
4781+
4782+ GDLM_ASSERT(reply->rl_status == 0,);
4783+
4784+ if (reply->rl_nodeid == our_nodeid())
4785+ rsb->res_nodeid = 0;
4786+ else
4787+ rsb->res_nodeid = reply->rl_nodeid;
4788+
4789+ rsb->res_resdir_seq = reply->rl_resdir_seq;
4790+ lkb->lkb_nodeid = rsb->res_nodeid;
4791+
4792+ dlm_lock_stage2(rsb->res_ls, lkb, rsb,
4793+ lkb->lkb_lockqueue_flags);
4794+ break;
4795+
4796+ case GDLM_LQSTATE_WAIT_CONVERT:
4797+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4798+
4799+ /*
4800+ * After a remote lock/conversion/grant request we put the lock
4801+ * on the right queue and send an AST if appropriate. Any lock
4802+			 * shuffling (e.g. newly granted locks because this one was
4803+			 * converted downwards) will be dealt with in separate messages
4804+			 * (which may arrive in the same network message).
4805+ */
4806+
4807+ if (!lkb->lkb_remid)
4808+ lkb->lkb_remid = reply->rl_lkid;
4809+
4810+ /*
4811+ * The remote request failed (we assume because of NOQUEUE).
4812+ * If this is a new request (non-conv) the lkb was created just
4813+ * for it so the lkb should be freed. If this was a
4814+ * conversion, the lkb already existed so we should put it back
4815+ * on the grant queue.
4816+ */
4817+
4818+ if (reply->rl_status != 0) {
4819+ GDLM_ASSERT(reply->rl_status == -EAGAIN,);
4820+
4821+ if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
4822+ res_lkb_dequeue(lkb);
4823+ lkb->lkb_retstatus = reply->rl_status;
4824+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4825+ } else {
4bf12011 4826+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4827+ lkb->lkb_retstatus = reply->rl_status;
4828+ queue_ast(lkb, AST_COMP, 0);
4829+ }
4bf12011 4830+ break;
4831+ }
4832+
4833+ /*
4834+ * The remote request was successful in granting the request or
4835+ * queuing it to be granted later. Add the lkb to the
4836+ * appropriate rsb queue.
4837+ */
4838+
4839+ switch (reply->rl_lockstate) {
4840+ case GDLM_LKSTS_GRANTED:
4841+
4842+ /* Compact version of grant_lock(). */
4843+
4844+ down_write(&rsb->res_lock);
4845+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
4846+ memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
4847+ DLM_LVB_LEN);
4848+
4849+ lkb->lkb_grmode = lkb->lkb_rqmode;
4850+ lkb->lkb_rqmode = DLM_LOCK_IV;
4851+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4852+
4853+ if (lkb->lkb_range) {
4854+ lkb->lkb_range[GR_RANGE_START] =
4855+ lkb->lkb_range[RQ_RANGE_START];
4856+ lkb->lkb_range[GR_RANGE_END] =
4857+ lkb->lkb_range[RQ_RANGE_END];
4858+ }
4859+ up_write(&rsb->res_lock);
4860+
4861+ lkb->lkb_retstatus = 0;
5cdbd17b 4862+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4863+ break;
4864+
4865+ case GDLM_LKSTS_WAITING:
4866+
4867+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4868+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4869+ else
4870+ log_error(ls, "wait reply for granted %x %u",
4871+ lkb->lkb_id, lkb->lkb_nodeid);
4872+ break;
4873+
4874+ case GDLM_LKSTS_CONVERT:
4875+
4876+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4877+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4878+ else
4879+ log_error(ls, "convert reply for granted %x %u",
4880+ lkb->lkb_id, lkb->lkb_nodeid);
4881+ break;
4882+
4883+ default:
4884+ log_error(ls, "process_lockqueue_reply state %d",
4885+ reply->rl_lockstate);
4886+ }
4887+
4888+ break;
4889+
4890+ case GDLM_LQSTATE_WAIT_UNLOCK:
4891+
4892+ /*
4893+ * Unlocks should never fail. Update local lock info. This
4894+		 * always sends a completion AST with the status in the lksb.
4895+ */
4896+
4897+ GDLM_ASSERT(reply->rl_status == 0,);
4898+ oldstate = res_lkb_dequeue(lkb);
4899+
4900+ /* Differentiate between unlocks and conversion cancellations */
4901+ if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
4902+ oldstate == GDLM_LKSTS_CONVERT) {
4903+ res_lkb_enqueue(lkb->lkb_resource, lkb,
4904+ GDLM_LKSTS_GRANTED);
4905+ lkb->lkb_retstatus = -DLM_ECANCEL;
5cdbd17b 4906+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4907+ } else {
4bf12011 4908+ lkb->lkb_retstatus = -DLM_EUNLOCK;
5cdbd17b 4909+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4bf12011 4910+ }
4bf12011 4911+ break;
4912+
4913+ default:
4914+ log_error(ls, "process_lockqueue_reply id %x state %d",
4915+ lkb->lkb_id, state);
4916+ }
4917+}
4918+
4919+/*
4920+ * Tell a remote node to grant a lock. This happens when we hold the master
4921+ * copy of a lock that is actually held on a remote node. The remote end is
4922+ * also responsible for sending the completion AST.
4923+ */
4924+
4925+void remote_grant(gd_lkb_t *lkb)
4926+{
4927+ struct writequeue_entry *e;
4928+ struct gd_remlockrequest *req;
4929+
4930+ // TODO Error handling
4931+ e = lowcomms_get_buffer(lkb->lkb_nodeid,
4932+ sizeof(struct gd_remlockrequest),
4933+ lkb->lkb_resource->res_ls->ls_allocation,
4934+ (char **) &req);
4935+ if (!e)
4936+ return;
4937+
4938+ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
4939+ req->rr_header.rh_length = sizeof(struct gd_remlockrequest);
4940+ req->rr_header.rh_flags = 0;
4941+ req->rr_header.rh_lkid = lkb->lkb_id;
4942+ req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
4943+ req->rr_remlkid = lkb->lkb_remid;
4944+ req->rr_flags = 0;
4945+
4946+ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
4947+ /* This is a confusing non-standard use of rr_flags which is
4948+ * usually used to pass lockqueue_flags. */
4949+ req->rr_flags |= GDLM_LKFLG_DEMOTED;
4950+ }
4951+
4952+ add_request_lvb(lkb, req);
4953+ midcomms_send_buffer(&req->rr_header, e);
4954+}
4955+
4956+void reply_and_grant(gd_lkb_t *lkb)
4957+{
4958+ struct gd_remlockrequest *req = lkb->lkb_request;
4959+ struct gd_remlockreply *reply;
4960+ struct writequeue_entry *e;
4961+
4962+ // TODO Error handling
4963+ e = lowcomms_get_buffer(lkb->lkb_nodeid,
4964+ sizeof(struct gd_remlockreply),
4965+ lkb->lkb_resource->res_ls->ls_allocation,
4966+ (char **) &reply);
4967+ if (!e)
4968+ return;
4969+
4970+ reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
4971+ reply->rl_header.rh_flags = 0;
4972+ reply->rl_header.rh_length = sizeof(struct gd_remlockreply);
4973+ reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
4974+ reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
4975+
4976+ reply->rl_status = lkb->lkb_retstatus;
4977+ reply->rl_lockstate = lkb->lkb_status;
4978+ reply->rl_lkid = lkb->lkb_id;
4979+
4980+ GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
4981+
4982+ lkb->lkb_request = NULL;
4983+
4984+ add_reply_lvb(lkb, reply);
4985+ midcomms_send_buffer(&reply->rl_header, e);
4986+}
4987+
4988+/*
4989+ * Request removal of a dead entry in the resource directory
4990+ */
4991+
4992+void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen,
4993+ uint8_t sequence)
4994+{
4995+ struct writequeue_entry *e;
4996+ struct gd_remlockrequest *req;
4997+
4998+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4999+ gd_rcom_t *rc = allocate_rcom_buffer(ls);
5000+
5001+ memcpy(rc->rc_buf, name, namelen);
5002+ rc->rc_datalen = namelen;
5003+
5004+ rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5005+
5006+ free_rcom_buffer(rc);
5007+ return;
5008+ }
5009+ // TODO Error handling
5010+ e = lowcomms_get_buffer(nodeid,
5011+ sizeof(struct gd_remlockrequest) + namelen - 1,
5012+ ls->ls_allocation, (char **) &req);
5013+ if (!e)
5014+ return;
5015+
5016+ memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1);
5017+ req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5018+ req->rr_header.rh_length =
5019+ sizeof(struct gd_remlockrequest) + namelen - 1;
5020+ req->rr_header.rh_flags = 0;
5021+ req->rr_header.rh_lkid = 0;
5022+ req->rr_header.rh_lockspace = ls->ls_global_id;
5023+ req->rr_remlkid = 0;
5024+ req->rr_resdir_seq = sequence;
5025+ memcpy(req->rr_name, name, namelen);
5026+
5027+ midcomms_send_buffer(&req->rr_header, e);
5028+}
5029+
5030+/*
5031+ * Send remote cluster request to directory or master node before the request
5032+ * is put on the lock queue. Runs in the context of the locking caller.
5033+ */
5034+
5035+int send_cluster_request(gd_lkb_t *lkb, int state)
5036+{
5037+ uint32_t target_nodeid;
5038+ gd_res_t *rsb = lkb->lkb_resource;
5039+ gd_ls_t *ls = rsb->res_ls;
5040+ struct gd_remlockrequest *req;
5041+ struct writequeue_entry *e;
5042+
5043+ /* Need to know the target nodeid before we allocate a send buffer */
5044+ target_nodeid = lkb->lkb_nodeid;
5045+ GDLM_ASSERT(target_nodeid != 0,);
5046+
5047+ if (state == GDLM_LQSTATE_WAIT_RSB)
5048+ target_nodeid = get_directory_nodeid(rsb);
5049+
5050+ GDLM_ASSERT(target_nodeid,);
5051+
5052+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5053+ /* this may happen when called by resend_cluster_request */
5054+ log_error(ls, "send_cluster_request to %u state %d recovery",
5055+ target_nodeid, state);
5056+ }
5057+
5058+ e = lowcomms_get_buffer(target_nodeid,
5059+ sizeof(struct gd_remlockrequest) +
5060+ rsb->res_length - 1, ls->ls_allocation,
5061+ (char **) &req);
5062+ if (!e)
5063+ return -ENOBUFS;
5064+ memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1);
5065+
5066+ /* Common stuff, some are just defaults */
5067+
5068+ if (lkb->lkb_bastaddr)
5cdbd17b 5069+ req->rr_asts = AST_BAST;
4bf12011 5070+ if (lkb->lkb_astaddr)
5cdbd17b 5071+ req->rr_asts |= AST_COMP;
4bf12011 5072+ if (lkb->lkb_parent)
5073+ req->rr_remparid = lkb->lkb_parent->lkb_remid;
5074+
5075+ req->rr_flags = lkb->lkb_lockqueue_flags;
5076+ req->rr_rqmode = lkb->lkb_rqmode;
5077+ req->rr_remlkid = lkb->lkb_remid;
5078+ req->rr_header.rh_length =
5079+ sizeof(struct gd_remlockrequest) + rsb->res_length - 1;
5080+ req->rr_header.rh_flags = 0;
5081+ req->rr_header.rh_lkid = lkb->lkb_id;
5082+ req->rr_header.rh_lockspace = ls->ls_global_id;
5083+
5084+ switch (state) {
5085+
5086+ case GDLM_LQSTATE_WAIT_RSB:
5087+
5088+ /* The lock must be a root lock */
5089+ GDLM_ASSERT(!lkb->lkb_parent,);
5090+
5091+ req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5092+ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5093+ break;
5094+
5095+ case GDLM_LQSTATE_WAIT_CONVERT:
5096+
5097+ req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5098+ if (lkb->lkb_range) {
5099+ req->rr_flags |= GDLM_LKFLG_RANGE;
5100+ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5101+ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5102+ }
5103+ break;
5104+
5105+ case GDLM_LQSTATE_WAIT_CONDGRANT:
5106+
5107+ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5108+ req->rr_resdir_seq = rsb->res_resdir_seq;
5109+ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5110+ if (lkb->lkb_range) {
5111+ req->rr_flags |= GDLM_LKFLG_RANGE;
5112+ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5113+ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5114+ }
5115+ break;
5116+
5117+ case GDLM_LQSTATE_WAIT_UNLOCK:
5118+
5119+ req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5120+ break;
5121+
5122+ default:
5123+ GDLM_ASSERT(!"Unknown cluster request",);
5124+ }
5125+
5126+ add_request_lvb(lkb, req);
5127+ midcomms_send_buffer(&req->rr_header, e);
5128+
5129+ return 0;
5130+}
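The request messages carry the resource name in place, which is why every send path above sizes its buffer as sizeof(struct gd_remlockrequest) + namelen - 1 and the receive side recovers namelen as rh_length - sizeof(*freq) + 1. A sizing sketch, assuming rr_name is declared as a one-byte array at the end of struct gd_remlockrequest (the struct definition is not part of this hunk):

/*
 *	struct gd_remlockrequest {
 *		struct gd_req_header	rr_header;
 *		...
 *		char			rr_name[1];	(name copied in place)
 *	};
 *
 * One byte of the name is already accounted for by rr_name[1], so the
 * message length is sizeof(struct gd_remlockrequest) + namelen - 1, and
 * remote_stage2()/process_cluster_request() invert this with
 * namelen = rh_length - sizeof(*freq) + 1.
 */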
5131+
5132+/*
5133+ * We got a request from another cluster node; process it and return an info
5134+ * structure with the lock state/LVB etc. as required. Executes in the DLM's
5135+ * recvd thread.
5136+ */
5137+
5138+int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery)
5139+{
5140+ gd_ls_t *lspace;
5141+ gd_lkb_t *lkb = NULL;
5142+ gd_res_t *rsb;
5143+ int send_reply = 0, status = 0, namelen;
5144+ struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req;
5145+ struct gd_remlockreply reply;
5146+
5147+ lspace = find_lockspace_by_global_id(req->rh_lockspace);
5148+
5149+ if (!lspace) {
5150+ log_print("process_cluster_request invalid lockspace %x "
5151+ "from %d req %u", req->rh_lockspace, nodeid,
5152+ req->rh_cmd);
5153+ status = -EINVAL;
5154+ goto out;
5155+ }
5156+
5157+ /* wait for recoverd to drain requestqueue */
5158+ if (!recovery)
5159+ wait_requestqueue(lspace);
5160+
5161+ /*
5162+ * If we're in recovery then queue the request for later. Otherwise,
5163+ * we still need to get the "in_recovery" lock to make sure the
5164+ * recovery itself doesn't start until we are done.
5165+ */
5166+ retry:
5167+ if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5168+ if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
5169+ log_error(lspace, "process_cluster_request warning %u",
5170+ nodeid);
5171+ add_to_requestqueue(lspace, nodeid, (char *) req,
5172+ req->rh_length);
5173+ log_debug(lspace, "process_cluster_request abort");
5174+ status = -EINTR;
5175+ goto out;
5176+ }
5177+ if (!down_read_trylock(&lspace->ls_in_recovery)) {
5178+ schedule();
5179+ goto retry;
5180+ }
5181+
5182+
5183+ /*
5184+ * Process the request.
5185+ */
5186+
5187+ switch (req->rh_cmd) {
5188+
5189+ case GDLM_REMCMD_LOOKUP:
5190+ {
5191+ gd_resdata_t *rd;
5192+ int status;
5193+ uint32_t dir_nodeid;
5194+
5195+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5196+
5197+ dir_nodeid = name_to_directory_nodeid(lspace,
5198+ freq->rr_name,
5199+ namelen);
5200+ if (dir_nodeid != our_nodeid())
5201+ log_debug(lspace, "ignoring directory lookup");
5202+
5203+ status = get_resdata(lspace, nodeid, freq->rr_name,
5204+ namelen, &rd, 0);
5205+ if (status)
5206+ status = -ENOMEM;
5207+
5208+ reply.rl_status = status;
5209+ reply.rl_lockstate = 0;
5210+ reply.rl_nodeid = rd->rd_master_nodeid;
5211+ reply.rl_resdir_seq = rd->rd_sequence;
5212+ }
5213+ send_reply = 1;
5214+ break;
5215+
5216+ case GDLM_REMCMD_REM_RESDATA:
5217+
5218+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5219+ remove_resdata(lspace, nodeid, freq->rr_name, namelen,
5220+ freq->rr_resdir_seq);
5221+ break;
5222+
5223+ case GDLM_REMCMD_LOCKREQUEST:
5224+
5225+ lkb = remote_stage2(nodeid, lspace, freq);
5226+ if (lkb) {
5227+ lkb->lkb_request = freq;
5228+ dlm_lock_stage3(lkb);
5229+
5230+ /*
5231+ * If the request was granted in lock_stage3, then a
5232+ * reply message was already sent in combination with
5233+ * the grant message and lkb_request is NULL.
5234+ */
5235+
5236+ if (lkb->lkb_request) {
5237+ lkb->lkb_request = NULL;
5238+ send_reply = 1;
5239+ reply.rl_status = lkb->lkb_retstatus;
5240+ reply.rl_lockstate = lkb->lkb_status;
5241+ reply.rl_lkid = lkb->lkb_id;
5242+
5243+ /*
5244+ * If the request could not be granted and the
5245+ * user won't wait, then free up the LKB
5246+ */
5247+
5248+ if (lkb->lkb_retstatus == -EAGAIN) {
5249+ GDLM_ASSERT(lkb->lkb_lockqueue_flags &
5250+ DLM_LKF_NOQUEUE,);
4bf12011 5251+ rsb = lkb->lkb_resource;
5252+ release_lkb(lspace, lkb);
5253+ release_rsb(rsb);
5254+ lkb = NULL;
5255+ }
5256+ }
5257+ } else {
5258+ reply.rl_status = -ENOMEM;
5259+ send_reply = 1;
5260+ }
5261+ break;
5262+
5263+ case GDLM_REMCMD_CONVREQUEST:
5264+
5265+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5266+
5267+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5268+ freq->rr_remlkid,
5269+ freq->rr_header.rh_lkid, nodeid););
5270+
5271+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5272+ log_error(lspace, "convrequest: invalid status %d",
5273+ lkb->lkb_status);
5274+
5275+ lkb->lkb_rqmode = freq->rr_rqmode;
5276+ lkb->lkb_lockqueue_flags = freq->rr_flags;
5277+ lkb->lkb_request = freq;
5278+ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5279+
5280+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK
5281+ || freq->rr_flags & DLM_LKF_VALBLK) {
5282+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5283+ allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5284+ freq->rr_lvb);
5285+ }
5286+
5287+ if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5288+ if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5289+ freq->rr_range_end)) {
5290+ reply.rl_status = -ENOMEM;
5291+ send_reply = 1;
5292+ goto out;
5293+ }
5294+ }
5295+
5296+ dlm_convert_stage2(lkb, FALSE);
5297+
5298+ /*
5299+ * If the conv request was granted in stage2, then a reply
5300+ * message was already sent in combination with the grant
5301+ * message.
5302+ */
5303+
5304+ if (lkb->lkb_request) {
5305+ lkb->lkb_request = NULL;
5306+ send_reply = 1;
5307+ reply.rl_status = lkb->lkb_retstatus;
5308+ reply.rl_lockstate = lkb->lkb_status;
5309+ reply.rl_lkid = lkb->lkb_id;
5310+ }
5311+ break;
5312+
5313+ case GDLM_REMCMD_LOCKREPLY:
5314+
5315+ lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid);
5316+
5317+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5318+ freq->rr_remlkid,
5319+ freq->rr_header.rh_lkid, nodeid););
5320+
5321+ process_lockqueue_reply(lkb, (struct gd_remlockreply *) req);
5322+ break;
5323+
5324+ case GDLM_REMCMD_LOCKGRANT:
5325+
5326+ /*
5327+ * Remote lock has been granted asynchronously. Do a compact
5328+ * version of what grant_lock() does.
5329+ */
5330+
5331+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5332+
5333+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5334+ freq->rr_remlkid,
5335+ freq->rr_header.rh_lkid, nodeid););
5336+
5337+ rsb = lkb->lkb_resource;
5338+
5339+ if (lkb->lkb_lockqueue_state)
5340+ log_error(rsb->res_ls, "granting lock on lockqueue "
5341+ "id=%x from=%u lqstate=%d flags=%x",
5342+ lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state,
5343+ lkb->lkb_flags);
5344+
5345+ down_write(&rsb->res_lock);
5346+
5347+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5348+ memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
5349+
5350+ lkb->lkb_grmode = lkb->lkb_rqmode;
5351+ lkb->lkb_rqmode = DLM_LOCK_IV;
5352+
5353+ if (lkb->lkb_range) {
5354+ lkb->lkb_range[GR_RANGE_START] =
5355+ lkb->lkb_range[RQ_RANGE_START];
5356+ lkb->lkb_range[GR_RANGE_END] =
5357+ lkb->lkb_range[RQ_RANGE_END];
5358+ }
5359+
5360+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5361+ up_write(&rsb->res_lock);
5362+
5363+ if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5364+ lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5365+
5366+ lkb->lkb_retstatus = 0;
5cdbd17b 5367+ queue_ast(lkb, AST_COMP, 0);
4bf12011 5368+ break;
5369+
5370+ case GDLM_REMCMD_SENDBAST:
5371+
5372+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5373+
5374+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5375+ freq->rr_remlkid,
5376+ freq->rr_header.rh_lkid, nodeid););
5377+
5378+ if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5cdbd17b 5379+ queue_ast(lkb, AST_BAST, freq->rr_rqmode);
4bf12011 5380+ break;
5381+
5382+ case GDLM_REMCMD_SENDCAST:
5383+
5384+ /* This is only used for some error completion ASTs */
5385+
5386+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5387+
5388+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5389+ freq->rr_remlkid,
5390+ freq->rr_header.rh_lkid, nodeid););
5391+
5392+ /* Return the lock to granted status */
5393+ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5394+
5395+ lkb->lkb_retstatus = freq->rr_status;
5cdbd17b 5396+ queue_ast(lkb, AST_COMP, 0);
4bf12011 5397+ break;
5398+
5399+ case GDLM_REMCMD_UNLOCKREQUEST:
5400+
5401+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5402+
5403+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5404+ freq->rr_remlkid,
5405+ freq->rr_header.rh_lkid, nodeid););
5406+
5407+ reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags);
5408+ send_reply = 1;
5409+ break;
5410+
5411+ case GDLM_REMCMD_QUERY:
5412+ remote_query(nodeid, lspace, req);
5413+ break;
5414+
5415+ case GDLM_REMCMD_QUERYREPLY:
5416+ remote_query_reply(nodeid, lspace, req);
5417+ break;
5418+
5419+ default:
5420+ log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
5421+ }
5422+
5423+ up_read(&lspace->ls_in_recovery);
5424+
5425+ out:
5426+ if (send_reply) {
5427+ reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5428+ reply.rl_header.rh_flags = 0;
5429+ reply.rl_header.rh_length = sizeof(reply);
5430+ reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
5431+ reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
5432+
5433+ status = midcomms_send_message(nodeid, &reply.rl_header,
5434+ GFP_KERNEL);
5435+ }
5436+
5437+ wake_astd();
5438+
5439+ return status;
5440+}
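For orientation, the wire commands dispatched above that are not covered by the state table earlier are summarized below (derived from the switch above, descriptions paraphrased):

/*
 *	GDLM_REMCMD_LOCKREPLY	reply to a lookup/lock/convert/unlock request,
 *				handled by process_lockqueue_reply()
 *	GDLM_REMCMD_LOCKGRANT	asynchronous grant from the master, sent by
 *				remote_grant() above
 *	GDLM_REMCMD_SENDBAST	blocking AST forwarded to the node holding the lock
 *	GDLM_REMCMD_SENDCAST	error completion AST forwarded to the holder
 *	GDLM_REMCMD_REM_RESDATA	remove a dead resource directory entry, sent by
 *				remote_remove_resdata() above
 *	GDLM_REMCMD_QUERY / _QUERYREPLY	lock queries (remote_query / remote_query_reply)
 */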
5441+
5442+static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply)
5443+{
5444+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5445+ memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5446+}
5447+
5448+static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req)
5449+{
5450+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5451+ memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5452+}
5453diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
5454--- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 5455+++ linux-patched/cluster/dlm/lockqueue.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 5456@@ -0,0 +1,29 @@
5457+/******************************************************************************
5458+*******************************************************************************
5459+**
5460+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5461+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5462+**
5463+** This copyrighted material is made available to anyone wishing to use,
5464+** modify, copy, or redistribute it subject to the terms and conditions
5465+** of the GNU General Public License v.2.
5466+**
5467+*******************************************************************************
5468+******************************************************************************/
5469+
5470+#ifndef __LOCKQUEUE_DOT_H__
5471+#define __LOCKQUEUE_DOT_H__
5472+
5473+void remote_grant(gd_lkb_t * lkb);
5474+void reply_and_grant(gd_lkb_t * lkb);
5475+int remote_stage(gd_lkb_t * lkb, int state);
5476+int process_cluster_request(int csid, struct gd_req_header *req, int recovery);
5477+int send_cluster_request(gd_lkb_t * lkb, int state);
5478+void purge_requestqueue(gd_ls_t * ls);
5479+int process_requestqueue(gd_ls_t * ls);
5480+int reply_in_requestqueue(gd_ls_t * ls, int lkid);
5481+void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen,
5482+ uint8_t sequence);
5483+void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src);
5484+
5485+#endif /* __LOCKQUEUE_DOT_H__ */
5486diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
5487--- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 5488+++ linux-patched/cluster/dlm/lockspace.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 5489@@ -0,0 +1,706 @@
5490+/******************************************************************************
5491+*******************************************************************************
5492+**
5493+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5494+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5495+**
5496+** This copyrighted material is made available to anyone wishing to use,
5497+** modify, copy, or redistribute it subject to the terms and conditions
5498+** of the GNU General Public License v.2.
5499+**
5500+*******************************************************************************
5501+******************************************************************************/
5502+
5503+#include <linux/module.h>
5504+
5505+#include "dlm_internal.h"
5506+#include "recoverd.h"
5507+#include "ast.h"
5508+#include "lkb.h"
5509+#include "nodes.h"
5510+#include "dir.h"
5511+#include "lowcomms.h"
5512+#include "config.h"
5513+#include "memory.h"
5514+#include "lockspace.h"
5515+#include "device.h"
5516+
5517+#define GDST_NONE (0)
5518+#define GDST_RUNNING (1)
5519+
5520+static int gdlmstate;
5521+static int gdlmcount;
5522+static struct semaphore gdlmstate_lock;
5523+struct list_head lslist;
5524+spinlock_t lslist_lock;
5525+struct kcl_service_ops ls_ops;
5526+
5527+static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
5528+
5529+
5530+void dlm_lockspace_init(void)
5531+{
5532+ gdlmstate = GDST_NONE;
5533+ gdlmcount = 0;
5534+ init_MUTEX(&gdlmstate_lock);
5535+ INIT_LIST_HEAD(&lslist);
5536+ spin_lock_init(&lslist_lock);
5537+}
5538+
5539+gd_ls_t *find_lockspace_by_global_id(uint32_t id)
5540+{
5541+ gd_ls_t *ls;
5542+
5543+ spin_lock(&lslist_lock);
5544+
5545+ list_for_each_entry(ls, &lslist, ls_list) {
5546+ if (ls->ls_global_id == id)
5547+ goto out;
5548+ }
5549+ ls = NULL;
5550+ out:
5551+ spin_unlock(&lslist_lock);
5552+ return ls;
5553+}
5554+
5555+/* TODO: make this more efficient */
5556+gd_ls_t *find_lockspace_by_local_id(void *id)
5557+{
5558+ gd_ls_t *ls;
5559+
5560+ spin_lock(&lslist_lock);
5561+
5562+ list_for_each_entry(ls, &lslist, ls_list) {
5563+ if (ls->ls_local_id == (uint32_t)(long)id)
5564+ goto out;
5565+ }
5566+ ls = NULL;
5567+ out:
5568+ spin_unlock(&lslist_lock);
5569+ return ls;
5570+}
5571+
5572+gd_ls_t *find_lockspace_by_name(char *name, int namelen)
5573+{
5574+ gd_ls_t *ls;
5575+
5576+ spin_lock(&lslist_lock);
5577+
5578+ list_for_each_entry(ls, &lslist, ls_list) {
5579+ if (ls->ls_namelen == namelen &&
5580+ memcmp(ls->ls_name, name, namelen) == 0)
5581+ goto out;
5582+ }
5583+ ls = NULL;
5584+ out:
5585+ spin_unlock(&lslist_lock);
5586+ return ls;
5587+}
5588+
5589+/*
5590+ * Called from dlm_init. These are the general threads which are not
5591+ * lockspace-specific and work for all gdlm lockspaces.
5592+ */
5593+
5594+static int threads_start(void)
5595+{
5596+ int error;
5597+
5598+ /* Thread which interacts with cman for all ls's */
5599+ error = recoverd_start();
5600+ if (error) {
5601+ log_print("cannot start recovery thread %d", error);
5602+ goto fail;
5603+ }
5604+
5605+	/* Thread which processes lock requests for all ls's */
5606+ error = astd_start();
5607+ if (error) {
5608+ log_print("cannot start ast thread %d", error);
5609+ goto recoverd_fail;
5610+ }
5611+
5612+ /* Thread for sending/receiving messages for all ls's */
5613+ error = lowcomms_start();
5614+ if (error) {
5615+ log_print("cannot start lowcomms %d", error);
5616+ goto astd_fail;
5617+ }
5618+
5619+ return 0;
5620+
5621+ astd_fail:
5622+ astd_stop();
5623+
5624+ recoverd_fail:
5625+ recoverd_stop();
5626+
5627+ fail:
5628+ return error;
5629+}
5630+
5631+static void threads_stop(void)
5632+{
5633+ lowcomms_stop();
5634+ astd_stop();
5635+ recoverd_stop();
5636+}
5637+
5638+static int init_internal(void)
5639+{
5640+ int error = 0;
5641+
5642+ if (gdlmstate == GDST_RUNNING)
5643+ gdlmcount++;
5644+ else {
5645+ error = threads_start();
5646+ if (error)
5647+ goto out;
5648+
5649+ gdlmstate = GDST_RUNNING;
5650+ gdlmcount = 1;
5651+ }
5652+
5653+ out:
5654+ return error;
5655+}
5656+
5657+
5658+/*
5659+ * Called after gdlm module is loaded and before any lockspaces are created.
5660+ * Starts and initializes global threads and structures. These global entities
5661+ * are shared by and independent of all lockspaces.
5662+ *
5663+ * There should be a gdlm-specific user command that a person can run to
5664+ * call this function.  If a user hasn't run that command and something
5665+ * creates a new lockspace, this is called first.
5666+ *
5667+ * This also starts the default lockspace.
5668+ */
5669+
5670+int dlm_init(void)
5671+{
5672+ int error;
5673+
5674+ down(&gdlmstate_lock);
5675+ error = init_internal();
5676+ up(&gdlmstate_lock);
5677+
5678+ return error;
5679+}
5680+
5681+int dlm_release(void)
5682+{
5683+ int error = 0;
5684+
5685+ down(&gdlmstate_lock);
5686+
5687+ if (gdlmstate == GDST_NONE)
5688+ goto out;
5689+
5690+ if (gdlmcount)
5691+ gdlmcount--;
5692+
5693+ if (gdlmcount)
5694+ goto out;
5695+
5696+ spin_lock(&lslist_lock);
5697+ if (!list_empty(&lslist)) {
5698+ spin_unlock(&lslist_lock);
5699+ log_print("cannot stop threads, lockspaces still exist");
5700+ goto out;
5701+ }
5702+ spin_unlock(&lslist_lock);
5703+
5704+ threads_stop();
5705+ gdlmstate = GDST_NONE;
5706+
5707+ out:
5708+ up(&gdlmstate_lock);
5709+
5710+ return error;
5711+}
5712+
5713+gd_ls_t *allocate_ls(int namelen)
5714+{
5715+ gd_ls_t *ls;
5716+
5717+ /* FIXME: use appropriate malloc type */
5718+
5719+ ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL);
5720+ if (ls)
5721+ memset(ls, 0, sizeof(gd_ls_t) + namelen);
5722+
5723+ return ls;
5724+}
5725+
5726+void free_ls(gd_ls_t *ls)
5727+{
5728+ kfree(ls);
5729+}
5730+
5731+static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
5732+{
5733+ gd_ls_t *ls;
5734+ int i, error = -ENOMEM;
5735+ uint32_t local_id = 0;
5736+
5737+ if (!try_module_get(THIS_MODULE))
5738+ return -EINVAL;
5739+
5740+ if (namelen > MAX_SERVICE_NAME_LEN)
5741+ return -EINVAL;
5742+
5743+ if ((ls = find_lockspace_by_name(name, namelen))) {
5744+ *lockspace = (void *)ls->ls_local_id;
5745+ return -EEXIST;
5746+ }
5747+
5748+ /*
5749+ * Initialize ls fields
5750+ */
5751+
5752+ ls = allocate_ls(namelen);
5753+ if (!ls)
5754+ goto out;
5755+
5756+ memcpy(ls->ls_name, name, namelen);
5757+ ls->ls_namelen = namelen;
5758+
5759+ ls->ls_allocation = GFP_KERNEL;
5760+ memset(&ls->ls_flags, 0, sizeof(unsigned long));
5761+ INIT_LIST_HEAD(&ls->ls_rootres);
5762+ ls->ls_hashsize = dlm_config.reshashtbl;
5763+ ls->ls_hashmask = ls->ls_hashsize - 1;
5764+
5765+ ls->ls_reshashtbl =
5766+ kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL);
5767+ if (!ls->ls_reshashtbl)
5768+ goto out_lsfree;
5769+
5770+ for (i = 0; i < ls->ls_hashsize; i++)
5771+ INIT_LIST_HEAD(&ls->ls_reshashtbl[i]);
5772+
5773+ rwlock_init(&ls->ls_reshash_lock);
5774+
5775+ if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1)
5776+ goto out_htfree;
5777+
5778+ INIT_LIST_HEAD(&ls->ls_nodes);
5779+ ls->ls_num_nodes = 0;
5780+ INIT_LIST_HEAD(&ls->ls_nodes_gone);
5781+ INIT_LIST_HEAD(&ls->ls_recover);
5782+ spin_lock_init(&ls->ls_recover_lock);
5783+ INIT_LIST_HEAD(&ls->ls_recover_list);
5784+ ls->ls_recover_list_count = 0;
5785+ spin_lock_init(&ls->ls_recover_list_lock);
5786+ init_waitqueue_head(&ls->ls_wait_general);
5787+ INIT_LIST_HEAD(&ls->ls_requestqueue);
5788+ INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
5789+ ls->ls_last_stop = 0;
5790+ ls->ls_last_start = 0;
5791+ ls->ls_last_finish = 0;
5792+ ls->ls_rcom_msgid = 0;
5793+ init_MUTEX(&ls->ls_rcom_lock);
5794+ init_rwsem(&ls->ls_in_recovery);
5795+ init_rwsem(&ls->ls_unlock_sem);
5796+ init_rwsem(&ls->ls_rec_rsblist);
5797+ init_rwsem(&ls->ls_gap_rsblist);
5798+ down_write(&ls->ls_in_recovery);
5799+
5800+ for (i = 0; i < RESDIRHASH_SIZE; i++) {
5801+ INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist);
5802+ rwlock_init(&ls->ls_resdir_hash[i].rb_lock);
5803+ }
5804+
5805+ if (flags & DLM_LSF_NOTIMERS)
5806+ set_bit(LSFL_NOTIMERS, &ls->ls_flags);
5807+
5808+ /*
5809+ * Connect this lockspace with the cluster manager
5810+ */
5811+
5812+ error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
5813+ &ls_ops, TRUE, (void *) ls, &local_id);
5814+ if (error)
5815+ goto out_idtblfree;
5816+
5817+ ls->ls_state = LSST_INIT;
5818+ ls->ls_local_id = local_id;
5819+
5820+ spin_lock(&lslist_lock);
5821+ list_add(&ls->ls_list, &lslist);
5822+ spin_unlock(&lslist_lock);
5823+
5824+ error = kcl_join_service(local_id);
5825+ if (error) {
5826+ log_error(ls, "service manager join error %d", error);
5827+ goto out_reg;
5828+ }
5829+
5830+ /* The ls isn't actually running until it receives a start() from CMAN.
5831+ * Neither does it have a global ls id until started. */
5832+
5833+
5834+ /* Return the local ID as the lockspace handle. I've left this
5835+ cast to a void* as it allows us to replace it with pretty much
5836+ anything at a future date without breaking clients. But returning
5837+ the address of the lockspace is a bad idea as it could get
5838+	   forcibly removed, leaving the client with a dangling pointer */
5839+ *lockspace = (void *)local_id;
5840+
5841+ return 0;
5842+
5843+ out_reg:
5844+ kcl_unregister_service(ls->ls_local_id);
5845+
5846+ out_idtblfree:
5847+ free_lockidtbl(ls);
5848+
5849+ out_htfree:
5850+ kfree(ls->ls_reshashtbl);
5851+
5852+ out_lsfree:
5853+ free_ls(ls);
5854+
5855+ out:
5856+ return error;
5857+}
5858+
5859+/*
5860+ * Called by a system like GFS which wants independent lock spaces.
5861+ */
5862+
5863+int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
5864+{
5865+ int error = -ENOSYS;
5866+
5867+ down(&gdlmstate_lock);
5868+
5869+ error = init_internal();
5870+ if (error)
5871+ goto out;
5872+
5873+ error = new_lockspace(name, namelen, lockspace, flags);
5874+
5875+ out:
5876+ up(&gdlmstate_lock);
5877+
5878+ return error;
5879+}
5880+
5881+/* Return 1 if the lockspace still has active remote locks,
5882+ * 2 if the lockspace still has active local locks.
5883+ */
5884+static int lockspace_busy(gd_ls_t *ls)
5885+{
5886+ int i;
5887+ int lkb_found = 0;
5888+ gd_lkb_t *lkb;
5889+
5890+ /* NOTE: We check the lockidtbl here rather than the resource table.
5891+ * This is because there may be LKBs queued as ASTs that have been unlinked
5892+ * from their RSBs and are pending deletion once the AST has been delivered
5893+ */
5894+ read_lock(&ls->ls_lockidtbl_lock);
5895+ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5896+ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
5897+ lkb_found = 1;
5898+ list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) {
5899+ if (!lkb->lkb_nodeid) {
5900+ read_unlock(&ls->ls_lockidtbl_lock);
5901+ return 2;
5902+ }
5903+ }
5904+ }
5905+ }
5906+ read_unlock(&ls->ls_lockidtbl_lock);
5907+ return lkb_found;
5908+}
5909+
5910+/* Actually release the lockspace */
5911+static int release_lockspace(gd_ls_t *ls, int force)
5912+{
5913+ gd_lkb_t *lkb;
5914+ gd_res_t *rsb;
5915+ gd_recover_t *gr;
5916+ gd_csb_t *csb;
5917+ struct list_head *head;
5918+ int i;
5919+ int busy = lockspace_busy(ls);
5920+
5921+ /* Don't destroy a busy lockspace */
5922+ if (busy > force)
5923+ return -EBUSY;
5924+
5925+ if (force < 3) {
5926+ kcl_leave_service(ls->ls_local_id);
5927+ kcl_unregister_service(ls->ls_local_id);
5928+ }
5929+
5930+ spin_lock(&lslist_lock);
5931+ list_del(&ls->ls_list);
5932+ spin_unlock(&lslist_lock);
5933+
5934+ /*
5935+ * Free resdata structs.
5936+ */
5937+
5938+ resdir_clear(ls);
5939+
5940+ /*
5941+ * Free all lkb's on lockidtbl[] lists.
5942+ */
5943+
5944+ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5945+ head = &ls->ls_lockidtbl[i].list;
5946+ while (!list_empty(head)) {
5947+ lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list);
5948+ list_del(&lkb->lkb_idtbl_list);
5949+
5950+ if (lkb->lkb_lockqueue_state)
5951+ remove_from_lockqueue(lkb);
5952+
5cdbd17b 5953+ if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
4bf12011 5954+ list_del(&lkb->lkb_astqueue);
5955+
5956+ if (lkb->lkb_lvbptr
5957+ && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
5958+ free_lvb(lkb->lkb_lvbptr);
5959+
5960+ free_lkb(lkb);
5961+ }
5962+ }
5963+
5964+ /*
5965+ * Free lkidtbl[] itself
5966+ */
5967+
5968+ kfree(ls->ls_lockidtbl);
5969+
5970+ /*
5971+ * Free all rsb's on reshashtbl[] lists
5972+ */
5973+
5974+ for (i = 0; i < ls->ls_hashsize; i++) {
5975+ head = &ls->ls_reshashtbl[i];
5976+ while (!list_empty(head)) {
5977+ rsb = list_entry(head->next, gd_res_t, res_hashchain);
5978+ list_del(&rsb->res_hashchain);
5979+
5980+ if (rsb->res_lvbptr)
5981+ free_lvb(rsb->res_lvbptr);
5982+
5983+ free_rsb(rsb);
5984+ }
5985+ }
5986+
5987+ /*
5988+ * Free reshashtbl[] itself
5989+ */
5990+
5991+ kfree(ls->ls_reshashtbl);
5992+
5993+ /*
5994+ * Free structures on any other lists
5995+ */
5996+
5997+ head = &ls->ls_recover;
5998+ while (!list_empty(head)) {
5999+ gr = list_entry(head->next, gd_recover_t, gr_list);
6000+ list_del(&gr->gr_list);
6001+ free_dlm_recover(gr);
6002+ }
6003+
6004+ head = &ls->ls_nodes;
6005+ while (!list_empty(head)) {
6006+ csb = list_entry(head->next, gd_csb_t, csb_list);
6007+ list_del(&csb->csb_list);
6008+ release_csb(csb);
6009+ }
6010+
6011+ head = &ls->ls_nodes_gone;
6012+ while (!list_empty(head)) {
6013+ csb = list_entry(head->next, gd_csb_t, csb_list);
6014+ list_del(&csb->csb_list);
6015+ release_csb(csb);
6016+ }
6017+
6018+ free_ls(ls);
6019+
6020+ dlm_release();
6021+
6022+ module_put(THIS_MODULE);
6023+ return 0;
6024+}
6025+
6026+
6027+/*
6028+ * Called when a system has released all its locks and is not going to use the
6029+ * lockspace any longer. We blindly free everything we're managing for this
6030+ * lockspace. Remaining nodes will go through the recovery process as if we'd
6031+ * died. The lockspace must continue to function as usual, participating in
6032+ * recoveries, until kcl_leave_service returns.
6033+ *
6034+ * Force has 4 possible values:
6035+ * 0 - don't destroy lockspace if it has any LKBs
6036+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6037+ * 2 - destroy lockspace regardless of LKBs
6038+ * 3 - destroy lockspace as part of a forced shutdown
6039+ */
6040+
6041+int dlm_release_lockspace(void *lockspace, int force)
6042+{
6043+ gd_ls_t *ls;
6044+
6045+ ls = find_lockspace_by_local_id(lockspace);
6046+ if (!ls)
6047+ return -EINVAL;
6048+
6049+ return release_lockspace(ls, force);
6050+}
6051+
6052+
6053+/* Called when the cluster is being shut down dirtily */
6054+void dlm_emergency_shutdown()
6055+{
6056+ gd_ls_t *ls;
6057+ gd_ls_t *tmp;
6058+
6059+ /* Shut lowcomms down to prevent any socket activity */
6060+ lowcomms_stop_accept();
6061+
6062+	/* Delete the devices that belong to the userland
6063+ lockspaces to be deleted. */
6064+ dlm_device_free_devices();
6065+
6066+ /* Now try to clean the lockspaces */
6067+ spin_lock(&lslist_lock);
6068+
6069+ list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6070+ spin_unlock(&lslist_lock);
6071+ release_lockspace(ls, 3);
6072+ spin_lock(&lslist_lock);
6073+ }
6074+
6075+ spin_unlock(&lslist_lock);
6076+}
6077+
6078+gd_recover_t *allocate_dlm_recover(void)
6079+{
6080+ gd_recover_t *gr;
6081+
6082+ gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL);
6083+ if (gr)
6084+ memset(gr, 0, sizeof(gd_recover_t));
6085+
6086+ return gr;
6087+}
6088+
6089+void free_dlm_recover(gd_recover_t * gr)
6090+{
6091+ kfree(gr);
6092+}
6093+
6094+/*
6095+ * Called by CMAN on a specific ls.  "stop" means set a flag which, while set,
6096+ * causes all new requests to the ls to be queued and not submitted until the
6097+ * flag is cleared.  A stop on a ls also needs to cancel any prior starts on the ls.
6098+ * The recoverd thread carries out any work called for by this event.
6099+ */
6100+
6101+static int dlm_ls_stop(void *servicedata)
6102+{
6103+ gd_ls_t *ls = (gd_ls_t *) servicedata;
6104+ int new;
6105+
6106+ spin_lock(&ls->ls_recover_lock);
6107+ ls->ls_last_stop = ls->ls_last_start;
6108+ set_bit(LSFL_LS_STOP, &ls->ls_flags);
6109+ new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6110+ spin_unlock(&ls->ls_recover_lock);
6111+
6112+ /*
6113+ * This in_recovery lock does two things:
6114+ *
6115+ * 1) Keeps this function from returning until all threads are out
6116+	 *    of locking routines and locking is truly stopped.
6117+ * 2) Keeps any new requests from being processed until it's unlocked
6118+ * when recovery is complete.
6119+ */
6120+
6121+ if (new)
6122+ down_write(&ls->ls_in_recovery);
6123+
6124+ clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6125+ clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6126+ clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6127+ clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6128+
6129+ recoverd_kick(ls);
6130+
6131+ return 0;
6132+}
6133+
6134+/*
6135+ * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6136+ * request processing which first requires that the recovery procedure be
6137+ * stepped through with all nodes sharing the lockspace (nodeids). The first
6138+ * start on the ls after it's created is a special case and requires some extra
6139+ * work like figuring out our own local nodeid. We can't do all this in the
6140+ * calling CMAN context, so we must pass this work off to the recoverd thread
6141+ * which was created in gdlm_init(). The recoverd thread carries out any work
6142+ * called for by this event.
6143+ */
6144+
6145+static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6146+ int event_id, int type)
6147+{
6148+ gd_ls_t *ls = (gd_ls_t *) servicedata;
6149+ gd_recover_t *gr;
6150+ int error = -ENOMEM;
6151+
6152+ gr = allocate_dlm_recover();
6153+ if (!gr)
6154+ goto out;
6155+
6156+ gr->gr_nodeids = nodeids;
6157+ gr->gr_node_count = count;
6158+ gr->gr_event_id = event_id;
6159+
6160+ spin_lock(&ls->ls_recover_lock);
6161+ ls->ls_last_start = event_id;
6162+ list_add_tail(&gr->gr_list, &ls->ls_recover);
6163+ set_bit(LSFL_LS_START, &ls->ls_flags);
6164+ spin_unlock(&ls->ls_recover_lock);
6165+
6166+ recoverd_kick(ls);
6167+ error = 0;
6168+
6169+ out:
6170+ return error;
6171+}
6172+
6173+/*
6174+ * Called by CMAN on a specific ls. "finish" means that all nodes which
6175+ * received a "start" have completed the start and called kcl_start_done.
6176+ * The recoverd thread carries out any work called for by this event.
6177+ */
6178+
6179+static void dlm_ls_finish(void *servicedata, int event_id)
6180+{
6181+ gd_ls_t *ls = (gd_ls_t *) servicedata;
6182+
6183+ spin_lock(&ls->ls_recover_lock);
6184+ ls->ls_last_finish = event_id;
6185+ set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6186+ spin_unlock(&ls->ls_recover_lock);
6187+
6188+ recoverd_kick(ls);
6189+}
6190+
6191+struct kcl_service_ops ls_ops = {
6192+ .stop = dlm_ls_stop,
6193+ .start = dlm_ls_start,
6194+ .finish = dlm_ls_finish
6195+};
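
The force argument to dlm_release_lockspace() is graded, as described in the
comment above it: 0 refuses to destroy a lockspace that still holds LKBs, 1
tolerates remote LKBs only, 2 ignores LKBs entirely, and 3 is reserved for a
forced shutdown.  A minimal usage sketch for a kernel client of this API
follows; it is not part of the patch, and the caller name "example_fs" is
hypothetical (lockspace.h assumed to be included):

	/* sketch: hypothetical in-kernel user of the lockspace API */
	static void *example_ls;	/* opaque handle, really the local service id */

	static int example_setup(void)
	{
		int error;

		error = dlm_new_lockspace("example_fs", sizeof("example_fs") - 1,
					  &example_ls, 0);
		if (error == -EEXIST)
			error = 0;	/* lockspace already exists; handle is still filled in */
		return error;
	}

	static void example_teardown(void)
	{
		/* force=0: fails with -EBUSY if any local or remote locks remain */
		dlm_release_lockspace(example_ls, 0);
	}
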
6196diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6197--- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 6198+++ linux-patched/cluster/dlm/lockspace.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 6199@@ -0,0 +1,29 @@
6200+/******************************************************************************
6201+*******************************************************************************
6202+**
6203+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6204+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6205+**
6206+** This copyrighted material is made available to anyone wishing to use,
6207+** modify, copy, or redistribute it subject to the terms and conditions
6208+** of the GNU General Public License v.2.
6209+**
6210+*******************************************************************************
6211+******************************************************************************/
6212+
6213+#ifndef __LOCKSPACE_DOT_H__
6214+#define __LOCKSPACE_DOT_H__
6215+
6216+void dlm_lockspace_init(void);
6217+int dlm_init(void);
6218+int dlm_release(void);
6219+int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6220+int dlm_release_lockspace(void *ls, int force);
6221+gd_ls_t *find_lockspace_by_global_id(uint32_t id);
6222+gd_ls_t *find_lockspace_by_local_id(void *id);
6223+gd_ls_t *find_lockspace_by_name(char *name, int namelen);
6224+void free_dlm_recover(gd_recover_t *gr);
6225+int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out);
6226+void dlm_emergency_shutdown(void);
6227+
6228+#endif /* __LOCKSPACE_DOT_H__ */
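
Note that the handle exported by this header is the cluster manager's local id
cast to a pointer, not the address of the gd_ls_t, so a handle can still be
validated after the lockspace has been forcibly removed instead of dangling.
A sketch of the lookup a caller (or dlm_release_lockspace() itself) performs;
not part of the patch, function name hypothetical:

	/* sketch: map an opaque handle back to its lockspace, if it still exists */
	static int example_handle_valid(void *lockspace)
	{
		gd_ls_t *ls = find_lockspace_by_local_id(lockspace);

		return ls != NULL;	/* 0 means the lockspace is gone; no dangling pointer */
	}
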
6229diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6230--- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 6231+++ linux-patched/cluster/dlm/lowcomms.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 6232@@ -0,0 +1,1354 @@
6233+/******************************************************************************
6234+*******************************************************************************
6235+**
6236+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6237+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6238+**
6239+** This copyrighted material is made available to anyone wishing to use,
6240+** modify, copy, or redistribute it subject to the terms and conditions
6241+** of the GNU General Public License v.2.
6242+**
6243+*******************************************************************************
6244+******************************************************************************/
6245+
6246+/*
6247+ * lowcomms.c
6248+ *
6249+ * This is the "low-level" comms layer.
6250+ *
6251+ * It is responsible for sending/receiving messages
6252+ * from other nodes in the cluster.
6253+ *
6254+ * Cluster nodes are referred to by their nodeids. nodeids are
6255+ * simply 32 bit numbers to the locking module - if they need to
6256+ * be expanded for the cluster infrastructure then that is its
6257+ * responsibility. It is this layer's
6258+ * responsibility to resolve these into IP address or
6259+ * whatever it needs for inter-node communication.
6260+ *
6261+ * The comms level is two kernel threads that deal mainly with
6262+ * the receiving of messages from other nodes and passing them
6263+ * up to the mid-level comms layer (which understands the
6264+ * message format) for execution by the locking core, and
6265+ * a send thread which does all the setting up of connections
6266+ * to remote nodes and the sending of data. Threads are not allowed
6267+ * to send their own data because it may cause them to wait in times
6268+ * of high load. Also, this way, the sending thread can collect together
6269+ * messages bound for one node and send them in one block.
6270+ *
6271+ * I don't see any problem with the recv thread executing the locking
6272+ * code on behalf of remote processes as the locking code is
6273+ * short, efficient and never waits.
6274+ *
6275+ */
6276+
6277+
6278+#include <asm/ioctls.h>
6279+#include <net/sock.h>
6280+#include <net/tcp.h>
6281+#include <linux/pagemap.h>
6282+#include <cluster/cnxman.h>
6283+
6284+#include "dlm_internal.h"
6285+#include "lowcomms.h"
6286+#include "midcomms.h"
6287+#include "config.h"
6288+
6289+struct cbuf {
6290+ unsigned base;
6291+ unsigned len;
6292+ unsigned mask;
6293+};
6294+
6295+#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6296+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6297+#define CBUF_EMPTY(cb) ((cb)->len == 0)
6298+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6299+#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6300+ (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6301+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6302+
6303+struct connection {
6304+ struct socket *sock; /* NULL if not connected */
6305+ uint32_t nodeid; /* So we know who we are in the list */
6306+ struct rw_semaphore sock_sem; /* Stop connect races */
6307+ struct list_head read_list; /* On this list when ready for reading */
6308+ struct list_head write_list; /* On this list when ready for writing */
6309+ struct list_head state_list; /* On this list when ready to connect */
6310+ unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6311+#define CF_READ_PENDING 1
6312+#define CF_WRITE_PENDING 2
6313+#define CF_CONNECT_PENDING 3
6314+#define CF_IS_OTHERSOCK 4
6315+ struct list_head writequeue; /* List of outgoing writequeue_entries */
6316+ struct list_head listenlist; /* List of allocated listening sockets */
6317+ spinlock_t writequeue_lock;
6318+ int (*rx_action) (struct connection *); /* What to do when active */
6319+ struct page *rx_page;
6320+ struct cbuf cb;
6321+ int retries;
6322+#define MAX_CONNECT_RETRIES 3
6323+ struct connection *othersock;
6324+};
6325+#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6326+#define nodeid2con(x) (&connections[(x)])
6327+
6328+/* An entry waiting to be sent */
6329+struct writequeue_entry {
6330+ struct list_head list;
6331+ struct page *page;
6332+ int offset;
6333+ int len;
6334+ int end;
6335+ int users;
6336+ struct connection *con;
6337+};
6338+
6339+/* "Template" structure for IPv4 and IPv6 used to fill
6340+ * in the missing bits when converting between cman (which knows
6341+ * nothing about sockaddr structs) and real life where we actually
6342+ * have to connect to these addresses. Also one of these structs
6343+ * will hold the cached "us" address.
6344+ *
6345+ * It's an in6 sockaddr just so there's enough space for anything
6346+ * we're likely to see here.
6347+ */
6348+static struct sockaddr_in6 local_addr;
6349+
6350+/* Manage daemons */
6351+static struct semaphore thread_lock;
6352+static struct completion thread_completion;
6353+static atomic_t send_run;
6354+static atomic_t recv_run;
6355+
6356+/* An array of connections, indexed by NODEID */
6357+static struct connection *connections;
6358+static int conn_array_size;
6359+static atomic_t writequeue_length;
6360+static atomic_t accepting;
6361+
6362+static wait_queue_t lowcomms_send_waitq_head;
6363+static wait_queue_head_t lowcomms_send_waitq;
6364+
6365+static wait_queue_t lowcomms_recv_waitq_head;
6366+static wait_queue_head_t lowcomms_recv_waitq;
6367+
6368+/* List of sockets that have reads pending */
6369+static struct list_head read_sockets;
6370+static spinlock_t read_sockets_lock;
6371+
6372+/* List of sockets which have writes pending */
6373+static struct list_head write_sockets;
6374+static spinlock_t write_sockets_lock;
6375+
6376+/* List of sockets which have connects pending */
6377+static struct list_head state_sockets;
6378+static spinlock_t state_sockets_lock;
6379+
6380+/* List of allocated listen sockets */
6381+static struct list_head listen_sockets;
6382+
6383+static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6384+static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6385+
6386+
6387+/* Data available on socket or listen socket received a connect */
6388+static void lowcomms_data_ready(struct sock *sk, int count_unused)
6389+{
6390+ struct connection *con = sock2con(sk);
6391+
6392+ if (test_and_set_bit(CF_READ_PENDING, &con->flags))
6393+ return;
6394+
6395+ spin_lock_bh(&read_sockets_lock);
6396+ list_add_tail(&con->read_list, &read_sockets);
6397+ spin_unlock_bh(&read_sockets_lock);
6398+
6399+ wake_up_interruptible(&lowcomms_recv_waitq);
6400+}
6401+
6402+static void lowcomms_write_space(struct sock *sk)
6403+{
6404+ struct connection *con = sock2con(sk);
6405+
6406+ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
6407+ return;
6408+
6409+ spin_lock_bh(&write_sockets_lock);
6410+ list_add_tail(&con->write_list, &write_sockets);
6411+ spin_unlock_bh(&write_sockets_lock);
6412+
6413+ wake_up_interruptible(&lowcomms_send_waitq);
6414+}
6415+
6416+static inline void lowcomms_connect_sock(struct connection *con)
6417+{
6418+ if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
6419+ return;
6420+ if (!atomic_read(&accepting))
6421+ return;
6422+
6423+ spin_lock_bh(&state_sockets_lock);
6424+ list_add_tail(&con->state_list, &state_sockets);
6425+ spin_unlock_bh(&state_sockets_lock);
6426+
6427+ wake_up_interruptible(&lowcomms_send_waitq);
6428+}
6429+
6430+static void lowcomms_state_change(struct sock *sk)
6431+{
6432+/* struct connection *con = sock2con(sk); */
6433+
6434+ switch (sk->sk_state) {
6435+ case TCP_ESTABLISHED:
6436+ lowcomms_write_space(sk);
6437+ break;
6438+
6439+ case TCP_FIN_WAIT1:
6440+ case TCP_FIN_WAIT2:
6441+ case TCP_TIME_WAIT:
6442+ case TCP_CLOSE:
6443+ case TCP_CLOSE_WAIT:
6444+ case TCP_LAST_ACK:
6445+ case TCP_CLOSING:
6446+ /* FIXME: I think this causes more trouble than it solves.
6447+		   lowcomms will reconnect anyway when there is something to
6448+ send. This just attempts reconnection if a node goes down!
6449+ */
6450+ /* lowcomms_connect_sock(con); */
6451+ break;
6452+
6453+ default:
6454+ printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
6455+ break;
6456+ }
6457+}
6458+
6459+/* Make a socket active */
6460+static int add_sock(struct socket *sock, struct connection *con)
6461+{
6462+ con->sock = sock;
6463+
6464+ /* Install a data_ready callback */
6465+ con->sock->sk->sk_data_ready = lowcomms_data_ready;
6466+ con->sock->sk->sk_write_space = lowcomms_write_space;
6467+ con->sock->sk->sk_state_change = lowcomms_state_change;
6468+
6469+ return 0;
6470+}
6471+
6472+/* Add the port number to an IP6 or 4 sockaddr and return the address
6473+ length */
6474+static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
6475+ int *addr_len)
6476+{
6477+ saddr->sin6_family = local_addr.sin6_family;
6478+ if (local_addr.sin6_family == AF_INET) {
6479+ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
6480+ in4_addr->sin_port = cpu_to_be16(port);
6481+ *addr_len = sizeof(struct sockaddr_in);
6482+ }
6483+ else {
6484+ saddr->sin6_port = cpu_to_be16(port);
6485+ *addr_len = sizeof(struct sockaddr_in6);
6486+ }
6487+}
6488+
6489+/* Close a remote connection and tidy up */
6490+static void close_connection(struct connection *con)
6491+{
6492+ if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6493+ return;
6494+
6495+ down_write(&con->sock_sem);
6496+
6497+ if (con->sock) {
6498+ sock_release(con->sock);
6499+ con->sock = NULL;
6500+ if (con->othersock) {
6501+ down_write(&con->othersock->sock_sem);
6502+ sock_release(con->othersock->sock);
6503+ con->othersock->sock = NULL;
6504+ up_write(&con->othersock->sock_sem);
6505+ kfree(con->othersock);
6506+ con->othersock = NULL;
6507+ }
6508+ }
6509+ if (con->rx_page) {
6510+ __free_page(con->rx_page);
6511+ con->rx_page = NULL;
6512+ }
6513+ up_write(&con->sock_sem);
6514+}
6515+
6516+/* Data received from remote end */
6517+static int receive_from_sock(struct connection *con)
6518+{
6519+ int ret = 0;
6520+ struct msghdr msg;
6521+ struct iovec iov[2];
6522+ mm_segment_t fs;
6523+ unsigned len;
6524+ int r;
6525+ int call_again_soon = 0;
6526+
6527+ down_read(&con->sock_sem);
6528+
6529+ if (con->sock == NULL)
6530+ goto out;
6531+ if (con->rx_page == NULL) {
6532+ /*
6533+ * This doesn't need to be atomic, but I think it should
6534+ * improve performance if it is.
6535+ */
6536+ con->rx_page = alloc_page(GFP_ATOMIC);
6537+ if (con->rx_page == NULL)
6538+ goto out_resched;
6539+ CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
6540+ }
6541+ /*
6542+	 * To avoid doing too many short reads, we will reschedule for
6543+	 * another time if there are fewer than 32 bytes left in the buffer.
6544+ */
6545+ if (!CBUF_MAY_ADD(&con->cb, 32))
6546+ goto out_resched;
6547+
6548+ msg.msg_control = NULL;
6549+ msg.msg_controllen = 0;
6550+ msg.msg_iovlen = 1;
6551+ msg.msg_iov = iov;
6552+ msg.msg_name = NULL;
6553+ msg.msg_namelen = 0;
6554+ msg.msg_flags = 0;
6555+
6556+ /*
6557+ * iov[0] is the bit of the circular buffer between the current end
6558+ * point (cb.base + cb.len) and the end of the buffer.
6559+ */
6560+ iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
6561+ iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
6562+ iov[1].iov_len = 0;
6563+
6564+ /*
6565+ * iov[1] is the bit of the circular buffer between the start of the
6566+ * buffer and the start of the currently used section (cb.base)
6567+ */
6568+ if (CBUF_DATA(&con->cb) >= con->cb.base) {
6569+ iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
6570+ iov[1].iov_len = con->cb.base;
6571+ iov[1].iov_base = page_address(con->rx_page);
6572+ msg.msg_iovlen = 2;
6573+ }
6574+ len = iov[0].iov_len + iov[1].iov_len;
6575+
6576+ fs = get_fs();
6577+ set_fs(get_ds());
6578+ r = ret = sock_recvmsg(con->sock, &msg, len,
6579+ MSG_DONTWAIT | MSG_NOSIGNAL);
6580+ set_fs(fs);
6581+
6582+ if (ret <= 0)
6583+ goto out_close;
6584+ if (ret == len)
6585+ call_again_soon = 1;
6586+ CBUF_ADD(&con->cb, ret);
6587+ ret = midcomms_process_incoming_buffer(con->nodeid,
6588+ page_address(con->rx_page),
6589+ con->cb.base, con->cb.len,
6590+ PAGE_CACHE_SIZE);
6591+ if (ret == -EBADMSG) {
6592+ printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
6593+ "iov_len=%u, iov_base[0]=%p, read=%d\n",
6594+ page_address(con->rx_page), con->cb.base, con->cb.len,
6595+ len, iov[0].iov_base, r);
6596+ }
6597+ if (ret < 0)
6598+ goto out_close;
6599+ CBUF_EAT(&con->cb, ret);
6600+
6601+ if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
6602+ __free_page(con->rx_page);
6603+ con->rx_page = NULL;
6604+ }
6605+ out:
6606+ if (call_again_soon)
6607+ goto out_resched;
6608+ up_read(&con->sock_sem);
6609+ ret = 0;
6610+ goto out_ret;
6611+
6612+ out_resched:
6613+ lowcomms_data_ready(con->sock->sk, 0);
6614+ up_read(&con->sock_sem);
6615+ ret = 0;
6616+ goto out_ret;
6617+
6618+ out_close:
6619+ up_read(&con->sock_sem);
6620+ if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6621+ close_connection(con);
6622+ lowcomms_connect_sock(con);
6623+ }
6624+
6625+ out_ret:
6626+ return ret;
6627+}
6628+
6629+/* Listening socket is busy, accept a connection */
6630+static int accept_from_sock(struct connection *con)
6631+{
6632+ int result;
6633+ struct sockaddr_in6 peeraddr;
6634+ struct socket *newsock;
6635+ int len;
6636+ int nodeid;
6637+ struct connection *newcon;
6638+
6639+ memset(&peeraddr, 0, sizeof(peeraddr));
6640+ newsock = sock_alloc();
6641+ if (!newsock)
6642+ return -ENOMEM;
6643+
6644+ down_read(&con->sock_sem);
6645+
6646+ result = -ENOTCONN;
6647+ if (con->sock == NULL)
6648+ goto accept_err;
6649+
6650+ newsock->type = con->sock->type;
6651+ newsock->ops = con->sock->ops;
6652+
6653+ result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
6654+ if (result < 0)
6655+ goto accept_err;
6656+
6657+ /* Get the connected socket's peer */
6658+ if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
6659+ &len, 2)) {
6660+ result = -ECONNABORTED;
6661+ goto accept_err;
6662+ }
6663+
6664+ /* Get the new node's NODEID */
6665+ nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
6666+ if (nodeid == 0) {
6667+ printk("dlm: connect from non cluster node\n");
6668+ sock_release(newsock);
6669+ up_read(&con->sock_sem);
6670+ return -1;
6671+ }
6672+
6673+ log_print("got connection from %d", nodeid);
6674+
6675+ /* Check to see if we already have a connection to this node. This
6676+ * could happen if the two nodes initiate a connection at roughly
6677+ * the same time and the connections cross on the wire.
6678+ * TEMPORARY FIX:
6679+ * In this case we store the incoming one in "othersock"
6680+ */
6681+ newcon = nodeid2con(nodeid);
6682+ down_write(&newcon->sock_sem);
6683+ if (newcon->sock) {
6684+ struct connection *othercon;
6685+
6686+ othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
6687+ if (!othercon) {
6688+ printk("dlm: failed to allocate incoming socket\n");
6689+ sock_release(newsock);
6690+ up_write(&newcon->sock_sem);
6691+ up_read(&con->sock_sem);
6692+ goto accept_out;
6693+ }
6694+ memset(othercon, 0, sizeof(*othercon));
6695+ newcon->othersock = othercon;
6696+ othercon->nodeid = nodeid;
6697+ othercon->sock = newsock;
6698+ othercon->rx_action = receive_from_sock;
6699+ add_sock(newsock, othercon);
6700+ init_rwsem(&othercon->sock_sem);
6701+ set_bit(CF_IS_OTHERSOCK, &othercon->flags);
6702+ newsock->sk->sk_user_data = othercon;
6703+
6704+ up_write(&newcon->sock_sem);
6705+ lowcomms_data_ready(newsock->sk, 0);
6706+ up_read(&con->sock_sem);
6707+ goto accept_out;
6708+ }
6709+
6710+ newsock->sk->sk_user_data = newcon;
6711+ newcon->rx_action = receive_from_sock;
6712+ add_sock(newsock, newcon);
6713+ up_write(&newcon->sock_sem);
6714+
6715+ /*
6716+ * Add it to the active queue in case we got data
6717+	 * between processing the accept and adding the socket
6718+ * to the read_sockets list
6719+ */
6720+ lowcomms_data_ready(newsock->sk, 0);
6721+
6722+ up_read(&con->sock_sem);
6723+
6724+ accept_out:
6725+ return 0;
6726+
6727+ accept_err:
6728+ up_read(&con->sock_sem);
6729+ sock_release(newsock);
6730+
6731+ printk("dlm: error accepting connection from node: %d\n", result);
6732+ return result;
6733+}
6734+
6735+/* Connect a new socket to its peer */
6736+static int connect_to_sock(struct connection *con)
6737+{
6738+ int result = -EHOSTUNREACH;
6739+ struct sockaddr_in6 saddr;
6740+ int addr_len;
6741+ struct socket *sock;
6742+
6743+ if (con->nodeid == 0) {
6744+ log_print("attempt to connect sock 0 foiled");
6745+ return 0;
6746+ }
6747+
6748+ down_write(&con->sock_sem);
6749+ if (con->retries++ > MAX_CONNECT_RETRIES)
6750+ goto out;
6751+
6752+ // FIXME not sure this should happen, let alone like this.
6753+ if (con->sock) {
6754+ sock_release(con->sock);
6755+ con->sock = NULL;
6756+ }
6757+
6758+ /* Create a socket to communicate with */
6759+ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6760+ if (result < 0)
6761+ goto out_err;
6762+
6763+ if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
6764+ goto out_err;
6765+
6766+ sock->sk->sk_user_data = con;
6767+ con->rx_action = receive_from_sock;
6768+
6769+ make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
6770+
6771+ add_sock(sock, con);
6772+ result =
6773+ sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
6774+ O_NONBLOCK);
6775+ if (result == -EINPROGRESS)
6776+ result = 0;
6777+ if (result != 0)
6778+ goto out_err;
6779+
6780+ out:
6781+ up_write(&con->sock_sem);
6782+ /*
6783+ * Returning an error here means we've given up trying to connect to
6784+	 * a remote node, otherwise we return 0 and reschedule the connection
6785+ * attempt
6786+ */
6787+ return result;
6788+
6789+ out_err:
6790+ if (con->sock) {
6791+ sock_release(con->sock);
6792+ con->sock = NULL;
6793+ }
6794+ /*
6795+ * Some errors are fatal and this list might need adjusting. For other
6796+ * errors we try again until the max number of retries is reached.
6797+ */
6798+ if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
6799+	    result != -ENETDOWN && result != -EINVAL
6800+ && result != -EPROTONOSUPPORT) {
6801+ lowcomms_connect_sock(con);
6802+ result = 0;
6803+ }
6804+ goto out;
6805+}
6806+
6807+static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
6808+{
6809+ struct socket *sock = NULL;
6810+ mm_segment_t fs;
6811+ int result = 0;
6812+ int one = 1;
6813+ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
6814+
6815+ /* Create a socket to communicate with */
6816+ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6817+ if (result < 0) {
6818+ printk("dlm: Can't create listening comms socket\n");
6819+ goto create_out;
6820+ }
6821+
6822+ fs = get_fs();
6823+ set_fs(get_ds());
6824+ result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
6825+ set_fs(fs);
6826+ if (result < 0) {
6827+ printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
6828+ }
6829+ sock->sk->sk_user_data = con;
6830+ con->rx_action = accept_from_sock;
6831+ con->sock = sock;
6832+
6833+ /* Bind to our port */
6834+ make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
6835+ result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
6836+ if (result < 0) {
6837+ printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
6838+ sock_release(sock);
6839+ sock = NULL;
6840+ goto create_out;
6841+ }
6842+
6843+ fs = get_fs();
6844+ set_fs(get_ds());
6845+
6846+ result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
6847+ set_fs(fs);
6848+ if (result < 0) {
6849+ printk("dlm: Set keepalive failed: %d\n", result);
6850+ }
6851+
6852+ result = sock->ops->listen(sock, 5);
6853+ if (result < 0) {
6854+ printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
6855+ sock_release(sock);
6856+ sock = NULL;
6857+ goto create_out;
6858+ }
6859+
6860+ create_out:
6861+ return sock;
6862+}
6863+
6864+
6865+/* Listen on all interfaces */
6866+static int listen_for_all(void)
6867+{
6868+ int result = 0;
6869+ int nodeid;
6870+ struct socket *sock = NULL;
6871+ struct list_head *addr_list;
6872+ struct connection *con = nodeid2con(0);
6873+ struct cluster_node_addr *node_addr;
6874+ char local_addr[sizeof(struct sockaddr_in6)];
6875+
6876+ /* This will also fill in local_addr */
6877+ nodeid = lowcomms_our_nodeid();
6878+
6879+ addr_list = kcl_get_node_addresses(nodeid);
6880+ if (!addr_list) {
6881+ printk("dlm: cannot initialise comms layer\n");
6882+ result = -ENOTCONN;
6883+ goto create_out;
6884+ }
6885+
6886+ list_for_each_entry(node_addr, addr_list, list) {
6887+
6888+ if (!con) {
6889+ con = kmalloc(sizeof(struct connection), GFP_KERNEL);
6890+ if (!con) {
6891+ printk("dlm: failed to allocate listen socket\n");
6892+ goto create_out;
6893+ }
6894+ memset(con, 0, sizeof(*con));
6895+ init_rwsem(&con->sock_sem);
6896+ spin_lock_init(&con->writequeue_lock);
6897+ INIT_LIST_HEAD(&con->writequeue);
6898+ set_bit(CF_IS_OTHERSOCK, &con->flags);
6899+ }
6900+
6901+ memcpy(local_addr, node_addr->addr, node_addr->addr_len);
6902+ sock = create_listen_sock(con, local_addr,
6903+ node_addr->addr_len);
6904+ if (sock) {
6905+ add_sock(sock, con);
6906+ }
6907+ else {
6908+ kfree(con);
6909+ }
6910+
6911+ /* Keep a list of dynamically allocated listening sockets
6912+ so we can free them at shutdown */
6913+ if (test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6914+ list_add_tail(&con->listenlist, &listen_sockets);
6915+ }
6916+ con = NULL;
6917+ }
6918+
6919+ create_out:
6920+ return result;
6921+}
6922+
6923+
6924+
6925+static struct writequeue_entry *new_writequeue_entry(struct connection *con,
6926+ int allocation)
6927+{
6928+ struct writequeue_entry *entry;
6929+
6930+ entry = kmalloc(sizeof(struct writequeue_entry), allocation);
6931+ if (!entry)
6932+ return NULL;
6933+
6934+ entry->page = alloc_page(allocation);
6935+ if (!entry->page) {
6936+ kfree(entry);
6937+ return NULL;
6938+ }
6939+
6940+ entry->offset = 0;
6941+ entry->len = 0;
6942+ entry->end = 0;
6943+ entry->users = 0;
6944+ entry->con = con;
6945+
6946+ return entry;
6947+}
6948+
6949+struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
6950+ int allocation, char **ppc)
6951+{
6952+ struct connection *con = nodeid2con(nodeid);
6953+ struct writequeue_entry *e;
6954+ int offset = 0;
6955+ int users = 0;
6956+
6957+ if (!atomic_read(&accepting))
6958+ return NULL;
6959+
6960+ spin_lock(&con->writequeue_lock);
6961+ e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
6962+ if (((struct list_head *) e == &con->writequeue) ||
6963+ (PAGE_CACHE_SIZE - e->end < len)) {
6964+ e = NULL;
6965+ } else {
6966+ offset = e->end;
6967+ e->end += len;
6968+ users = e->users++;
6969+ }
6970+ spin_unlock(&con->writequeue_lock);
6971+
6972+ if (e) {
6973+ got_one:
6974+ if (users == 0)
6975+ kmap(e->page);
6976+ *ppc = page_address(e->page) + offset;
6977+ return e;
6978+ }
6979+
6980+ e = new_writequeue_entry(con, allocation);
6981+ if (e) {
6982+ spin_lock(&con->writequeue_lock);
6983+ offset = e->end;
6984+ e->end += len;
6985+ users = e->users++;
6986+ list_add_tail(&e->list, &con->writequeue);
6987+ spin_unlock(&con->writequeue_lock);
6988+ atomic_inc(&writequeue_length);
6989+ goto got_one;
6990+ }
6991+ return NULL;
6992+}
6993+
6994+void lowcomms_commit_buffer(struct writequeue_entry *e)
6995+{
6996+ struct connection *con = e->con;
6997+ int users;
6998+
6999+ if (!atomic_read(&accepting))
7000+ return;
7001+
7002+ spin_lock(&con->writequeue_lock);
7003+ users = --e->users;
7004+ if (users)
7005+ goto out;
7006+ e->len = e->end - e->offset;
7007+ kunmap(e->page);
7008+ spin_unlock(&con->writequeue_lock);
7009+
7010+ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7011+ spin_lock_bh(&write_sockets_lock);
7012+ list_add_tail(&con->write_list, &write_sockets);
7013+ spin_unlock_bh(&write_sockets_lock);
7014+
7015+ wake_up_interruptible(&lowcomms_send_waitq);
7016+ }
7017+ return;
7018+
7019+ out:
7020+ spin_unlock(&con->writequeue_lock);
7021+ return;
7022+}
7023+
7024+static void free_entry(struct writequeue_entry *e)
7025+{
7026+ __free_page(e->page);
7027+ kfree(e);
7028+ atomic_dec(&writequeue_length);
7029+}
7030+
7031+/* Send a message */
7032+static int send_to_sock(struct connection *con)
7033+{
7034+ int ret = 0;
7035+ ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7036+ const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7037+ struct writequeue_entry *e;
7038+ int len, offset;
7039+
7040+ down_read(&con->sock_sem);
7041+ if (con->sock == NULL)
7042+ goto out_connect;
7043+
7044+ sendpage = con->sock->ops->sendpage;
7045+
7046+ spin_lock(&con->writequeue_lock);
7047+ for (;;) {
7048+ e = list_entry(con->writequeue.next, struct writequeue_entry,
7049+ list);
7050+ if ((struct list_head *) e == &con->writequeue)
7051+ break;
7052+
7053+ len = e->len;
7054+ offset = e->offset;
7055+ BUG_ON(len == 0 && e->users == 0);
7056+ spin_unlock(&con->writequeue_lock);
7057+
7058+ ret = 0;
7059+ if (len) {
7060+ ret = sendpage(con->sock, e->page, offset, len,
7061+ msg_flags);
7062+ if (ret == -EAGAIN || ret == 0)
7063+ goto out;
7064+ if (ret <= 0)
7065+ goto send_error;
7066+ }
7067+
7068+ spin_lock(&con->writequeue_lock);
7069+ e->offset += ret;
7070+ e->len -= ret;
7071+
7072+ if (e->len == 0 && e->users == 0) {
7073+ list_del(&e->list);
7074+ free_entry(e);
7075+ continue;
7076+ }
7077+ }
7078+ spin_unlock(&con->writequeue_lock);
7079+ out:
7080+ up_read(&con->sock_sem);
7081+ return ret;
7082+
7083+ send_error:
7084+ up_read(&con->sock_sem);
7085+ close_connection(con);
7086+ lowcomms_connect_sock(con);
7087+ return ret;
7088+
7089+ out_connect:
7090+ up_read(&con->sock_sem);
7091+ lowcomms_connect_sock(con);
7092+ return 0;
7093+}
7094+
7095+/* Called from recoverd when it knows that a node has
7096+ left the cluster */
7097+int lowcomms_close(int nodeid)
7098+{
7099+ struct connection *con;
7100+
7101+ if (!connections)
7102+ goto out;
7103+
7104+ con = nodeid2con(nodeid);
7105+ if (con->sock) {
7106+ close_connection(con);
7107+ return 0;
7108+ }
7109+
7110+ out:
7111+ return -1;
7112+}
7113+
7114+/* API send message call, may queue the request */
7115+/* N.B. This is the old interface - use the new one for new calls */
7116+int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7117+{
7118+ struct writequeue_entry *e;
7119+ char *b;
7120+
7121+ GDLM_ASSERT(nodeid < dlm_config.max_connections,
7122+ printk("nodeid=%u\n", nodeid););
7123+
7124+ e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7125+ if (e) {
7126+ memcpy(b, buf, len);
7127+ lowcomms_commit_buffer(e);
7128+ return 0;
7129+ }
7130+ return -ENOBUFS;
7131+}
7132+
7133+/* Look for activity on active sockets */
7134+static void process_sockets(void)
7135+{
7136+ struct list_head *list;
7137+ struct list_head *temp;
7138+
7139+ spin_lock_bh(&read_sockets_lock);
7140+ list_for_each_safe(list, temp, &read_sockets) {
7141+ struct connection *con =
7142+ list_entry(list, struct connection, read_list);
7143+ list_del(&con->read_list);
7144+ clear_bit(CF_READ_PENDING, &con->flags);
7145+
7146+ spin_unlock_bh(&read_sockets_lock);
7147+
7148+ con->rx_action(con);
7149+
7150+ /* Don't starve out everyone else */
7151+ schedule();
7152+ spin_lock_bh(&read_sockets_lock);
7153+ }
7154+ spin_unlock_bh(&read_sockets_lock);
7155+}
7156+
7157+/* Try to send any messages that are pending
7158+ */
7159+static void process_output_queue(void)
7160+{
7161+ struct list_head *list;
7162+ struct list_head *temp;
7163+ int ret;
7164+
7165+ spin_lock_bh(&write_sockets_lock);
7166+ list_for_each_safe(list, temp, &write_sockets) {
7167+ struct connection *con =
7168+ list_entry(list, struct connection, write_list);
7169+ list_del(&con->write_list);
7170+ clear_bit(CF_WRITE_PENDING, &con->flags);
7171+
7172+ spin_unlock_bh(&write_sockets_lock);
7173+
7174+ ret = send_to_sock(con);
7175+ if (ret < 0) {
7176+ }
7177+ spin_lock_bh(&write_sockets_lock);
7178+ }
7179+ spin_unlock_bh(&write_sockets_lock);
7180+}
7181+
7182+static void process_state_queue(void)
7183+{
7184+ struct list_head *list;
7185+ struct list_head *temp;
7186+ int ret;
7187+
7188+ spin_lock_bh(&state_sockets_lock);
7189+ list_for_each_safe(list, temp, &state_sockets) {
7190+ struct connection *con =
7191+ list_entry(list, struct connection, state_list);
7192+ list_del(&con->state_list);
7193+ clear_bit(CF_CONNECT_PENDING, &con->flags);
7194+ spin_unlock_bh(&state_sockets_lock);
7195+
7196+ ret = connect_to_sock(con);
7197+ if (ret < 0) {
7198+ }
7199+ spin_lock_bh(&state_sockets_lock);
7200+ }
7201+ spin_unlock_bh(&state_sockets_lock);
7202+}
7203+
7204+/* Discard all entries on the write queues */
7205+static void clean_writequeues(void)
7206+{
7207+ struct list_head *list;
7208+ struct list_head *temp;
7209+ int nodeid;
7210+
7211+ for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
7212+ struct connection *con = nodeid2con(nodeid);
7213+
7214+ spin_lock(&con->writequeue_lock);
7215+ list_for_each_safe(list, temp, &con->writequeue) {
7216+ struct writequeue_entry *e =
7217+ list_entry(list, struct writequeue_entry, list);
7218+ list_del(&e->list);
7219+ free_entry(e);
7220+ }
7221+ spin_unlock(&con->writequeue_lock);
7222+ }
7223+}
7224+
7225+static int read_list_empty(void)
7226+{
7227+ int status;
7228+
7229+ spin_lock_bh(&read_sockets_lock);
7230+ status = list_empty(&read_sockets);
7231+ spin_unlock_bh(&read_sockets_lock);
7232+
7233+ return status;
7234+}
7235+
7236+/* DLM Transport comms receive daemon */
7237+static int dlm_recvd(void *data)
7238+{
7239+ daemonize("dlm_recvd");
7240+ atomic_set(&recv_run, 1);
7241+
7242+ init_waitqueue_head(&lowcomms_recv_waitq);
7243+ init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7244+ add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7245+
7246+ complete(&thread_completion);
7247+
7248+ while (atomic_read(&recv_run)) {
7249+
7250+ set_task_state(current, TASK_INTERRUPTIBLE);
7251+
7252+ if (read_list_empty())
7253+ schedule();
7254+
7255+ set_task_state(current, TASK_RUNNING);
7256+
7257+ process_sockets();
7258+ }
7259+
7260+ down(&thread_lock);
7261+ up(&thread_lock);
7262+
7263+ complete(&thread_completion);
7264+
7265+ return 0;
7266+}
7267+
7268+static int write_and_state_lists_empty(void)
7269+{
7270+ int status;
7271+
7272+ spin_lock_bh(&write_sockets_lock);
7273+ status = list_empty(&write_sockets);
7274+ spin_unlock_bh(&write_sockets_lock);
7275+
7276+ spin_lock_bh(&state_sockets_lock);
7277+ if (list_empty(&state_sockets) == 0)
7278+ status = 0;
7279+ spin_unlock_bh(&state_sockets_lock);
7280+
7281+ return status;
7282+}
7283+
7284+/* DLM Transport send daemon */
7285+static int dlm_sendd(void *data)
7286+{
7287+ daemonize("dlm_sendd");
7288+ atomic_set(&send_run, 1);
7289+
7290+ init_waitqueue_head(&lowcomms_send_waitq);
7291+ init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7292+ add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7293+
7294+ complete(&thread_completion);
7295+
7296+ while (atomic_read(&send_run)) {
7297+
7298+ set_task_state(current, TASK_INTERRUPTIBLE);
7299+
7300+ if (write_and_state_lists_empty())
7301+ schedule();
7302+
7303+ set_task_state(current, TASK_RUNNING);
7304+
7305+ process_state_queue();
7306+ process_output_queue();
7307+ }
7308+
7309+ down(&thread_lock);
7310+ up(&thread_lock);
7311+
7312+ complete(&thread_completion);
7313+
7314+ return 0;
7315+}
7316+
7317+static void daemons_stop(void)
7318+{
7319+ if (atomic_read(&recv_run)) {
7320+ down(&thread_lock);
7321+ atomic_set(&recv_run, 0);
7322+ wake_up_interruptible(&lowcomms_recv_waitq);
7323+ up(&thread_lock);
7324+ wait_for_completion(&thread_completion);
7325+ }
7326+
7327+ if (atomic_read(&send_run)) {
7328+ down(&thread_lock);
7329+ atomic_set(&send_run, 0);
7330+ wake_up_interruptible(&lowcomms_send_waitq);
7331+ up(&thread_lock);
7332+ wait_for_completion(&thread_completion);
7333+ }
7334+}
7335+
7336+static int daemons_start(void)
7337+{
7338+ int error;
7339+
7340+ error = kernel_thread(dlm_recvd, NULL, 0);
7341+ if (error < 0) {
7342+ log_print("can't start recvd thread: %d", error);
7343+ goto out;
7344+ }
7345+ wait_for_completion(&thread_completion);
7346+
7347+ error = kernel_thread(dlm_sendd, NULL, 0);
7348+ if (error < 0) {
7349+ log_print("can't start sendd thread: %d", error);
7350+ daemons_stop();
7351+ goto out;
7352+ }
7353+ wait_for_completion(&thread_completion);
7354+
7355+ error = 0;
7356+ out:
7357+ return error;
7358+}
7359+
7360+/*
7361+ * Return the largest buffer size we can cope with.
7362+ */
7363+int lowcomms_max_buffer_size(void)
7364+{
7365+ return PAGE_CACHE_SIZE;
7366+}
7367+
7368+void lowcomms_stop(void)
7369+{
7370+ int i;
7371+ struct connection *temp;
7372+ struct connection *lcon;
7373+
7374+ atomic_set(&accepting, 0);
7375+
7376+ /* Set all the activity flags to prevent any
7377+ socket activity.
7378+ */
7379+ for (i = 0; i < conn_array_size; i++) {
7380+ connections[i].flags = 0x7;
7381+ }
7382+ daemons_stop();
7383+ clean_writequeues();
7384+
7385+ for (i = 0; i < conn_array_size; i++) {
7386+ close_connection(nodeid2con(i));
7387+ }
7388+
7389+ kfree(connections);
7390+ connections = NULL;
7391+
7392+ /* Free up any dynamically allocated listening sockets */
7393+ list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
7394+ sock_release(lcon->sock);
7395+ kfree(lcon);
7396+ }
7397+
7398+ kcl_releaseref_cluster();
7399+}
7400+
7401+/* This is quite likely to sleep... */
7402+int lowcomms_start(void)
7403+{
7404+ int error = 0;
7405+ int i;
7406+
7407+ INIT_LIST_HEAD(&read_sockets);
7408+ INIT_LIST_HEAD(&write_sockets);
7409+ INIT_LIST_HEAD(&state_sockets);
7410+ INIT_LIST_HEAD(&listen_sockets);
7411+
7412+ spin_lock_init(&read_sockets_lock);
7413+ spin_lock_init(&write_sockets_lock);
7414+ spin_lock_init(&state_sockets_lock);
7415+
7416+ init_completion(&thread_completion);
7417+ init_MUTEX(&thread_lock);
7418+ atomic_set(&send_run, 0);
7419+ atomic_set(&recv_run, 0);
7420+
7421+ error = -ENOTCONN;
7422+ if (kcl_addref_cluster())
7423+ goto out;
7424+
7425+ /*
7426+ * Temporarily initialise the waitq head so that lowcomms_send_message
7427+ * doesn't crash if it gets called before the thread is fully
7428+ * initialised
7429+ */
7430+ init_waitqueue_head(&lowcomms_send_waitq);
7431+
7432+ error = -ENOMEM;
7433+
7434+ connections = kmalloc(sizeof(struct connection) *
7435+ dlm_config.max_connections, GFP_KERNEL);
7436+ if (!connections)
7437+ goto out;
7438+
7439+ memset(connections, 0,
7440+ sizeof(struct connection) * dlm_config.max_connections);
7441+ for (i = 0; i < dlm_config.max_connections; i++) {
7442+ connections[i].nodeid = i;
7443+ init_rwsem(&connections[i].sock_sem);
7444+ INIT_LIST_HEAD(&connections[i].writequeue);
7445+ spin_lock_init(&connections[i].writequeue_lock);
7446+ }
7447+ conn_array_size = dlm_config.max_connections;
7448+
7449+ /* Start listening */
7450+ error = listen_for_all();
7451+ if (error)
7452+ goto fail_free_conn;
7453+
7454+ error = daemons_start();
7455+ if (error)
7456+ goto fail_free_conn;
7457+
7458+ atomic_set(&accepting, 1);
7459+
7460+ return 0;
7461+
7462+ fail_free_conn:
7463+ kfree(connections);
7464+
7465+ out:
7466+ return error;
7467+}
7468+
7469+/* Don't accept any more outgoing work */
7470+void lowcomms_stop_accept()
7471+{
7472+ atomic_set(&accepting, 0);
7473+}
7474+
7475+/* Cluster Manager interface functions for mapping between
7476+   nodeids and IP addresses
7477+*/
7478+
7479+/* Return the IP address of a node given its NODEID */
7480+static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
7481+{
7482+ struct list_head *addrs;
7483+ struct cluster_node_addr *node_addr;
7484+ struct cluster_node_addr *current_addr = NULL;
7485+ struct sockaddr_in6 *saddr;
7486+ int interface;
7487+ int i;
7488+
7489+ addrs = kcl_get_node_addresses(nodeid);
7490+ if (!addrs)
7491+ return -1;
7492+
7493+ interface = kcl_get_current_interface();
7494+
7495+ /* Look for address number <interface> */
7496+ i=0; /* i/f numbers start at 1 */
7497+ list_for_each_entry(node_addr, addrs, list) {
7498+ if (interface == ++i) {
7499+ current_addr = node_addr;
7500+ break;
7501+ }
7502+ }
7503+
7504+ /* If that failed then just use the first one */
7505+ if (!current_addr)
7506+ current_addr = (struct cluster_node_addr *)addrs->next;
7507+
7508+ saddr = (struct sockaddr_in6 *)current_addr->addr;
7509+
7510+ /* Extract the IP address */
7511+ if (saddr->sin6_family == AF_INET) {
7512+ struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
7513+ struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
7514+ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
7515+ }
7516+ else {
7517+ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
7518+ memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
7519+ }
7520+
7521+ return 0;
7522+}
7523+
7524+/* Return the NODEID for a node given its sockaddr */
7525+static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
7526+{
7527+ struct kcl_cluster_node node;
7528+ struct sockaddr_in6 ipv6_addr;
7529+ struct sockaddr_in ipv4_addr;
7530+
7531+ if (addr->sa_family == AF_INET) {
7532+ struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
7533+ memcpy(&ipv4_addr, &local_addr, addr_len);
7534+ memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
7535+
7536+ addr = (struct sockaddr *)&ipv4_addr;
7537+ }
7538+ else {
7539+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
7540+ memcpy(&ipv6_addr, &local_addr, addr_len);
7541+ memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
7542+
7543+ addr = (struct sockaddr *)&ipv6_addr;
7544+ }
7545+
7546+ if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
7547+ return node.node_id;
7548+ else
7549+ return 0;
7550+}
7551+
7552+int lowcomms_our_nodeid(void)
7553+{
7554+ struct kcl_cluster_node node;
7555+ struct list_head *addrs;
7556+ struct cluster_node_addr *first_addr;
7557+ static int our_nodeid = 0;
7558+
7559+ if (our_nodeid)
7560+ return our_nodeid;
7561+
7562+ if (kcl_get_node_by_nodeid(0, &node) == -1)
7563+ return 0;
7564+
7565+ our_nodeid = node.node_id;
7566+
7567+ /* Fill in the "template" structure */
7568+ addrs = kcl_get_node_addresses(our_nodeid);
7569+ if (!addrs)
7570+ return 0;
7571+
7572+ first_addr = (struct cluster_node_addr *) addrs->next;
7573+ memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
7574+
7575+ return node.node_id;
7576+}
7577+/*
7578+ * Overrides for Emacs so that we follow Linus's tabbing style.
7579+ * Emacs will notice this stuff at the end of the file and automatically
7580+ * adjust the settings for this buffer only. This must remain at the end
7581+ * of the file.
7582+ * ---------------------------------------------------------------------------
7583+ * Local variables:
7584+ * c-file-style: "linux"
7585+ * End:
7586+ */
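
The CBUF_* macros near the top of lowcomms.c treat the receive page as a
power-of-two ring: cb.base is the first unconsumed byte, cb.len the number of
bytes held, and CBUF_DATA() the offset where the next read lands.
receive_from_sock() calls CBUF_ADD() after recvmsg, lets midcomms consume what
it can, then CBUF_EAT()s the consumed part.  A worked sketch of the
arithmetic; not part of the patch, sizes chosen only for illustration,
assuming a 4096-byte PAGE_CACHE_SIZE (it only compiles where those macros are
visible, i.e. inside lowcomms.c):

	static void cbuf_example(void)
	{
		struct cbuf cb;

		CBUF_INIT(&cb, 4096);	/* base=0, len=0, mask=4095 */
		CBUF_ADD(&cb, 4000);	/* 4000 bytes received; CBUF_DATA(&cb) == 4000 */
		CBUF_EAT(&cb, 3900);	/* midcomms consumed 3900; base=3900, len=100 */
		CBUF_ADD(&cb, 200);	/* len=300; CBUF_DATA(&cb) == (3900+300) & 4095 == 104,
					   so the next read wraps to the start of the page */
	}
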
7587diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
7588--- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7589+++ linux-patched/cluster/dlm/lowcomms.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 7590@@ -0,0 +1,34 @@
7591+/******************************************************************************
7592+*******************************************************************************
7593+**
7594+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7595+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7596+**
7597+** This copyrighted material is made available to anyone wishing to use,
7598+** modify, copy, or redistribute it subject to the terms and conditions
7599+** of the GNU General Public License v.2.
7600+**
7601+*******************************************************************************
7602+******************************************************************************/
7603+
7604+#ifndef __LOWCOMMS_DOT_H__
7605+#define __LOWCOMMS_DOT_H__
7606+
7607+/* The old interface */
7608+int lowcomms_send_message(int csid, char *buf, int len, int allocation);
7609+
7610+/* The new interface */
7611+struct writequeue_entry;
7612+extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7613+ int allocation, char **ppc);
7614+extern void lowcomms_commit_buffer(struct writequeue_entry *e);
7615+
7616+int lowcomms_start(void);
7617+void lowcomms_stop(void);
7618+void lowcomms_stop_accept(void);
7619+int lowcomms_close(int nodeid);
7620+int lowcomms_max_buffer_size(void);
7621+
7622+int lowcomms_our_nodeid(void);
7623+
7624+#endif /* __LOWCOMMS_DOT_H__ */
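The "new interface" declared here is a two-step send path: lowcomms_get_buffer() reserves space directly in the destination node's write queue, the caller builds the message in place, and lowcomms_commit_buffer() (or midcomms_send_buffer(), which byte-swaps first) makes it eligible for transmission; dlm_query() later in this patch uses it exactly this way. The following is a rough user-space analogue of the reserve/commit idea only; the struct and function names and the fixed 4 KiB page are illustrative, and the real implementation chains pages per node and honours allocation flags.

#include <stdio.h>
#include <string.h>

struct writequeue {
	char page[4096];
	size_t reserved;	/* bytes handed out to callers */
	size_t committed;	/* bytes the sender may transmit */
};

static char *wq_get_buffer(struct writequeue *wq, size_t len)
{
	char *p;

	if (wq->reserved + len > sizeof(wq->page))
		return NULL;	/* the real code would start a new page */
	p = wq->page + wq->reserved;
	wq->reserved += len;
	return p;
}

static void wq_commit_buffer(struct writequeue *wq)
{
	wq->committed = wq->reserved;
}

int main(void)
{
	struct writequeue wq = { .reserved = 0, .committed = 0 };
	char *msg = wq_get_buffer(&wq, 32);

	if (msg) {
		memcpy(msg, "hello", 6);	/* build the message in place */
		wq_commit_buffer(&wq);
	}
	printf("committed %zu bytes\n", wq.committed);
	return 0;
}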
7625diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
7626--- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7627+++ linux-patched/cluster/dlm/main.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 7628@@ -0,0 +1,98 @@
7629+/******************************************************************************
7630+*******************************************************************************
7631+**
7632+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7633+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7634+**
7635+** This copyrighted material is made available to anyone wishing to use,
7636+** modify, copy, or redistribute it subject to the terms and conditions
7637+** of the GNU General Public License v.2.
7638+**
7639+*******************************************************************************
7640+******************************************************************************/
7641+
7642+#define EXPORT_SYMTAB
7643+
7644+#include <linux/init.h>
7645+#include <linux/proc_fs.h>
7646+#include <linux/ctype.h>
7647+#include <linux/seq_file.h>
7648+#include <linux/module.h>
7649+#include <net/sock.h>
7650+
7651+#include <cluster/cnxman.h>
7652+
7653+#include "dlm_internal.h"
7654+#include "lockspace.h"
7655+#include "recoverd.h"
7656+#include "ast.h"
7657+#include "lkb.h"
7658+#include "nodes.h"
7659+#include "locking.h"
7660+#include "config.h"
7661+#include "memory.h"
7662+#include "recover.h"
7663+#include "lowcomms.h"
7664+
7665+int dlm_device_init(void);
7666+void dlm_device_exit(void);
7667+void dlm_proc_init(void);
7668+void dlm_proc_exit(void);
7669+
7670+
7671+/* Cluster manager callbacks; we want to know if a node dies.
7672+ N.B. this is independent of lockspace-specific event callbacks from SM */
7673+
7674+static void cman_callback(kcl_callback_reason reason, long arg)
7675+{
7676+ if (reason == DIED) {
7677+ lowcomms_close((int) arg);
7678+ }
7679+
7680+ /* This is unconditional, so do what we can to tidy up */
7681+ if (reason == LEAVING) {
7682+ dlm_emergency_shutdown();
7683+ }
7684+}
7685+
7686+int __init init_dlm(void)
7687+{
7688+ dlm_proc_init();
7689+ dlm_lockspace_init();
7690+ dlm_recoverd_init();
7691+ dlm_nodes_init();
7692+ dlm_device_init();
7693+ dlm_memory_init();
7694+ dlm_config_init();
7695+
7696+ kcl_add_callback(cman_callback);
7697+
7698+ printk("DLM %s (built %s %s) installed\n",
7699+ DLM_RELEASE_NAME, __DATE__, __TIME__);
7700+
7701+ return 0;
7702+}
7703+
7704+void __exit exit_dlm(void)
7705+{
7706+ kcl_remove_callback(cman_callback);
7707+
7708+ dlm_device_exit();
7709+ dlm_memory_exit();
7710+ dlm_config_exit();
7711+ dlm_proc_exit();
7712+}
7713+
7714+MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
7715+MODULE_AUTHOR("Red Hat, Inc.");
7716+MODULE_LICENSE("GPL");
7717+
7718+module_init(init_dlm);
7719+module_exit(exit_dlm);
7720+
7721+EXPORT_SYMBOL(dlm_init);
7722+EXPORT_SYMBOL(dlm_release);
7723+EXPORT_SYMBOL(dlm_new_lockspace);
7724+EXPORT_SYMBOL(dlm_release_lockspace);
7725+EXPORT_SYMBOL(dlm_lock);
7726+EXPORT_SYMBOL(dlm_unlock);
7727diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
7728--- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7729+++ linux-patched/cluster/dlm/memory.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 7730@@ -0,0 +1,238 @@
7731+/******************************************************************************
7732+*******************************************************************************
7733+**
7734+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7735+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7736+**
7737+** This copyrighted material is made available to anyone wishing to use,
7738+** modify, copy, or redistribute it subject to the terms and conditions
7739+** of the GNU General Public License v.2.
7740+**
7741+*******************************************************************************
7742+******************************************************************************/
7743+
7744+/* memory.c
7745+ *
7746+ * memory allocation routines
7747+ *
7748+ */
7749+
7750+#include "dlm_internal.h"
7751+#include "memory.h"
7752+#include "config.h"
7753+
7754+/* as the man says...Shouldn't this be in a header file somewhere? */
7755+#define BYTES_PER_WORD sizeof(void *)
7756+
7757+static kmem_cache_t *rsb_cache_small;
7758+static kmem_cache_t *rsb_cache_large;
7759+static kmem_cache_t *lkb_cache;
7760+static kmem_cache_t *lvb_cache;
7761+static kmem_cache_t *resdir_cache_large;
7762+static kmem_cache_t *resdir_cache_small;
7763+
7764+/* The thresholds above which we allocate large RSBs/resdatas rather than small
7765+ * ones. This must make the resultant structure end on a word boundary */
7766+#define LARGE_RSB_NAME 28
7767+#define LARGE_RES_NAME 28
7768+
7769+int dlm_memory_init()
7770+{
7771+ int ret = -ENOMEM;
7772+
7773+
7774+ rsb_cache_small =
7775+ kmem_cache_create("dlm_rsb(small)",
7776+ (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7777+ __alignof__(gd_res_t), 0, NULL, NULL);
7778+ if (!rsb_cache_small)
7779+ goto out;
7780+
7781+ rsb_cache_large =
7782+ kmem_cache_create("dlm_rsb(large)",
7783+ sizeof(gd_res_t) + DLM_RESNAME_MAXLEN,
7784+ __alignof__(gd_res_t), 0, NULL, NULL);
7785+ if (!rsb_cache_large)
7786+ goto out_free_rsbs;
7787+
7788+ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t),
7789+ __alignof__(gd_lkb_t), 0, NULL, NULL);
7790+ if (!lkb_cache)
7791+ goto out_free_rsbl;
7792+
7793+ resdir_cache_large =
7794+ kmem_cache_create("dlm_resdir(l)",
7795+ sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN,
7796+ __alignof__(gd_resdata_t), 0, NULL, NULL);
7797+ if (!resdir_cache_large)
7798+ goto out_free_lkb;
7799+
7800+ resdir_cache_small =
7801+ kmem_cache_create("dlm_resdir(s)",
7802+ (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7803+ __alignof__(gd_resdata_t), 0, NULL, NULL);
7804+ if (!resdir_cache_small)
7805+ goto out_free_resl;
7806+
7807+ /* LVB cache also holds ranges, so should be 64bit aligned */
7808+ lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
7809+ __alignof__(uint64_t), 0, NULL, NULL);
7810+ if (!lvb_cache)
7811+ goto out_free_ress;
7812+
7813+ ret = 0;
7814+ goto out;
7815+
7816+ out_free_ress:
7817+ kmem_cache_destroy(resdir_cache_small);
7818+
7819+ out_free_resl:
7820+ kmem_cache_destroy(resdir_cache_large);
7821+
7822+ out_free_lkb:
7823+ kmem_cache_destroy(lkb_cache);
7824+
7825+ out_free_rsbl:
7826+ kmem_cache_destroy(rsb_cache_large);
7827+
7828+ out_free_rsbs:
7829+ kmem_cache_destroy(rsb_cache_small);
7830+
7831+ out:
7832+ return ret;
7833+}
7834+
7835+void dlm_memory_exit()
7836+{
7837+ kmem_cache_destroy(rsb_cache_large);
7838+ kmem_cache_destroy(rsb_cache_small);
7839+ kmem_cache_destroy(lkb_cache);
7840+ kmem_cache_destroy(resdir_cache_small);
7841+ kmem_cache_destroy(resdir_cache_large);
7842+ kmem_cache_destroy(lvb_cache);
7843+}
7844+
7845+gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen)
7846+{
7847+ gd_res_t *r;
7848+
7849+ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7850+
7851+ if (namelen >= LARGE_RSB_NAME)
7852+ r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
7853+ else
7854+ r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
7855+
7856+ if (r)
7857+ memset(r, 0, sizeof(gd_res_t) + namelen);
7858+
7859+ return r;
7860+}
7861+
7862+void free_rsb(gd_res_t *r)
7863+{
7864+ int length = r->res_length;
7865+
7866+#ifdef POISON
7867+ memset(r, 0x55, sizeof(gd_res_t) + r->res_length);
7868+#endif
7869+
7870+ if (length >= LARGE_RSB_NAME)
7871+ kmem_cache_free(rsb_cache_large, r);
7872+ else
7873+ kmem_cache_free(rsb_cache_small, r);
7874+}
7875+
7876+gd_lkb_t *allocate_lkb(gd_ls_t *ls)
7877+{
7878+ gd_lkb_t *l;
7879+
7880+ l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
7881+ if (l)
7882+ memset(l, 0, sizeof(gd_lkb_t));
7883+
7884+ return l;
7885+}
7886+
7887+void free_lkb(gd_lkb_t *l)
7888+{
7889+#ifdef POISON
7890+ memset(l, 0xAA, sizeof(gd_lkb_t));
7891+#endif
7892+ kmem_cache_free(lkb_cache, l);
7893+}
7894+
7895+gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen)
7896+{
7897+ gd_resdata_t *rd;
7898+
7899+ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7900+
7901+ if (namelen >= LARGE_RES_NAME)
7902+ rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
7903+ else
7904+ rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
7905+
7906+ if (rd)
7907+ memset(rd, 0, sizeof(gd_resdata_t));
7908+
7909+ return rd;
7910+}
7911+
7912+void free_resdata(gd_resdata_t *rd)
7913+{
7914+ if (rd->rd_length >= LARGE_RES_NAME)
7915+ kmem_cache_free(resdir_cache_large, rd);
7916+ else
7917+ kmem_cache_free(resdir_cache_small, rd);
7918+}
7919+
7920+char *allocate_lvb(gd_ls_t *ls)
7921+{
7922+ char *l;
7923+
7924+ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7925+ if (l)
7926+ memset(l, 0, DLM_LVB_LEN);
7927+
7928+ return l;
7929+}
7930+
7931+void free_lvb(char *l)
7932+{
7933+ kmem_cache_free(lvb_cache, l);
7934+}
7935+
7936+/* Ranges are allocated from the LVB cache as they are the same size (4x64
7937+ * bits) */
7938+uint64_t *allocate_range(gd_ls_t * ls)
7939+{
7940+ uint64_t *l;
7941+
7942+ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7943+ if (l)
7944+ memset(l, 0, DLM_LVB_LEN);
7945+
7946+ return l;
7947+}
7948+
7949+void free_range(uint64_t *l)
7950+{
7951+ kmem_cache_free(lvb_cache, l);
7952+}
7953+
7954+gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls)
7955+{
7956+ gd_rcom_t *rc;
7957+
7958+ rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
7959+ if (rc)
7960+ memset(rc, 0, dlm_config.buffer_size);
7961+
7962+ return rc;
7963+}
7964+
7965+void free_rcom_buffer(gd_rcom_t *rc)
7966+{
7967+ kfree(rc);
7968+}
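The small caches above round their object size up to a whole number of machine words with (size + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), so an embedded short resource name still leaves the structure ending on a word boundary. A stand-alone sketch of that rounding follows; gd_res_stub is a placeholder, the real gd_res_t lives in dlm_internal.h.

#include <stdio.h>

#define BYTES_PER_WORD	sizeof(void *)
#define LARGE_RSB_NAME	28

struct gd_res_stub { long placeholder[6]; };	/* stand-in for gd_res_t */

static size_t round_to_word(size_t n)
{
	return (n + BYTES_PER_WORD - 1) & ~(BYTES_PER_WORD - 1);
}

int main(void)
{
	size_t raw = sizeof(struct gd_res_stub) + LARGE_RSB_NAME;

	printf("raw %zu -> rounded %zu (word = %zu bytes)\n",
	       raw, round_to_word(raw), BYTES_PER_WORD);
	return 0;
}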
7969diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
7970--- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7971+++ linux-patched/cluster/dlm/memory.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 7972@@ -0,0 +1,32 @@
7973+/******************************************************************************
7974+*******************************************************************************
7975+**
7976+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7977+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7978+**
7979+** This copyrighted material is made available to anyone wishing to use,
7980+** modify, copy, or redistribute it subject to the terms and conditions
7981+** of the GNU General Public License v.2.
7982+**
7983+*******************************************************************************
7984+******************************************************************************/
7985+
7986+#ifndef __MEMORY_DOT_H__
7987+#define __MEMORY_DOT_H__
7988+
7989+int dlm_memory_init(void);
7990+void dlm_memory_exit(void);
7991+gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen);
7992+void free_rsb(gd_res_t * r);
7993+gd_lkb_t *allocate_lkb(gd_ls_t * ls);
7994+void free_lkb(gd_lkb_t * l);
7995+gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen);
7996+void free_resdata(gd_resdata_t * rd);
7997+char *allocate_lvb(gd_ls_t * ls);
7998+void free_lvb(char *l);
7999+gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls);
8000+void free_rcom_buffer(gd_rcom_t * rc);
8001+uint64_t *allocate_range(gd_ls_t * ls);
8002+void free_range(uint64_t * l);
8003+
8004+#endif /* __MEMORY_DOT_H__ */
8005diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8006--- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8007+++ linux-patched/cluster/dlm/midcomms.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 8008@@ -0,0 +1,351 @@
8009+/******************************************************************************
8010+*******************************************************************************
8011+**
8012+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8013+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8014+**
8015+** This copyrighted material is made available to anyone wishing to use,
8016+** modify, copy, or redistribute it subject to the terms and conditions
8017+** of the GNU General Public License v.2.
8018+**
8019+*******************************************************************************
8020+******************************************************************************/
8021+
8022+/*
8023+ * midcomms.c
8024+ *
8025+ * This is the appallingly named "mid-level" comms layer.
8026+ *
8027+ * Its purpose is to take buffers from the "real" comms layer,
8028+ * split them up into individual messages and pass them to the
8029+ * interested part of the locking mechanism.
8030+ *
8031+ * It also takes messages from the locking layer, formats them
8032+ * into packets and sends them to the comms layer.
8033+ *
8034+ * It knows the format of the mid-level messages and nodeids used,
8035+ * but it does not know how to resolve a nodeid into an IP address
8036+ * or any of the comms channel details.
8037+ *
8038+ */
8039+
8040+#include "dlm_internal.h"
8041+#include "lowcomms.h"
8042+#include "midcomms.h"
8043+#include "lockqueue.h"
8044+#include "nodes.h"
8045+#include "reccomms.h"
8046+#include "config.h"
8047+
8048+/* Byteorder routines */
8049+
8050+static void host_to_network(void *msg)
8051+{
8052+ struct gd_req_header *head = msg;
8053+ struct gd_remlockrequest *req = msg;
8054+ struct gd_remlockreply *reply = msg;
8055+ struct gd_remquery *query = msg;
8056+ struct gd_remqueryreply *queryrep = msg;
8057+ gd_rcom_t *rc = msg;
8058+
8059+ /* Force into network byte order */
8060+
8061+ /*
8062+ * Do the common header first
8063+ */
8064+
8065+ head->rh_length = cpu_to_le16(head->rh_length);
8066+ head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8067+ /* Leave the lkid alone as it is transparent at the remote end */
8068+
8069+ /*
8070+ * Do the fields in the remlockrequest or remlockreply structs
8071+ */
8072+
8073+ switch (req->rr_header.rh_cmd) {
8074+
8075+ case GDLM_REMCMD_LOCKREQUEST:
8076+ case GDLM_REMCMD_CONVREQUEST:
8077+ req->rr_range_start = cpu_to_le64(req->rr_range_start);
8078+ req->rr_range_end = cpu_to_le64(req->rr_range_end);
8079+ /* Deliberate fall through */
8080+ case GDLM_REMCMD_UNLOCKREQUEST:
8081+ case GDLM_REMCMD_LOOKUP:
8082+ case GDLM_REMCMD_LOCKGRANT:
8083+ case GDLM_REMCMD_SENDBAST:
8084+ case GDLM_REMCMD_SENDCAST:
8085+ case GDLM_REMCMD_REM_RESDATA:
8086+ req->rr_flags = cpu_to_le32(req->rr_flags);
8087+ req->rr_status = cpu_to_le32(req->rr_status);
8088+ break;
8089+
8090+ case GDLM_REMCMD_LOCKREPLY:
8091+ reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate);
8092+ reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid);
8093+ reply->rl_status = cpu_to_le32(reply->rl_status);
8094+ break;
8095+
8096+ case GDLM_REMCMD_RECOVERMESSAGE:
8097+ case GDLM_REMCMD_RECOVERREPLY:
8098+ rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8099+ rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8100+ break;
8101+
8102+ case GDLM_REMCMD_QUERY:
8103+ query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid);
8104+ query->rq_query = cpu_to_le32(query->rq_query);
8105+ query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks);
8106+ break;
8107+
8108+ case GDLM_REMCMD_QUERYREPLY:
8109+ queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks);
8110+ queryrep->rq_status = cpu_to_le32(queryrep->rq_status);
8111+ queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount);
8112+ queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount);
8113+ queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount);
8114+ break;
8115+
8116+ default:
8117+ printk("dlm: warning, unknown REMCMD type %u\n",
8118+ req->rr_header.rh_cmd);
8119+ }
8120+}
8121+
8122+static void network_to_host(void *msg)
8123+{
8124+ struct gd_req_header *head = msg;
8125+ struct gd_remlockrequest *req = msg;
8126+ struct gd_remlockreply *reply = msg;
8127+ struct gd_remquery *query = msg;
8128+ struct gd_remqueryreply *queryrep = msg;
8129+ gd_rcom_t *rc = msg;
8130+
8131+ /* Force into host byte order */
8132+
8133+ /*
8134+ * Do the common header first
8135+ */
8136+
8137+ head->rh_length = le16_to_cpu(head->rh_length);
8138+ head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8139+ /* Leave the lkid alone as it is transparent at the remote end */
8140+
8141+ /*
8142+ * Do the fields in the remlockrequest or remlockreply structs
8143+ */
8144+
8145+ switch (req->rr_header.rh_cmd) {
8146+
8147+ case GDLM_REMCMD_LOCKREQUEST:
8148+ case GDLM_REMCMD_CONVREQUEST:
8149+ req->rr_range_start = le64_to_cpu(req->rr_range_start);
8150+ req->rr_range_end = le64_to_cpu(req->rr_range_end);
8151+ case GDLM_REMCMD_LOOKUP:
8152+ case GDLM_REMCMD_UNLOCKREQUEST:
8153+ case GDLM_REMCMD_LOCKGRANT:
8154+ case GDLM_REMCMD_SENDBAST:
8155+ case GDLM_REMCMD_SENDCAST:
8156+ case GDLM_REMCMD_REM_RESDATA:
8157+ /* Actually, not much to do here as the remote lock IDs are
8158+ * transparent too */
8159+ req->rr_flags = le32_to_cpu(req->rr_flags);
8160+ req->rr_status = le32_to_cpu(req->rr_status);
8161+ break;
8162+
8163+ case GDLM_REMCMD_LOCKREPLY:
8164+ reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate);
8165+ reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid);
8166+ reply->rl_status = le32_to_cpu(reply->rl_status);
8167+ break;
8168+
8169+ case GDLM_REMCMD_RECOVERMESSAGE:
8170+ case GDLM_REMCMD_RECOVERREPLY:
8171+ rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8172+ rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8173+ break;
8174+
8175+
8176+ case GDLM_REMCMD_QUERY:
8177+ query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid);
8178+ query->rq_query = le32_to_cpu(query->rq_query);
8179+ query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks);
8180+ break;
8181+
8182+ case GDLM_REMCMD_QUERYREPLY:
8183+ queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks);
8184+ queryrep->rq_status = le32_to_cpu(queryrep->rq_status);
8185+ queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount);
8186+ queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount);
8187+ queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount);
8188+ break;
8189+
8190+ default:
8191+ printk("dlm: warning, unknown REMCMD type %u\n",
8192+ req->rr_header.rh_cmd);
8193+ }
8194+}
8195+
8196+static void copy_from_cb(void *dst, const void *base, unsigned offset,
8197+ unsigned len, unsigned limit)
8198+{
8199+ unsigned copy = len;
8200+
8201+ if ((copy + offset) > limit)
8202+ copy = limit - offset;
8203+ memcpy(dst, base + offset, copy);
8204+ len -= copy;
8205+ if (len)
8206+ memcpy(dst + copy, base, len);
8207+}
8208+
8209+static void khexdump(const unsigned char *c, int len)
8210+{
8211+ while (len > 16) {
8212+ printk(KERN_INFO
8213+ "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8214+ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8215+ c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8216+ len -= 16;
8217+ }
8218+ while (len > 4) {
8219+ printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8220+ c[3]);
8221+ len -= 4;
8222+ }
8223+ while (len > 0) {
8224+ printk(KERN_INFO "%02x\n", c[0]);
8225+ len--;
8226+ }
8227+}
8228+
8229+/*
8230+ * Called from the low-level comms layer to process a buffer of
8231+ * commands.
8232+ *
8233+ * Only complete messages are processed here; any "spare" bytes from
8234+ * the end of a buffer are saved and tacked onto the front of the next
8235+ * message that comes in. I doubt this will happen very often but we
8236+ * need to be able to cope with it and I don't want the task to be waiting
8237+ * for packets to come in when there is useful work to be done.
8238+ *
8239+ */
8240+int midcomms_process_incoming_buffer(int nodeid, const void *base,
8241+ unsigned offset, unsigned len,
8242+ unsigned limit)
8243+{
8244+ unsigned char __tmp[sizeof(struct gd_req_header) + 64];
8245+ struct gd_req_header *msg = (struct gd_req_header *) __tmp;
8246+ int ret = 0;
8247+ int err = 0;
8248+ unsigned msglen;
8249+ __u32 id, space;
8250+
8251+ while (len > sizeof(struct gd_req_header)) {
8252+ /* Get message header and check it over */
8253+ copy_from_cb(msg, base, offset, sizeof(struct gd_req_header),
8254+ limit);
8255+ msglen = le16_to_cpu(msg->rh_length);
8256+ id = msg->rh_lkid;
8257+ space = msg->rh_lockspace;
8258+
8259+ /* Check message size */
8260+ err = -EINVAL;
8261+ if (msglen < sizeof(struct gd_req_header))
8262+ break;
8263+ err = -E2BIG;
8264+ if (msglen > dlm_config.buffer_size) {
8265+ printk("dlm: message size too big %d\n", msglen);
8266+ break;
8267+ }
8268+ err = 0;
8269+
8270+ /* Not enough in buffer yet? wait for some more */
8271+ if (msglen > len)
8272+ break;
8273+
8274+ /* Make sure our temp buffer is large enough */
8275+ if (msglen > sizeof(__tmp) &&
8276+ msg == (struct gd_req_header *) __tmp) {
8277+ msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8278+ if (msg == NULL)
8279+ return ret;
8280+ }
8281+
8282+ copy_from_cb(msg, base, offset, msglen, limit);
8283+ BUG_ON(id != msg->rh_lkid);
8284+ BUG_ON(space != msg->rh_lockspace);
8285+ ret += msglen;
8286+ offset += msglen;
8287+ offset &= (limit - 1);
8288+ len -= msglen;
8289+ network_to_host(msg);
8290+
8291+ if ((msg->rh_cmd > 32) ||
8292+ (msg->rh_cmd == 0) ||
8293+ (msg->rh_length < sizeof(struct gd_req_header)) ||
8294+ (msg->rh_length > dlm_config.buffer_size)) {
8295+
8296+ printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8297+ "lkid=%u, lockspace=%u\n",
8298+ msg->rh_cmd, msg->rh_flags, msg->rh_length,
8299+ msg->rh_lkid, msg->rh_lockspace);
8300+
8301+ printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8302+ "ret=%u, limit=%08x newbuf=%d\n",
8303+ base, offset, len, ret, limit,
8304+ ((struct gd_req_header *) __tmp == msg));
8305+
8306+ khexdump((const unsigned char *) msg, msg->rh_length);
8307+
8308+ return -EBADMSG;
8309+ }
8310+
8311+ switch (msg->rh_cmd) {
8312+ case GDLM_REMCMD_RECOVERMESSAGE:
8313+ case GDLM_REMCMD_RECOVERREPLY:
8314+ process_recovery_comm(nodeid, msg);
8315+ break;
8316+ default:
8317+ process_cluster_request(nodeid, msg, FALSE);
8318+ }
8319+ }
8320+
8321+ if (msg != (struct gd_req_header *) __tmp)
8322+ kfree(msg);
8323+
8324+ return err ? err : ret;
8325+}
8326+
8327+/*
8328+ * Send a lowcomms buffer
8329+ */
8330+
8331+void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e)
8332+{
8333+ host_to_network(msg);
8334+ lowcomms_commit_buffer(e);
8335+}
8336+
8337+/*
8338+ * Make the message into network byte order and send it
8339+ */
8340+
8341+int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg,
8342+ int allocation)
8343+{
8344+ int len = msg->rh_length;
8345+
8346+ host_to_network(msg);
8347+
8348+ /*
8349+ * Loopback. In fact, the locking code pretty much prevents this from
8350+ * being needed but it can happen when the directory node is also the
8351+ * local node.
8352+ */
8353+
8354+ if (nodeid == our_nodeid())
8355+ return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
8356+ len, len);
8357+
8358+ return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
8359+}
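copy_from_cb() above reassembles a message from a circular receive buffer: limit is the size of the ring, so a message that straddles the end is copied in two pieces, first the tail of the ring and then the bytes that wrapped to the start. The same routine can be exercised stand-alone; the sketch below is user-space only and the 8-byte ring is purely illustrative.

#include <stdio.h>
#include <string.h>

static void copy_from_cb(void *dst, const void *base, unsigned offset,
			 unsigned len, unsigned limit)
{
	unsigned copy = len;

	if ((copy + offset) > limit)
		copy = limit - offset;
	memcpy(dst, (const char *) base + offset, copy);
	len -= copy;
	if (len)
		memcpy((char *) dst + copy, base, len);
}

int main(void)
{
	/* 8-byte ring; the message "ABCDEFGH" wrapped around the end */
	char ring[8] = { 'E', 'F', 'G', 'H', 'A', 'B', 'C', 'D' };
	char msg[9] = { 0 };

	copy_from_cb(msg, ring, 4, 8, sizeof(ring));
	printf("%s\n", msg);	/* prints ABCDEFGH */
	return 0;
}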
8360diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
8361--- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8362+++ linux-patched/cluster/dlm/midcomms.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 8363@@ -0,0 +1,24 @@
8364+/******************************************************************************
8365+*******************************************************************************
8366+**
8367+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8368+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8369+**
8370+** This copyrighted material is made available to anyone wishing to use,
8371+** modify, copy, or redistribute it subject to the terms and conditions
8372+** of the GNU General Public License v.2.
8373+**
8374+*******************************************************************************
8375+******************************************************************************/
8376+
8377+#ifndef __MIDCOMMS_DOT_H__
8378+#define __MIDCOMMS_DOT_H__
8379+
8380+int midcomms_send_message(uint32_t csid, struct gd_req_header *msg,
8381+ int allocation);
8382+int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
8383+ unsigned len, unsigned limit);
8384+void midcomms_send_buffer(struct gd_req_header *msg,
8385+ struct writequeue_entry *e);
8386+
8387+#endif /* __MIDCOMMS_DOT_H__ */
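One detail worth noting about the midcomms conversion routines: despite the host_to_network()/network_to_host() names, the on-wire format is little-endian (cpu_to_le*/le*_to_cpu), not traditional network byte order. A user-space sketch of the same convention for the two common header fields is shown below; the wire_header struct is an illustrative stand-in for gd_req_header, and glibc's <endian.h> helpers are assumed.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct wire_header {
	uint16_t rh_length;
	uint32_t rh_lockspace;
};

static void header_to_wire(struct wire_header *h)
{
	h->rh_length = htole16(h->rh_length);
	h->rh_lockspace = htole32(h->rh_lockspace);
}

static void header_from_wire(struct wire_header *h)
{
	h->rh_length = le16toh(h->rh_length);
	h->rh_lockspace = le32toh(h->rh_lockspace);
}

int main(void)
{
	struct wire_header h = { .rh_length = 24, .rh_lockspace = 0x1234 };

	header_to_wire(&h);	/* as it would be written to the socket */
	header_from_wire(&h);	/* as the receiver undoes it */
	printf("length=%u lockspace=0x%x\n", h.rh_length, h.rh_lockspace);
	return 0;
}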
8388diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
8389--- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8390+++ linux-patched/cluster/dlm/nodes.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 8391@@ -0,0 +1,325 @@
8392+/******************************************************************************
8393+*******************************************************************************
8394+**
8395+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8396+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8397+**
8398+** This copyrighted material is made available to anyone wishing to use,
8399+** modify, copy, or redistribute it subject to the terms and conditions
8400+** of the GNU General Public License v.2.
8401+**
8402+*******************************************************************************
8403+******************************************************************************/
8404+
8405+#include <net/sock.h>
8406+#include <cluster/cnxman.h>
8407+
8408+#include "dlm_internal.h"
8409+#include "lowcomms.h"
8410+#include "nodes.h"
8411+#include "recover.h"
8412+#include "reccomms.h"
8413+#include "util.h"
8414+
8415+static struct list_head cluster_nodes;
8416+static spinlock_t node_lock;
8417+static uint32_t local_nodeid;
8418+static struct semaphore local_init_lock;
8419+
8420+
8421+void dlm_nodes_init(void)
8422+{
8423+ INIT_LIST_HEAD(&cluster_nodes);
8424+ spin_lock_init(&node_lock);
8425+ local_nodeid = 0;
8426+ init_MUTEX(&local_init_lock);
8427+}
8428+
8429+static gd_node_t *search_node(uint32_t nodeid)
8430+{
8431+ gd_node_t *node;
8432+
8433+ list_for_each_entry(node, &cluster_nodes, gn_list) {
8434+ if (node->gn_nodeid == nodeid)
8435+ goto out;
8436+ }
8437+ node = NULL;
8438+ out:
8439+ return node;
8440+}
8441+
8442+static void put_node(gd_node_t *node)
8443+{
8444+ spin_lock(&node_lock);
8445+ node->gn_refcount--;
8446+ if (node->gn_refcount == 0) {
8447+ list_del(&node->gn_list);
8448+ spin_unlock(&node_lock);
8449+ kfree(node);
8450+ return;
8451+ }
8452+ spin_unlock(&node_lock);
8453+}
8454+
8455+static int get_node(uint32_t nodeid, gd_node_t **ndp)
8456+{
8457+ gd_node_t *node, *node2;
8458+ int error = -ENOMEM;
8459+
8460+ spin_lock(&node_lock);
8461+ node = search_node(nodeid);
8462+ if (node)
8463+ node->gn_refcount++;
8464+ spin_unlock(&node_lock);
8465+
8466+ if (node)
8467+ goto out;
8468+
8469+ node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL);
8470+ if (!node)
8471+ goto fail;
8472+
8473+ memset(node, 0, sizeof(gd_node_t));
8474+ node->gn_nodeid = nodeid;
8475+
8476+ spin_lock(&node_lock);
8477+ node2 = search_node(nodeid);
8478+ if (node2) {
8479+ node2->gn_refcount++;
8480+ spin_unlock(&node_lock);
8481+ kfree(node);
8482+ node = node2;
8483+ goto out;
8484+ }
8485+
8486+ node->gn_refcount = 1;
8487+ list_add_tail(&node->gn_list, &cluster_nodes);
8488+ spin_unlock(&node_lock);
8489+
8490+ out:
8491+ *ndp = node;
8492+ return 0;
8493+
8494+ fail:
8495+ return error;
8496+}
8497+
8498+int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb)
8499+{
8500+ gd_csb_t *csb;
8501+ gd_node_t *node;
8502+ int error = -ENOMEM;
8503+
8504+ csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL);
8505+ if (!csb)
8506+ goto fail;
8507+
8508+ memset(csb, 0, sizeof(gd_csb_t));
8509+
8510+ error = get_node(nodeid, &node);
8511+ if (error)
8512+ goto fail_free;
8513+
8514+ csb->csb_node = node;
8515+
8516+ down(&local_init_lock);
8517+
8518+ if (!local_nodeid) {
8519+ if (nodeid == our_nodeid()) {
8520+ local_nodeid = node->gn_nodeid;
8521+ }
8522+ }
8523+ up(&local_init_lock);
8524+
8525+ *ret_csb = csb;
8526+ return 0;
8527+
8528+ fail_free:
8529+ kfree(csb);
8530+ fail:
8531+ return error;
8532+}
8533+
8534+void release_csb(gd_csb_t *csb)
8535+{
8536+ put_node(csb->csb_node);
8537+ kfree(csb);
8538+}
8539+
8540+uint32_t our_nodeid(void)
8541+{
8542+ return lowcomms_our_nodeid();
8543+}
8544+
8545+int nodes_reconfig_wait(gd_ls_t *ls)
8546+{
8547+ int error;
8548+
8549+ if (ls->ls_low_nodeid == our_nodeid()) {
8550+ error = gdlm_wait_status_all(ls, NODES_VALID);
8551+ if (!error)
8552+ set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
8553+
8554+ /* Experimental: this delay should allow any final messages
8555+ * from the previous node to be received before beginning
8556+ * recovery. */
8557+
8558+ if (ls->ls_num_nodes == 1) {
8559+ current->state = TASK_UNINTERRUPTIBLE;
8560+ schedule_timeout((2) * HZ);
8561+ }
8562+
8563+ } else
8564+ error = gdlm_wait_status_low(ls, NODES_ALL_VALID);
8565+
8566+ return error;
8567+}
8568+
8569+static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new)
8570+{
8571+ gd_csb_t *csb = NULL;
8572+ struct list_head *tmp;
8573+ struct list_head *newlist = &new->csb_list;
8574+ struct list_head *head = &ls->ls_nodes;
8575+
8576+ list_for_each(tmp, head) {
8577+ csb = list_entry(tmp, gd_csb_t, csb_list);
8578+
8579+ if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid)
8580+ break;
8581+ }
8582+
8583+ if (!csb)
8584+ list_add_tail(newlist, head);
8585+ else {
8586+ /* FIXME: can use list macro here */
8587+ newlist->prev = tmp->prev;
8588+ newlist->next = tmp;
8589+ tmp->prev->next = newlist;
8590+ tmp->prev = newlist;
8591+ }
8592+}
8593+
8594+int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out)
8595+{
8596+ gd_csb_t *csb, *safe;
8597+ int error, i, found, pos = 0, neg = 0;
8598+ uint32_t low = (uint32_t) (-1);
8599+
8600+ /*
8601+ * Remove (and save) departed nodes from lockspace's nodes list
8602+ */
8603+
8604+ list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) {
8605+ found = FALSE;
8606+ for (i = 0; i < gr->gr_node_count; i++) {
8607+ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8608+ found = TRUE;
8609+ break;
8610+ }
8611+ }
8612+
8613+ if (!found) {
8614+ neg++;
8615+ csb->csb_gone_event = gr->gr_event_id;
8616+ list_del(&csb->csb_list);
8617+ list_add_tail(&csb->csb_list, &ls->ls_nodes_gone);
8618+ ls->ls_num_nodes--;
8619+ log_all(ls, "remove node %u", csb->csb_node->gn_nodeid);
8620+ }
8621+ }
8622+
8623+ /*
8624+ * Add new nodes to lockspace's nodes list
8625+ */
8626+
8627+ for (i = 0; i < gr->gr_node_count; i++) {
8628+ found = FALSE;
8629+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8630+ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8631+ found = TRUE;
8632+ break;
8633+ }
8634+ }
8635+
8636+ if (!found) {
8637+ pos++;
8638+
8639+ error = init_new_csb(gr->gr_nodeids[i], &csb);
8640+ GDLM_ASSERT(!error,);
8641+
8642+ add_ordered_node(ls, csb);
8643+ ls->ls_num_nodes++;
8644+ log_all(ls, "add node %u", csb->csb_node->gn_nodeid);
8645+ }
8646+ }
8647+
8648+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8649+ if (csb->csb_node->gn_nodeid < low)
8650+ low = csb->csb_node->gn_nodeid;
8651+ }
8652+
8653+ rcom_log_clear(ls);
8654+ ls->ls_low_nodeid = low;
8655+ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8656+ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8657+ *neg_out = neg;
8658+
8659+ error = nodes_reconfig_wait(ls);
8660+
8661+ log_all(ls, "total nodes %d", ls->ls_num_nodes);
8662+
8663+ return error;
8664+}
8665+
8666+int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr)
8667+{
8668+ gd_csb_t *csb;
8669+ int i, error;
8670+ uint32_t low = (uint32_t) (-1);
8671+
8672+ log_all(ls, "add nodes");
8673+
8674+ for (i = 0; i < gr->gr_node_count; i++) {
8675+ error = init_new_csb(gr->gr_nodeids[i], &csb);
8676+ if (error)
8677+ goto fail;
8678+
8679+ add_ordered_node(ls, csb);
8680+ ls->ls_num_nodes++;
8681+
8682+ if (csb->csb_node->gn_nodeid < low)
8683+ low = csb->csb_node->gn_nodeid;
8684+ }
8685+
8686+ ls->ls_low_nodeid = low;
8687+ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8688+ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8689+
8690+ error = nodes_reconfig_wait(ls);
8691+
8692+ log_all(ls, "total nodes %d", ls->ls_num_nodes);
8693+
8694+ return error;
8695+
8696+ fail:
8697+ while (!list_empty(&ls->ls_nodes)) {
8698+ csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list);
8699+ list_del(&csb->csb_list);
8700+ release_csb(csb);
8701+ }
8702+ ls->ls_num_nodes = 0;
8703+
8704+ return error;
8705+}
8706+
8707+int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid)
8708+{
8709+ gd_csb_t *csb;
8710+
8711+ list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) {
8712+ if (csb->csb_node->gn_nodeid == nodeid)
8713+ return TRUE;
8714+ }
8715+ return FALSE;
8716+}
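get_node() above follows a lookup/allocate/recheck pattern: search under the spinlock, drop the lock to allocate (kmalloc with GFP_KERNEL may sleep), then search again and throw the new node away if another caller inserted one in the meantime. Here is a user-space sketch of the same pattern with a pthread mutex standing in for node_lock; the types and names are illustrative, not from the patch.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	unsigned int nodeid;
	int refcount;
};

static struct node *nodes;
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *search_node(unsigned int nodeid)
{
	struct node *n;

	for (n = nodes; n; n = n->next)
		if (n->nodeid == nodeid)
			return n;
	return NULL;
}

static struct node *get_node(unsigned int nodeid)
{
	struct node *n, *n2;

	pthread_mutex_lock(&node_lock);
	n = search_node(nodeid);
	if (n)
		n->refcount++;
	pthread_mutex_unlock(&node_lock);
	if (n)
		return n;

	/* Allocate outside the lock, then recheck for a racing insert */
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;
	n->nodeid = nodeid;

	pthread_mutex_lock(&node_lock);
	n2 = search_node(nodeid);
	if (n2) {
		n2->refcount++;
		pthread_mutex_unlock(&node_lock);
		free(n);	/* lost the race; use the existing node */
		return n2;
	}
	n->refcount = 1;
	n->next = nodes;
	nodes = n;
	pthread_mutex_unlock(&node_lock);
	return n;
}

int main(void)
{
	struct node *a = get_node(3);
	struct node *b = get_node(3);	/* same node, refcount bumped */

	printf("same=%d refcount=%d\n", a == b, a->refcount);
	return 0;
}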
8717diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
8718--- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8719+++ linux-patched/cluster/dlm/nodes.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 8720@@ -0,0 +1,25 @@
8721+/******************************************************************************
8722+*******************************************************************************
8723+**
8724+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8725+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8726+**
8727+** This copyrighted material is made available to anyone wishing to use,
8728+** modify, copy, or redistribute it subject to the terms and conditions
8729+** of the GNU General Public License v.2.
8730+**
8731+*******************************************************************************
8732+******************************************************************************/
8733+
8734+#ifndef __NODES_DOT_H__
8735+#define __NODES_DOT_H__
8736+
8737+void dlm_nodes_init(void);
8738+int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb);
8739+void release_csb(gd_csb_t * csb);
8740+uint32_t our_nodeid(void);
8741+int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg);
8742+int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr);
8743+int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid);
8744+
8745+#endif /* __NODES_DOT_H__ */
8746diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
8747--- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8748+++ linux-patched/cluster/dlm/proc.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 8749@@ -0,0 +1,469 @@
8750+/******************************************************************************
8751+*******************************************************************************
8752+**
8753+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8754+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8755+**
8756+** This copyrighted material is made available to anyone wishing to use,
8757+** modify, copy, or redistribute it subject to the terms and conditions
8758+** of the GNU General Public License v.2.
8759+**
8760+*******************************************************************************
8761+******************************************************************************/
8762+
8763+#include <linux/init.h>
8764+#include <linux/proc_fs.h>
8765+#include <linux/ctype.h>
8766+#include <linux/seq_file.h>
8767+#include <linux/module.h>
8768+
8769+#include "dlm_internal.h"
8770+#include "lockspace.h"
8771+
8772+#if defined(DLM_DEBUG)
8773+#define DLM_DEBUG_SIZE (1024)
8774+#define MAX_DEBUG_MSG_LEN (64)
8775+#else
8776+#define DLM_DEBUG_SIZE (0)
8777+#define MAX_DEBUG_MSG_LEN (0)
8778+#endif
8779+
8780+static char * debug_buf;
8781+static unsigned int debug_size;
8782+static unsigned int debug_point;
8783+static int debug_wrap;
8784+static spinlock_t debug_lock;
8785+static struct proc_dir_entry * debug_proc_entry = NULL;
8786+static struct proc_dir_entry * rcom_proc_entry = NULL;
8787+static char proc_ls_name[255] = "";
8788+
8789+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
8790+static struct proc_dir_entry * locks_proc_entry = NULL;
8791+static struct seq_operations locks_info_op;
8792+
8793+
8794+static int locks_open(struct inode *inode, struct file *file)
8795+{
8796+ return seq_open(file, &locks_info_op);
8797+}
8798+
8799+/* Write simply sets the lockspace to use */
8800+static ssize_t locks_write(struct file *file, const char *buf,
8801+ size_t count, loff_t * ppos)
8802+{
8803+ if (count < sizeof(proc_ls_name)) {
8804+ copy_from_user(proc_ls_name, buf, count);
8805+ proc_ls_name[count] = '\0';
8806+
8807+ /* Remove any trailing LF so that lazy users
8808+ can just echo "lsname" > /proc/cluster/dlm_locks */
8809+ if (proc_ls_name[count - 1] == '\n')
8810+ proc_ls_name[count - 1] = '\0';
8811+
8812+ return count;
8813+ }
8814+ return 0;
8815+}
8816+
8817+static struct file_operations locks_fops = {
8818+ open:locks_open,
8819+ write:locks_write,
8820+ read:seq_read,
8821+ llseek:seq_lseek,
8822+ release:seq_release,
8823+};
8824+
8825+struct ls_dumpinfo {
8826+ int entry;
8827+ struct list_head *next;
8828+ gd_ls_t *ls;
8829+ gd_res_t *rsb;
8830+};
8831+
8832+static int print_resource(gd_res_t * res, struct seq_file *s);
8833+
8834+static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
8835+{
8836+ read_lock(&di->ls->ls_reshash_lock);
8837+ if (!di->next) {
8838+ /* Find the next non-empty hash bucket */
8839+ while (di->entry < di->ls->ls_hashsize &&
8840+ list_empty(&di->ls->ls_reshashtbl[di->entry])) {
8841+ di->entry++;
8842+ }
8843+ if (di->entry >= di->ls->ls_hashsize) {
8844+ read_unlock(&di->ls->ls_reshash_lock);
8845+ return NULL; /* End of hash list */
8846+ }
8847+
8848+ di->next = di->ls->ls_reshashtbl[di->entry].next;
8849+ } else { /* Find the next entry in the list */
8850+
8851+ di->next = di->next->next;
8852+ if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) {
8853+ /* End of list - move to next bucket */
8854+ di->next = NULL;
8855+ di->entry++;
8856+ read_unlock(&di->ls->ls_reshash_lock);
8857+
8858+ return next_rsb(di); /* do the top half of this conditional */
8859+ }
8860+ }
8861+ di->rsb = list_entry(di->next, gd_res_t, res_hashchain);
8862+ read_unlock(&di->ls->ls_reshash_lock);
8863+
8864+ return di;
8865+}
8866+
8867+static void *s_start(struct seq_file *m, loff_t * pos)
8868+{
8869+ struct ls_dumpinfo *di;
8870+ gd_ls_t *ls;
8871+ int i;
8872+
8873+ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
8874+ if (!ls)
8875+ return NULL;
8876+
8877+ di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
8878+ if (!di)
8879+ return NULL;
8880+
8881+ if (*pos == 0)
8882+ seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
8883+
8884+ di->entry = 0;
8885+ di->next = NULL;
8886+ di->ls = ls;
8887+
8888+ for (i = 0; i < *pos; i++)
8889+ if (next_rsb(di) == NULL)
8890+ return NULL;
8891+
8892+ return next_rsb(di);
8893+}
8894+
8895+static void *s_next(struct seq_file *m, void *p, loff_t * pos)
8896+{
8897+ struct ls_dumpinfo *di = p;
8898+
8899+ *pos += 1;
8900+
8901+ return next_rsb(di);
8902+}
8903+
8904+static int s_show(struct seq_file *m, void *p)
8905+{
8906+ struct ls_dumpinfo *di = p;
8907+ return print_resource(di->rsb, m);
8908+}
8909+
8910+static void s_stop(struct seq_file *m, void *p)
8911+{
8912+ kfree(p);
8913+}
8914+
8915+static struct seq_operations locks_info_op = {
8916+ start:s_start,
8917+ next:s_next,
8918+ stop:s_stop,
8919+ show:s_show
8920+};
8921+
8922+static char *print_lockmode(int mode)
8923+{
8924+ switch (mode) {
8925+ case DLM_LOCK_IV:
8926+ return "--";
8927+ case DLM_LOCK_NL:
8928+ return "NL";
8929+ case DLM_LOCK_CR:
8930+ return "CR";
8931+ case DLM_LOCK_CW:
8932+ return "CW";
8933+ case DLM_LOCK_PR:
8934+ return "PR";
8935+ case DLM_LOCK_PW:
8936+ return "PW";
8937+ case DLM_LOCK_EX:
8938+ return "EX";
8939+ default:
8940+ return "??";
8941+ }
8942+}
8943+
8944+static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res)
8945+{
8946+
8947+ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
8948+
8949+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8950+ || lkb->lkb_status == GDLM_LKSTS_WAITING)
8951+ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
8952+
8953+ if (lkb->lkb_range) {
8954+ /* This warns on Alpha. Tough. Only I see it */
8955+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8956+ || lkb->lkb_status == GDLM_LKSTS_GRANTED)
8957+ seq_printf(s, " %" PRIx64 "-%" PRIx64,
8958+ lkb->lkb_range[GR_RANGE_START],
8959+ lkb->lkb_range[GR_RANGE_END]);
8960+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8961+ || lkb->lkb_status == GDLM_LKSTS_WAITING)
8962+ seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
8963+ lkb->lkb_range[RQ_RANGE_START],
8964+ lkb->lkb_range[RQ_RANGE_END]);
8965+ }
8966+
8967+ if (lkb->lkb_nodeid) {
8968+ if (lkb->lkb_nodeid != res->res_nodeid)
8969+ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
8970+ lkb->lkb_remid);
8971+ else
8972+ seq_printf(s, " Master: %08x", lkb->lkb_remid);
8973+ }
8974+
8975+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
8976+ seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
8977+
8978+ seq_printf(s, "\n");
8979+}
8980+
8981+static int print_resource(gd_res_t *res, struct seq_file *s)
8982+{
8983+ int i;
8984+ struct list_head *locklist;
8985+
8986+ seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
8987+ res->res_parent, res->res_length);
8988+ for (i = 0; i < res->res_length; i++) {
8989+ if (isprint(res->res_name[i]))
8990+ seq_printf(s, "%c", res->res_name[i]);
8991+ else
8992+ seq_printf(s, "%c", '.');
8993+ }
8994+ if (res->res_nodeid)
8995+ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
8996+ res->res_nodeid);
8997+ else
8998+ seq_printf(s, "\" \nMaster Copy\n");
8999+
9000+ /* Print the LVB: */
9001+ if (res->res_lvbptr) {
9002+ seq_printf(s, "LVB: ");
9003+ for (i = 0; i < DLM_LVB_LEN; i++) {
9004+ if (i == DLM_LVB_LEN / 2)
9005+ seq_printf(s, "\n ");
9006+ seq_printf(s, "%02x ",
9007+ (unsigned char) res->res_lvbptr[i]);
9008+ }
9009+ seq_printf(s, "\n");
9010+ }
9011+
9012+ /* Print the locks attached to this resource */
9013+ seq_printf(s, "Granted Queue\n");
9014+ list_for_each(locklist, &res->res_grantqueue) {
9015+ gd_lkb_t *this_lkb =
9016+ list_entry(locklist, gd_lkb_t, lkb_statequeue);
9017+ print_lock(s, this_lkb, res);
9018+ }
9019+
9020+ seq_printf(s, "Conversion Queue\n");
9021+ list_for_each(locklist, &res->res_convertqueue) {
9022+ gd_lkb_t *this_lkb =
9023+ list_entry(locklist, gd_lkb_t, lkb_statequeue);
9024+ print_lock(s, this_lkb, res);
9025+ }
9026+
9027+ seq_printf(s, "Waiting Queue\n");
9028+ list_for_each(locklist, &res->res_waitqueue) {
9029+ gd_lkb_t *this_lkb =
9030+ list_entry(locklist, gd_lkb_t, lkb_statequeue);
9031+ print_lock(s, this_lkb, res);
9032+ }
9033+ return 0;
9034+}
9035+#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9036+
9037+void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...)
9038+{
9039+ va_list va;
9040+ int i, n, size, len;
9041+ char buf[MAX_DEBUG_MSG_LEN+1];
9042+
9043+ spin_lock(&debug_lock);
9044+
9045+ if (!debug_buf)
9046+ goto out;
9047+
9048+ size = MAX_DEBUG_MSG_LEN;
9049+ memset(buf, 0, size+1);
9050+
9051+ n = snprintf(buf, size, "%s ", ls->ls_name);
9052+ size -= n;
9053+
9054+ va_start(va, fmt);
9055+ vsnprintf(buf+n, size, fmt, va);
9056+ va_end(va);
9057+
9058+ len = strlen(buf);
9059+ if (len > MAX_DEBUG_MSG_LEN-1)
9060+ len = MAX_DEBUG_MSG_LEN-1;
9061+ buf[len] = '\n';
9062+ buf[len+1] = '\0';
9063+
9064+ for (i = 0; i < strlen(buf); i++) {
9065+ debug_buf[debug_point++] = buf[i];
9066+
9067+ if (debug_point == debug_size) {
9068+ debug_point = 0;
9069+ debug_wrap = 1;
9070+ }
9071+ }
9072+ out:
9073+ spin_unlock(&debug_lock);
9074+}
9075+
9076+void dlm_debug_dump(void)
9077+{
9078+ int i;
9079+
9080+ spin_lock(&debug_lock);
9081+ if (debug_wrap) {
9082+ for (i = debug_point; i < debug_size; i++)
9083+ printk("%c", debug_buf[i]);
9084+ }
9085+ for (i = 0; i < debug_point; i++)
9086+ printk("%c", debug_buf[i]);
9087+ spin_unlock(&debug_lock);
9088+}
9089+
9090+void dlm_debug_setup(int size)
9091+{
9092+ char *b = NULL;
9093+
9094+ if (size > PAGE_SIZE)
9095+ size = PAGE_SIZE;
9096+ if (size)
9097+ b = kmalloc(size, GFP_KERNEL);
9098+
9099+ spin_lock(&debug_lock);
9100+ if (debug_buf)
9101+ kfree(debug_buf);
9102+ if (!size || !b)
9103+ goto out;
9104+ debug_size = size;
9105+ debug_point = 0;
9106+ debug_wrap = 0;
9107+ debug_buf = b;
9108+ memset(debug_buf, 0, debug_size);
9109+ out:
9110+ spin_unlock(&debug_lock);
9111+}
9112+
9113+static void dlm_debug_init(void)
9114+{
9115+ debug_buf = NULL;
9116+ debug_size = 0;
9117+ debug_point = 0;
9118+ debug_wrap = 0;
9119+ spin_lock_init(&debug_lock);
9120+
9121+ dlm_debug_setup(DLM_DEBUG_SIZE);
9122+}
9123+
9124+#ifdef CONFIG_PROC_FS
9125+int dlm_debug_info(char *b, char **start, off_t offset, int length)
9126+{
9127+ int i, n = 0;
9128+
9129+ spin_lock(&debug_lock);
9130+
9131+ if (debug_wrap) {
9132+ for (i = debug_point; i < debug_size; i++)
9133+ n += sprintf(b + n, "%c", debug_buf[i]);
9134+ }
9135+ for (i = 0; i < debug_point; i++)
9136+ n += sprintf(b + n, "%c", debug_buf[i]);
9137+
9138+ spin_unlock(&debug_lock);
9139+
9140+ return n;
9141+}
9142+
9143+int dlm_rcom_info(char *b, char **start, off_t offset, int length)
9144+{
9145+ gd_ls_t *ls;
9146+ gd_csb_t *csb;
9147+ int n = 0;
9148+
9149+ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9150+ if (!ls)
9151+ return 0;
9152+
9153+ n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
9154+ "names_recv_count names_recv_msgid "
9155+ "locks_send_count locks_send_msgid "
9156+ "locks_recv_count locks_recv_msgid\n");
9157+
9158+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
9159+ n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
9160+ csb->csb_node->gn_nodeid,
9161+ csb->csb_names_send_count,
9162+ csb->csb_names_send_msgid,
9163+ csb->csb_names_recv_count,
9164+ csb->csb_names_recv_msgid,
9165+ csb->csb_locks_send_count,
9166+ csb->csb_locks_send_msgid,
9167+ csb->csb_locks_recv_count,
9168+ csb->csb_locks_recv_msgid);
9169+ }
9170+ return n;
9171+}
9172+#endif
9173+
9174+void dlm_proc_init(void)
9175+{
9176+#ifdef CONFIG_PROC_FS
9177+ debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
9178+ NULL);
9179+ if (!debug_proc_entry)
9180+ return;
9181+
9182+ debug_proc_entry->get_info = &dlm_debug_info;
9183+
9184+ rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
9185+ if (!rcom_proc_entry)
9186+ return;
9187+
9188+ rcom_proc_entry->get_info = &dlm_rcom_info;
9189+#endif
9190+ dlm_debug_init();
9191+
9192+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9193+ locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
9194+ S_IFREG | 0400,
9195+ NULL, NULL, NULL);
9196+ if (!locks_proc_entry)
9197+ return;
9198+ locks_proc_entry->proc_fops = &locks_fops;
9199+#endif
9200+}
9201+
9202+void dlm_proc_exit(void)
9203+{
9204+#ifdef CONFIG_PROC_FS
9205+ if (debug_proc_entry) {
9206+ remove_proc_entry("cluster/dlm_debug", NULL);
9207+ dlm_debug_setup(0);
9208+ }
9209+
9210+ if (rcom_proc_entry)
9211+ remove_proc_entry("cluster/dlm_rcom", NULL);
9212+#endif
9213+
9214+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9215+ if (locks_proc_entry)
9216+ remove_proc_entry("cluster/dlm_locks", NULL);
9217+#endif
9218+}
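dlm_debug_log() above appends into a fixed-size ring: debug_point is the write cursor and debug_wrap records that the buffer has cycled, so dlm_debug_dump() prints the oldest bytes (point to end) before the newest (start to point). A tiny user-space sketch of that ring follows; the 16-byte size and the function names are illustrative, the kernel buffer is up to a page.

#include <stdio.h>

#define RING_SIZE 16	/* illustrative; the kernel buffer is up to a page */

static char ring[RING_SIZE];
static unsigned int point;
static int wrapped;

static void ring_log(const char *s)
{
	while (*s) {
		ring[point++] = *s++;
		if (point == RING_SIZE) {
			point = 0;
			wrapped = 1;
		}
	}
}

static void ring_dump(void)
{
	unsigned int i;

	if (wrapped)
		for (i = point; i < RING_SIZE; i++)
			putchar(ring[i]);
	for (i = 0; i < point; i++)
		putchar(ring[i]);
	putchar('\n');
}

int main(void)
{
	ring_log("one ");
	ring_log("two ");
	ring_log("three ");
	ring_log("four ");
	ring_dump();	/* "one " has been overwritten by the wrap */
	return 0;
}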
9219diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
9220--- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9221+++ linux-patched/cluster/dlm/queries.c 2004-06-29 20:01:20.000000000 +0800
9222@@ -0,0 +1,696 @@
4bf12011 9223+/******************************************************************************
9224+*******************************************************************************
9225+**
9226+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9227+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9228+**
9229+** This copyrighted material is made available to anyone wishing to use,
9230+** modify, copy, or redistribute it subject to the terms and conditions
9231+** of the GNU General Public License v.2.
9232+**
9233+*******************************************************************************
9234+******************************************************************************/
9235+
9236+/*
9237+ * queries.c
9238+ *
9239+ * This file provides the kernel query interface to the DLM.
9240+ *
9241+ */
9242+
9243+#define EXPORT_SYMTAB
9244+#include <linux/module.h>
9245+
9246+#include "dlm_internal.h"
5cdbd17b 9247+#include "lockspace.h"
4bf12011 9248+#include "lockqueue.h"
9249+#include "locking.h"
9250+#include "lkb.h"
9251+#include "nodes.h"
9252+#include "dir.h"
9253+#include "ast.h"
9254+#include "memory.h"
9255+#include "lowcomms.h"
9256+#include "midcomms.h"
9257+#include "rsb.h"
9258+
9259+static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo);
9260+static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo);
9261+
9262+/*
9263+ * API entry point.
9264+ */
9265+int dlm_query(void *lockspace,
9266+ struct dlm_lksb *lksb,
9267+ int query,
9268+ struct dlm_queryinfo *qinfo,
9269+ void (ast_routine(void *)),
9270+ void *astarg)
9271+{
9272+ int status = -EINVAL;
9273+ gd_lkb_t *target_lkb;
9274+ gd_lkb_t *query_lkb = NULL; /* Our temporary LKB */
9275+ gd_ls_t *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace);
9276+
9277+
9278+ if (!qinfo)
9279+ goto out;
9280+ if (!ls)
9281+ goto out;
9282+ if (!ast_routine)
9283+ goto out;
9284+ if (!lksb)
9285+ goto out;
9286+
9287+ if (!qinfo->gqi_lockinfo)
9288+ qinfo->gqi_locksize = 0;
9289+
9290+ /* Find the lkid */
9291+ target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
9292+ if (!target_lkb)
9293+ goto out;
9294+
9295+ /* If the user wants a list of locks that are blocking or
9296+ not blocking this lock, then it must be waiting
9297+ for something
9298+ */
9299+ if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
9300+ (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
9301+ target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
9302+ return -EINVAL;
9303+
9304+ /* We now allocate an LKB for our own use (so we can hang
9305+ * things like the AST routine and the lksb from it) */
9306+ lksb->sb_status = -EBUSY;
9307+ query_lkb = create_lkb(ls);
9308+ if (!query_lkb) {
9309+ status = -ENOMEM;
9310+ goto out;
9311+ }
9312+ query_lkb->lkb_astaddr = ast_routine;
9313+ query_lkb->lkb_astparam = (long)astarg;
9314+ query_lkb->lkb_resource = target_lkb->lkb_resource;
9315+ query_lkb->lkb_lksb = lksb;
9316+
9317+ /* Don't free the resource while we are querying it. This ref
9318+ * will be dropped when the LKB is freed */
9319+ hold_rsb(query_lkb->lkb_resource);
9320+
9321+ /* Fill in the stuff that's always local */
9322+ if (qinfo->gqi_resinfo) {
9323+ if (target_lkb->lkb_resource->res_nodeid)
9324+ qinfo->gqi_resinfo->rsi_masternode =
9325+ target_lkb->lkb_resource->res_nodeid;
9326+ else
9327+ qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
9328+ qinfo->gqi_resinfo->rsi_length =
9329+ target_lkb->lkb_resource->res_length;
9330+ memcpy(qinfo->gqi_resinfo->rsi_name,
9331+ target_lkb->lkb_resource->res_name,
9332+ qinfo->gqi_resinfo->rsi_length);
9333+ }
9334+
9335+ /* If the master is local (or the user doesn't want the overhead of a
9336+ * remote call) - fill in the details here */
9337+ if (target_lkb->lkb_resource->res_nodeid == 0 ||
9338+ (query & DLM_QUERY_LOCAL)) {
9339+
9340+ status = 0;
9341+ /* Resource info */
9342+ if (qinfo->gqi_resinfo) {
9343+ query_resource(target_lkb->lkb_resource,
9344+ qinfo->gqi_resinfo);
9345+ }
9346+
9347+ /* Lock lists */
9348+ if (qinfo->gqi_lockinfo) {
9349+ status = query_locks(query, target_lkb, qinfo);
9350+ }
9351+
9352+ query_lkb->lkb_retstatus = status;
5cdbd17b 9353+ queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
4bf12011 9354+ wake_astd();
9355+
9356+ /* An AST will be delivered so we must return success here */
9357+ status = 0;
9358+ goto out;
9359+ }
9360+
9361+ /* Remote master */
9362+ if (target_lkb->lkb_resource->res_nodeid != 0)
9363+ {
9364+ struct gd_remquery *remquery;
9365+ struct writequeue_entry *e;
9366+
9367+ /* Clear this because the receiving end adds to it with
9368+ each incoming packet */
9369+ qinfo->gqi_lockcount = 0;
9370+
9371+ /* Squirrel a pointer to the query info struct
9372+ somewhere illegal */
9373+ query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo;
9374+
9375+ e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
9376+ sizeof(struct gd_remquery),
9377+ ls->ls_allocation,
9378+ (char **) &remquery);
9379+ if (!e) {
9380+ status = -ENOBUFS;
9381+ goto out;
9382+ }
9383+
9384+ /* Build remote packet */
9385+ memset(remquery, 0, sizeof(struct gd_remquery));
9386+
9387+ remquery->rq_maxlocks = qinfo->gqi_locksize;
9388+ remquery->rq_query = query;
9389+ remquery->rq_mstlkid = target_lkb->lkb_remid;
9390+ if (qinfo->gqi_lockinfo)
9391+ remquery->rq_maxlocks = qinfo->gqi_locksize;
9392+
9393+ remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
9394+ remquery->rq_header.rh_flags = 0;
9395+ remquery->rq_header.rh_length = sizeof(struct gd_remquery);
9396+ remquery->rq_header.rh_lkid = query_lkb->lkb_id;
9397+ remquery->rq_header.rh_lockspace = ls->ls_global_id;
9398+
9399+ midcomms_send_buffer(&remquery->rq_header, e);
9400+ status = 0;
9401+ }
9402+
9403+ out:
9404+
9405+ return status;
9406+}
9407+
9408+static inline int valid_range(struct dlm_range *r)
9409+{
9410+ if (r->ra_start != 0ULL ||
9411+ r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
9412+ return 1;
9413+ else
9414+ return 0;
9415+}
9416+
9417+static void put_int(int x, char *buf, int *offp)
9418+{
9419+ x = cpu_to_le32(x);
9420+ memcpy(buf + *offp, &x, sizeof(int));
9421+ *offp += sizeof(int);
9422+}
9423+
9424+static void put_int64(uint64_t x, char *buf, int *offp)
9425+{
9426+ x = cpu_to_le64(x);
9427+ memcpy(buf + *offp, &x, sizeof(uint64_t));
9428+ *offp += sizeof(uint64_t);
9429+}
9430+
9431+static int get_int(char *buf, int *offp)
9432+{
9433+ int value;
9434+ memcpy(&value, buf + *offp, sizeof(int));
9435+ *offp += sizeof(int);
9436+ return le32_to_cpu(value);
9437+}
9438+
9439+static uint64_t get_int64(char *buf, int *offp)
9440+{
9441+ uint64_t value;
9442+
9443+ memcpy(&value, buf + *offp, sizeof(uint64_t));
9444+ *offp += sizeof(uint64_t);
9445+ return le64_to_cpu(value);
9446+}
9447+
9448+#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
9449+
9450+/* Called from recvd to get lock info for a remote node */
9451+int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9452+{
9453+ struct gd_remquery *query = (struct gd_remquery *) msg;
9454+ struct gd_remqueryreply *reply;
9455+ struct dlm_resinfo resinfo;
9456+ struct dlm_queryinfo qinfo;
9457+ struct writequeue_entry *e;
9458+ char *buf;
9459+ gd_lkb_t *lkb;
9460+ int status = 0;
9461+ int bufidx;
9462+ int finished = 0;
9463+ int cur_lock = 0;
9464+ int start_lock = 0;
9465+
9466+ lkb = find_lock_by_id(ls, query->rq_mstlkid);
9467+ if (!lkb) {
9468+ status = -EINVAL;
9469+ goto send_error;
9470+ }
9471+
9472+ qinfo.gqi_resinfo = &resinfo;
9473+ qinfo.gqi_locksize = query->rq_maxlocks;
9474+
9475+ /* Get the resource bits */
9476+ query_resource(lkb->lkb_resource, &resinfo);
9477+
9478+ /* Now get the locks if wanted */
9479+ if (query->rq_maxlocks) {
9480+ qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
9481+ GFP_KERNEL);
9482+ if (!qinfo.gqi_lockinfo) {
9483+ status = -ENOMEM;
9484+ goto send_error;
9485+ }
9486+
9487+ status = query_locks(query->rq_query, lkb, &qinfo);
9488+ if (status && status != -E2BIG) {
9489+ kfree(qinfo.gqi_lockinfo);
9490+ goto send_error;
9491+ }
9492+ }
9493+ else {
9494+ qinfo.gqi_lockinfo = NULL;
9495+ qinfo.gqi_lockcount = 0;
9496+ }
9497+
9498+ /* Send as many blocks as needed for all the locks */
9499+ do {
9500+ int i;
9501+ int msg_len = sizeof(struct gd_remqueryreply);
9502+ int last_msg_len = msg_len; /* keeps compiler quiet */
9503+ int last_lock;
9504+
9505+ /* First work out how many locks we can fit into a block */
9506+ for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
9507+
9508+ last_msg_len = msg_len;
9509+
9510+ msg_len += LOCK_LEN;
9511+ if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
9512+ valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
9513+
9514+ msg_len += sizeof(uint64_t) * 4;
9515+ }
9516+ }
9517+
9518+ /* There must be a neater way of doing this... */
9519+ if (msg_len > PAGE_SIZE) {
9520+ last_lock = i-1;
9521+ msg_len = last_msg_len;
9522+ }
9523+ else {
9524+ last_lock = i;
9525+ }
9526+
9527+ e = lowcomms_get_buffer(nodeid,
9528+ msg_len,
9529+ ls->ls_allocation,
9530+ (char **) &reply);
9531+ if (!e) {
9532+ kfree(qinfo.gqi_lockinfo);
9533+ status = -ENOBUFS;
9534+ goto out;
9535+ }
9536+
9537+ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9538+ reply->rq_header.rh_length = msg_len;
9539+ reply->rq_header.rh_lkid = msg->rh_lkid;
9540+ reply->rq_header.rh_lockspace = msg->rh_lockspace;
9541+
9542+ reply->rq_status = status;
9543+ reply->rq_startlock = cur_lock;
9544+ reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
9545+ reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
9546+ reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
9547+ memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
9548+
9549+ buf = (char *)reply;
9550+ bufidx = sizeof(struct gd_remqueryreply);
9551+
9552+ for (; cur_lock < last_lock; cur_lock++) {
9553+
9554+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
9555+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
9556+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
9557+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
9558+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
9559+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
9560+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
9561+
9562+ if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
9563+ valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
9564+
9565+ buf[bufidx++] = 1;
9566+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
9567+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
9568+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
9569+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
9570+ }
9571+ else {
9572+ buf[bufidx++] = 0;
9573+ }
9574+ }
9575+
9576+ if (cur_lock == qinfo.gqi_lockcount) {
9577+ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
9578+ finished = 1;
9579+ }
9580+ else {
9581+ reply->rq_header.rh_flags = 0;
9582+ }
9583+
9584+ reply->rq_numlocks = cur_lock - start_lock;
9585+ start_lock = cur_lock;
9586+
9587+ midcomms_send_buffer(&reply->rq_header, e);
9588+ } while (!finished);
9589+
9590+ kfree(qinfo.gqi_lockinfo);
9591+ out:
9592+ return status;
9593+
9594+ send_error:
9595+ e = lowcomms_get_buffer(nodeid,
9596+ sizeof(struct gd_remqueryreply),
9597+ ls->ls_allocation,
9598+ (char **) &reply);
9599+ if (!e) {
9600+ status = -ENOBUFS;
9601+ goto out;
9602+ }
9603+ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9604+ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
9605+ reply->rq_header.rh_length = sizeof(struct gd_remqueryreply);
9606+ reply->rq_header.rh_lkid = msg->rh_lkid;
9607+ reply->rq_header.rh_lockspace = msg->rh_lockspace;
9608+ reply->rq_status = status;
9609+ reply->rq_numlocks = 0;
9610+ reply->rq_startlock = 0;
9611+ reply->rq_grantcount = 0;
9612+ reply->rq_convcount = 0;
9613+ reply->rq_waitcount = 0;
9614+
9615+ midcomms_send_buffer(&reply->rq_header, e);
9616+
9617+ return status;
9618+}
9619+
9620+/* Reply to a remote query */
9621+int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9622+{
9623+ gd_lkb_t *query_lkb;
9624+ struct dlm_queryinfo *qinfo;
9625+ struct gd_remqueryreply *reply;
9626+ char *buf;
9627+ int i;
9628+ int bufidx;
9629+
9630+ query_lkb = find_lock_by_id(ls, msg->rh_lkid);
9631+ if (!query_lkb)
9632+ return -EINVAL;
9633+
9634+ qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
9635+ reply = (struct gd_remqueryreply *) msg;
9636+
9637+ /* Copy the easy bits first */
9638+ qinfo->gqi_lockcount += reply->rq_numlocks;
9639+ if (qinfo->gqi_resinfo) {
9640+ qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
9641+ qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
9642+ qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
9643+ memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
9644+ DLM_LVB_LEN);
9645+ }
9646+
9647+ /* Now unpack the locks */
9648+ bufidx = sizeof(struct gd_remqueryreply);
9649+ buf = (char *) msg;
9650+
9651+ GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
9652+ printk("start = %d, num + %d. Max= %d\n",
9653+ reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
9654+
9655+ for (i = reply->rq_startlock;
9656+ i < reply->rq_startlock + reply->rq_numlocks; i++) {
9657+ qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
9658+ qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
9659+ qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
9660+ qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
9661+ qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
9662+ qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
9663+ qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
9664+ if (buf[bufidx++]) {
9665+ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
9666+ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
9667+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
9668+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
9669+ }
9670+ else {
9671+ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
9672+ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9673+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
9674+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9675+ }
9676+ }
9677+
 9678+ /* If this was the last block then tell the user */
9679+ if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
9680+ query_lkb->lkb_retstatus = reply->rq_status;
5cdbd17b 9681+ queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
4bf12011 9682+ wake_astd();
9683+ }
9684+
9685+ return 0;
9686+}
9687+
9688+/* Aggregate resource information */
9689+static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo)
9690+{
9691+ struct list_head *tmp;
9692+
9693+
9694+ if (rsb->res_lvbptr)
9695+ memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
9696+
9697+ resinfo->rsi_grantcount = 0;
9698+ list_for_each(tmp, &rsb->res_grantqueue) {
9699+ resinfo->rsi_grantcount++;
9700+ }
9701+
9702+ resinfo->rsi_waitcount = 0;
9703+ list_for_each(tmp, &rsb->res_waitqueue) {
9704+ resinfo->rsi_waitcount++;
9705+ }
9706+
9707+ resinfo->rsi_convcount = 0;
9708+ list_for_each(tmp, &rsb->res_convertqueue) {
9709+ resinfo->rsi_convcount++;
9710+ }
9711+
9712+ return 0;
9713+}
9714+
9715+static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9716+{
9717+ int entry;
9718+
9719+ /* Don't fill it in if the buffer is full */
9720+ if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
9721+ return -E2BIG;
9722+
9723+ /* gqi_lockcount contains the number of locks we have returned */
9724+ entry = qinfo->gqi_lockcount++;
9725+
9726+ /* Fun with master copies */
9727+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
9728+ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
9729+ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
9730+ }
9731+ else {
9732+ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
9733+ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
9734+ }
9735+
9736+ /* Also make sure we always have a valid nodeid in there, the
9737+ calling end may not know which node "0" is */
9738+ if (lkb->lkb_nodeid)
9739+ qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
9740+ else
9741+ qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
9742+
9743+ if (lkb->lkb_parent)
9744+ qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
9745+ else
9746+ qinfo->gqi_lockinfo[entry].lki_parent = 0;
9747+
9748+ qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
9749+ qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
9750+ qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
9751+
9752+ if (lkb->lkb_range) {
9753+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
9754+ lkb->lkb_range[GR_RANGE_START];
9755+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
9756+ lkb->lkb_range[GR_RANGE_END];
9757+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
9758+ lkb->lkb_range[RQ_RANGE_START];
9759+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
9760+ lkb->lkb_range[RQ_RANGE_END];
9761+ } else {
 9762+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
 9763+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
 9764+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
 9765+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
9766+ }
9767+ return 0;
9768+}
9769+
9770+static int query_lkb_queue(struct list_head *queue, int query,
9771+ struct dlm_queryinfo *qinfo)
9772+{
9773+ struct list_head *tmp;
9774+ int status = 0;
9775+ int mode = query & DLM_QUERY_MODE_MASK;
9776+
9777+ list_for_each(tmp, queue) {
9778+ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9779+ int lkmode;
9780+
9781+ if (query & DLM_QUERY_RQMODE)
9782+ lkmode = lkb->lkb_rqmode;
9783+ else
9784+ lkmode = lkb->lkb_grmode;
9785+
9786+ /* Add the LKB info to the list if it matches the criteria in
9787+ * the query bitmap */
9788+ switch (query & DLM_QUERY_MASK) {
9789+ case DLM_QUERY_LOCKS_ALL:
9790+ status = add_lock(lkb, qinfo);
9791+ break;
9792+
9793+ case DLM_QUERY_LOCKS_HIGHER:
9794+ if (lkmode > mode)
9795+ status = add_lock(lkb, qinfo);
9796+ break;
9797+
9798+ case DLM_QUERY_LOCKS_EQUAL:
9799+ if (lkmode == mode)
9800+ status = add_lock(lkb, qinfo);
9801+ break;
9802+
9803+ case DLM_QUERY_LOCKS_LOWER:
9804+ if (lkmode < mode)
9805+ status = add_lock(lkb, qinfo);
9806+ break;
9807+ }
9808+ }
9809+ return status;
9810+}
9811+
9812+/*
9813+ * Return 1 if the locks' ranges overlap
9814+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
9815+ */
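+/*
+ * e.g. a request over [0,99] does not overlap a grant held over [100,199],
+ * while a request over [50,150] does.
+ */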
9816+static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
9817+{
9818+ if (!lkb1->lkb_range || !lkb2->lkb_range)
9819+ return 1;
9820+
9821+ if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
9822+ lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
9823+ return 0;
9824+
9825+ return 1;
9826+}
9827+extern const int __dlm_compat_matrix[8][8];
9828+
9829+
9830+static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9831+{
9832+ struct list_head *tmp;
9833+ int status = 0;
9834+
9835+ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9836+ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9837+
9838+ if (ranges_overlap(lkb, qlkb) &&
9839+ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
9840+ status = add_lock(lkb, qinfo);
9841+ }
9842+
9843+ return status;
9844+}
9845+
9846+static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9847+{
9848+ struct list_head *tmp;
9849+ int status = 0;
9850+
9851+ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9852+ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9853+
9854+ if (!(ranges_overlap(lkb, qlkb) &&
9855+ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
9856+ status = add_lock(lkb, qinfo);
9857+ }
9858+
9859+ return status;
9860+}
9861+
9862+/* Gather a list of appropriate locks */
9863+static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9864+{
9865+ int status = 0;
9866+
9867+
 9868+ /* Mask in the actual granted/requested mode of the lock if LOCK_THIS
9869+ * was requested as the mode
9870+ */
9871+ if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
9872+ query &= ~DLM_QUERY_MODE_MASK;
9873+ if (query & DLM_QUERY_RQMODE)
9874+ query |= lkb->lkb_rqmode;
9875+ else
9876+ query |= lkb->lkb_grmode;
9877+ }
9878+
9879+ qinfo->gqi_lockcount = 0;
9880+
9881+ /* BLOCKING/NOTBLOCK only look at the granted queue */
9882+ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
9883+ return get_blocking_locks(lkb, qinfo);
9884+
9885+ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
9886+ return get_nonblocking_locks(lkb, qinfo);
9887+
9888+ /* Do the lock queues that were requested */
9889+ if (query & DLM_QUERY_QUEUE_GRANT) {
9890+ status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
9891+ query, qinfo);
9892+ }
9893+
9894+ if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
9895+ status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
9896+ query, qinfo);
9897+ }
9898+
9899+ if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
9900+ status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
9901+ query, qinfo);
9902+ }
9903+
9904+
9905+ return status;
9906+}
9907+
9908+EXPORT_SYMBOL(dlm_query);
9909+/*
9910+ * Overrides for Emacs so that we follow Linus's tabbing style.
9911+ * Emacs will notice this stuff at the end of the file and automatically
9912+ * adjust the settings for this buffer only. This must remain at the end
9913+ * of the file.
9914+ * ---------------------------------------------------------------------------
9915+ * Local variables:
9916+ * c-file-style: "linux"
9917+ * End:
9918+ */
9919diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
9920--- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9921+++ linux-patched/cluster/dlm/queries.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 9922@@ -0,0 +1,20 @@
9923+/******************************************************************************
9924+*******************************************************************************
9925+**
9926+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9927+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9928+**
9929+** This copyrighted material is made available to anyone wishing to use,
9930+** modify, copy, or redistribute it subject to the terms and conditions
9931+** of the GNU General Public License v.2.
9932+**
9933+*******************************************************************************
9934+******************************************************************************/
9935+
9936+#ifndef __QUERIES_DOT_H__
9937+#define __QUERIES_DOT_H__
9938+
9939+extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9940+extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9941+
9942+#endif /* __QUERIES_DOT_H__ */
9943diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
9944--- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
9945+++ linux-patched/cluster/dlm/rebuild.c 2004-06-29 20:01:20.000000000 +0800
9946@@ -0,0 +1,1245 @@
4bf12011 9947+/******************************************************************************
9948+*******************************************************************************
9949+**
9950+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9951+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9952+**
9953+** This copyrighted material is made available to anyone wishing to use,
9954+** modify, copy, or redistribute it subject to the terms and conditions
9955+** of the GNU General Public License v.2.
9956+**
9957+*******************************************************************************
9958+******************************************************************************/
9959+
9960+/*
9961+ * Rebuild RSB's on new masters. Functions for transferring locks and
9962+ * subresources to new RSB masters during recovery.
9963+ */
9964+
9965+#include "dlm_internal.h"
9966+#include "reccomms.h"
9967+#include "lkb.h"
9968+#include "rsb.h"
9969+#include "nodes.h"
9970+#include "config.h"
9971+#include "memory.h"
9972+#include "recover.h"
9973+
9974+
9975+/* Types of entity serialised in remastering messages */
9976+#define REMASTER_ROOTRSB 1
9977+#define REMASTER_RSB 2
9978+#define REMASTER_LKB 3
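+/*
+ * A remastering buffer is a stream of tagged records: a one-byte type
+ * (REMASTER_ROOTRSB, REMASTER_RSB or REMASTER_LKB) followed by an rsb
+ * serialised by serialise_rsb() or an lkb serialised by serialise_lkb().
+ * The stream is built by pack_rsb_tree() and friends and unpacked again in
+ * rebuild_rsbs_recv().
+ */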
9979+
9980+struct rcom_fill {
9981+ char * outbuf; /* Beginning of data */
9982+ int offset; /* Current offset into outbuf */
9983+ int maxlen; /* Max value of offset */
9984+ int remasterid;
9985+ int count;
9986+ gd_res_t * rsb;
9987+ gd_res_t * subrsb;
9988+ gd_lkb_t * lkb;
9989+ struct list_head * lkbqueue;
9990+ char more;
9991+};
9992+typedef struct rcom_fill rcom_fill_t;
9993+
9994+
9995+struct rebuild_node {
9996+ struct list_head list;
9997+ int nodeid;
9998+ gd_res_t * rootrsb;
9999+};
10000+typedef struct rebuild_node rebuild_node_t;
10001+
10002+
10003+/*
10004+ * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10005+ * master. The rsb will be "done" with recovery when the new master has
10006+ * replied with all the new remote lockid's for this rsb's lkb's.
10007+ */
10008+
10009+void expect_new_lkids(gd_res_t *rsb)
10010+{
10011+ rsb->res_newlkid_expect = 0;
10012+ recover_list_add(rsb);
10013+}
10014+
10015+/*
10016+ * This function is called on root rsb or subrsb when another lkb is being sent
10017+ * to the new master for which we expect to receive a corresponding remote lkid
10018+ */
10019+
10020+void need_new_lkid(gd_res_t *rsb)
10021+{
10022+ gd_res_t *root = rsb;
10023+
10024+ if (rsb->res_parent)
10025+ root = rsb->res_root;
10026+
10027+ if (!root->res_newlkid_expect)
10028+ recover_list_add(root);
10029+ else
10030+ GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10031+
10032+ root->res_newlkid_expect++;
10033+}
10034+
10035+/*
10036+ * This function is called for each lkb for which a new remote lkid is
10037+ * received. Decrement the expected number of remote lkids expected for the
10038+ * root rsb.
10039+ */
10040+
10041+void have_new_lkid(gd_lkb_t *lkb)
10042+{
10043+ gd_res_t *root = lkb->lkb_resource;
10044+
10045+ if (root->res_parent)
10046+ root = root->res_root;
10047+
10048+ down_write(&root->res_lock);
10049+
10050+ GDLM_ASSERT(root->res_newlkid_expect,
10051+ printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10052+
10053+ root->res_newlkid_expect--;
10054+
10055+ if (!root->res_newlkid_expect) {
10056+ clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10057+ recover_list_del(root);
10058+ }
10059+ up_write(&root->res_lock);
10060+}
10061+
10062+/*
10063+ * Return the rebuild struct for a node - will create an entry on the rootrsb
10064+ * list if necessary.
10065+ *
10066+ * Currently no locking is needed here as it all happens in the gdlm_recvd
10067+ * thread
10068+ */
10069+
10070+static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid)
10071+{
10072+ rebuild_node_t *node = NULL;
10073+
10074+ list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10075+ if (node->nodeid == nodeid)
10076+ return node;
10077+ }
10078+
10079+ /* Not found, add one */
10080+ node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10081+ if (!node)
10082+ return NULL;
10083+
10084+ node->nodeid = nodeid;
10085+ node->rootrsb = NULL;
10086+ list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10087+
10088+ return node;
10089+}
10090+
10091+/*
10092+ * Tidy up after a rebuild run. Called when all recovery has finished
10093+ */
10094+
10095+void rebuild_freemem(gd_ls_t *ls)
10096+{
10097+ rebuild_node_t *node = NULL, *s;
10098+
10099+ list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10100+ list_del(&node->list);
10101+ kfree(node);
10102+ }
10103+}
10104+
10105+static void put_int(int x, char *buf, int *offp)
10106+{
10107+ x = cpu_to_le32(x);
10108+ memcpy(buf + *offp, &x, sizeof(int));
10109+ *offp += sizeof(int);
10110+}
10111+
10112+static void put_int64(uint64_t x, char *buf, int *offp)
10113+{
10114+ x = cpu_to_le64(x);
10115+ memcpy(buf + *offp, &x, sizeof(uint64_t));
10116+ *offp += sizeof(uint64_t);
10117+}
10118+
10119+static void put_bytes(char *x, int len, char *buf, int *offp)
10120+{
10121+ put_int(len, buf, offp);
10122+ memcpy(buf + *offp, x, len);
10123+ *offp += len;
10124+}
10125+
10126+static void put_char(char x, char *buf, int *offp)
10127+{
10128+ buf[*offp] = x;
10129+ *offp += 1;
10130+}
10131+
10132+static int get_int(char *buf, int *offp)
10133+{
10134+ int value;
10135+ memcpy(&value, buf + *offp, sizeof(int));
10136+ *offp += sizeof(int);
10137+ return le32_to_cpu(value);
10138+}
10139+
10140+static uint64_t get_int64(char *buf, int *offp)
10141+{
10142+ uint64_t value;
10143+
10144+ memcpy(&value, buf + *offp, sizeof(uint64_t));
10145+ *offp += sizeof(uint64_t);
10146+ return le64_to_cpu(value);
10147+}
10148+
10149+static char get_char(char *buf, int *offp)
10150+{
10151+ char x = buf[*offp];
10152+
10153+ *offp += 1;
10154+ return x;
10155+}
10156+
10157+static void get_bytes(char *bytes, int *len, char *buf, int *offp)
10158+{
10159+ *len = get_int(buf, offp);
10160+ memcpy(bytes, buf + *offp, *len);
10161+ *offp += *len;
10162+}
10163+
10164+static int lkb_length(gd_lkb_t *lkb)
10165+{
10166+ int len = 0;
10167+
10168+ len += sizeof(int); /* lkb_id */
 10169+ len += sizeof(int); /* lkb_resource->res_remasterid */
10170+ len += sizeof(int); /* lkb_flags */
10171+ len += sizeof(int); /* lkb_status */
10172+ len += sizeof(char); /* lkb_rqmode */
10173+ len += sizeof(char); /* lkb_grmode */
10174+ len += sizeof(int); /* lkb_childcnt */
10175+ len += sizeof(int); /* lkb_parent->lkb_id */
10176+ len += sizeof(int); /* lkb_bastaddr */
10177+
10178+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10179+ len += sizeof(int); /* number of lvb bytes */
10180+ len += DLM_LVB_LEN;
10181+ }
10182+
10183+ if (lkb->lkb_range) {
10184+ len += sizeof(uint64_t);
10185+ len += sizeof(uint64_t);
10186+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
10187+ len += sizeof(uint64_t);
10188+ len += sizeof(uint64_t);
10189+ }
10190+ }
10191+
10192+ return len;
10193+}
10194+
10195+/*
10196+ * It's up to the caller to be sure there's enough space in the buffer.
10197+ */
10198+
10199+static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp)
10200+{
10201+ int flags;
10202+
10203+ /* Need to tell the remote end if we have a range */
10204+ flags = lkb->lkb_flags;
10205+ if (lkb->lkb_range)
10206+ flags |= GDLM_LKFLG_RANGE;
10207+
10208+ /*
10209+ * See lkb_length()
 10210+ * Total: 30 (no lvb) or 66 (with lvb) bytes, plus 16 or 32 bytes if a range is sent
10211+ */
10212+
10213+ put_int(lkb->lkb_id, buf, offp);
10214+ put_int(lkb->lkb_resource->res_remasterid, buf, offp);
10215+ put_int(flags, buf, offp);
10216+ put_int(lkb->lkb_status, buf, offp);
10217+ put_char(lkb->lkb_rqmode, buf, offp);
10218+ put_char(lkb->lkb_grmode, buf, offp);
10219+ put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
10220+
10221+ if (lkb->lkb_parent)
10222+ put_int(lkb->lkb_parent->lkb_id, buf, offp);
10223+ else
10224+ put_int(0, buf, offp);
10225+
10226+ if (lkb->lkb_bastaddr)
10227+ put_int(1, buf, offp);
10228+ else
10229+ put_int(0, buf, offp);
10230+
10231+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10232+ GDLM_ASSERT(lkb->lkb_lvbptr,);
10233+ put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
10234+ }
10235+
10236+ /* Only send the range we actually need */
10237+ if (lkb->lkb_range) {
10238+ switch (lkb->lkb_status) {
10239+ case GDLM_LKSTS_CONVERT:
10240+ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10241+ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10242+ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10243+ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10244+ break;
10245+ case GDLM_LKSTS_WAITING:
10246+ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10247+ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10248+ break;
10249+ case GDLM_LKSTS_GRANTED:
10250+ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10251+ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10252+ break;
10253+ default:
10254+ GDLM_ASSERT(0,);
10255+ }
10256+ }
10257+}
10258+
10259+static int rsb_length(gd_res_t *rsb)
10260+{
10261+ int len = 0;
10262+
10263+ len += sizeof(int); /* number of res_name bytes */
10264+ len += rsb->res_length; /* res_name */
10265+ len += sizeof(int); /* res_remasterid */
10266+ len += sizeof(int); /* res_parent->res_remasterid */
10267+
10268+ return len;
10269+}
10270+
10271+static inline gd_res_t *next_subrsb(gd_res_t *subrsb)
10272+{
10273+ struct list_head *tmp;
10274+ gd_res_t *r;
10275+
10276+ tmp = subrsb->res_subreslist.next;
10277+ r = list_entry(tmp, gd_res_t, res_subreslist);
10278+
10279+ return r;
10280+}
10281+
10282+static inline int last_in_list(gd_res_t *r, struct list_head *head)
10283+{
10284+ gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist);
10285+
10286+ if (last == r)
10287+ return 1;
10288+ return 0;
10289+}
10290+
10291+/*
10292+ * Used to decide if an rsb should be rebuilt on a new master. An rsb only
 10293+ * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's on the
10294+ * wait queue are not rebuilt.
10295+ */
10296+
10297+static int lkbs_to_remaster(gd_res_t *r)
10298+{
10299+ gd_lkb_t *lkb;
10300+ gd_res_t *sub;
10301+
10302+ if (!list_empty(&r->res_grantqueue) ||
10303+ !list_empty(&r->res_convertqueue))
10304+ return TRUE;
10305+
10306+ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
10307+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10308+ continue;
10309+ return TRUE;
10310+ }
10311+
10312+ list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
10313+ if (!list_empty(&sub->res_grantqueue) ||
10314+ !list_empty(&sub->res_convertqueue))
10315+ return TRUE;
10316+
10317+ list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
10318+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10319+ continue;
10320+ return TRUE;
10321+ }
10322+ }
10323+
10324+ return FALSE;
10325+}
10326+
10327+static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp)
10328+{
10329+ /*
10330+ * See rsb_length()
10331+ * Total: 36 bytes (4 + 24 + 4 + 4)
10332+ */
10333+
10334+ put_bytes(rsb->res_name, rsb->res_length, buf, offp);
10335+ put_int(rsb->res_remasterid, buf, offp);
10336+
10337+ if (rsb->res_parent)
10338+ put_int(rsb->res_parent->res_remasterid, buf, offp);
10339+ else
10340+ put_int(0, buf, offp);
10341+
10342+ GDLM_ASSERT(!rsb->res_lvbptr,);
10343+}
10344+
10345+/*
10346+ * Flatten an LKB into a buffer for sending to the new RSB master. As a
10347+ * side-effect the nodeid of the lock is set to the nodeid of the new RSB
10348+ * master.
10349+ */
10350+
10351+static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill)
10352+{
10353+ if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
10354+ goto nospace;
10355+
10356+ lkb->lkb_nodeid = r->res_nodeid;
10357+
10358+ put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
10359+ serialise_lkb(lkb, fill->outbuf, &fill->offset);
10360+
10361+ fill->count++;
10362+ need_new_lkid(r);
10363+ return 0;
10364+
10365+ nospace:
10366+ return -ENOSPC;
10367+}
10368+
10369+/*
10370+ * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
10371+ */
10372+
10373+static int pack_lkb_queue(gd_res_t *r, struct list_head *queue,
10374+ rcom_fill_t *fill)
10375+{
10376+ gd_lkb_t *lkb;
10377+ int error;
10378+
10379+ list_for_each_entry(lkb, queue, lkb_statequeue) {
10380+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10381+ continue;
10382+
10383+ error = pack_one_lkb(r, lkb, fill);
10384+ if (error)
10385+ goto nospace;
10386+ }
10387+
10388+ return 0;
10389+
10390+ nospace:
10391+ fill->lkb = lkb;
10392+ fill->lkbqueue = queue;
10393+
10394+ return error;
10395+}
10396+
10397+static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill)
10398+{
10399+ int error;
10400+
10401+ error = pack_lkb_queue(r, &r->res_grantqueue, fill);
10402+ if (error)
10403+ goto nospace;
10404+
10405+ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10406+ if (error)
10407+ goto nospace;
10408+
10409+ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10410+
10411+ nospace:
10412+ return error;
10413+}
10414+
10415+/*
10416+ * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
10417+ * queue and full lkb queues.
10418+ */
10419+
10420+static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill)
10421+{
10422+ struct list_head *tmp, *start, *end;
10423+ gd_lkb_t *lkb;
10424+ int error;
10425+
10426+ /*
10427+ * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
10428+ */
10429+
10430+ error = pack_one_lkb(r, fill->lkb, fill);
10431+ if (error)
10432+ goto out;
10433+
10434+ start = fill->lkb->lkb_statequeue.next;
10435+ end = fill->lkbqueue;
10436+
10437+ for (tmp = start; tmp != end; tmp = tmp->next) {
10438+ lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
10439+
10440+ error = pack_one_lkb(r, lkb, fill);
10441+ if (error) {
10442+ fill->lkb = lkb;
10443+ goto out;
10444+ }
10445+ }
10446+
10447+ /*
10448+ * Pack all lkb's on r's queues following fill->lkbqueue.
10449+ */
10450+
10451+ if (fill->lkbqueue == &r->res_waitqueue)
10452+ goto out;
10453+ if (fill->lkbqueue == &r->res_convertqueue)
10454+ goto skip;
10455+
10456+ GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
10457+
10458+ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10459+ if (error)
10460+ goto out;
10461+ skip:
10462+ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10463+
10464+ out:
10465+ return error;
10466+}
10467+
10468+static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill)
10469+{
10470+ int error;
10471+
10472+ down_write(&subrsb->res_lock);
10473+
10474+ if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
10475+ goto nospace;
10476+
10477+ subrsb->res_nodeid = rsb->res_nodeid;
10478+ subrsb->res_remasterid = ++fill->remasterid;
10479+
10480+ put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
10481+ serialise_rsb(subrsb, fill->outbuf, &fill->offset);
10482+
10483+ error = pack_lkb_queues(subrsb, fill);
10484+ if (error)
10485+ goto nospace;
10486+
10487+ up_write(&subrsb->res_lock);
10488+
10489+ return 0;
10490+
10491+ nospace:
10492+ up_write(&subrsb->res_lock);
10493+ fill->subrsb = subrsb;
10494+
10495+ return -ENOSPC;
10496+}
10497+
10498+static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill)
10499+{
10500+ gd_res_t *subrsb;
10501+ int error = 0;
10502+
10503+ /*
10504+ * When an initial subrsb is given, we know it needs to be packed.
10505+ * When no initial subrsb is given, begin with the first (if any exist).
10506+ */
10507+
10508+ if (!in_subrsb) {
10509+ if (list_empty(&rsb->res_subreslist))
10510+ goto out;
10511+
10512+ subrsb = list_entry(rsb->res_subreslist.next, gd_res_t,
10513+ res_subreslist);
10514+ } else
10515+ subrsb = in_subrsb;
10516+
10517+ for (;;) {
10518+ error = pack_one_subrsb(rsb, subrsb, fill);
10519+ if (error)
10520+ goto out;
10521+
10522+ if (last_in_list(subrsb, &rsb->res_subreslist))
10523+ break;
10524+
10525+ subrsb = next_subrsb(subrsb);
10526+ }
10527+
10528+ out:
10529+ return error;
10530+}
10531+
10532+/*
10533+ * Finish packing whatever is left in an rsb tree. If space runs out while
10534+ * finishing, save subrsb/lkb and this will be called again for the same rsb.
10535+ *
10536+ * !subrsb && lkb, we left off part way through root rsb's lkbs.
10537+ * subrsb && !lkb, we left off just before starting a new subrsb.
10538+ * subrsb && lkb, we left off part way through a subrsb's lkbs.
10539+ * !subrsb && !lkb, we shouldn't be in this function, but starting
10540+ * a new rsb in pack_rsb_tree().
10541+ */
10542+
10543+static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb,
10544+ rcom_fill_t *fill)
10545+{
10546+ gd_res_t *subrsb = NULL;
10547+ int error = 0;
10548+
10549+ if (!fill->subrsb && fill->lkb) {
10550+ error = pack_lkb_remaining(rsb, fill);
10551+ if (error)
10552+ goto out;
10553+
10554+ error = pack_subrsbs(rsb, NULL, fill);
10555+ if (error)
10556+ goto out;
10557+ }
10558+
10559+ else if (fill->subrsb && !fill->lkb) {
10560+ error = pack_subrsbs(rsb, fill->subrsb, fill);
10561+ if (error)
10562+ goto out;
10563+ }
10564+
10565+ else if (fill->subrsb && fill->lkb) {
10566+ error = pack_lkb_remaining(fill->subrsb, fill);
10567+ if (error)
10568+ goto out;
10569+
10570+ if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
10571+ goto out;
10572+
10573+ subrsb = next_subrsb(fill->subrsb);
10574+
10575+ error = pack_subrsbs(rsb, subrsb, fill);
10576+ if (error)
10577+ goto out;
10578+ }
10579+
10580+ fill->subrsb = NULL;
10581+ fill->lkb = NULL;
10582+
10583+ out:
10584+ return error;
10585+}
10586+
10587+/*
10588+ * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
10589+ * buffer. When the buffer runs out of space, save the place to restart (the
10590+ * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
10591+ */
10592+
10593+static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill)
10594+{
10595+ int error = -ENOSPC;
10596+
10597+ fill->remasterid = 0;
10598+
10599+ /*
10600+ * Pack the root rsb itself. A 1 byte type precedes the serialised
10601+ * rsb. Then pack the lkb's for the root rsb.
10602+ */
10603+
10604+ down_write(&rsb->res_lock);
10605+
10606+ if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
10607+ goto out;
10608+
10609+ rsb->res_remasterid = ++fill->remasterid;
10610+ put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
10611+ serialise_rsb(rsb, fill->outbuf, &fill->offset);
10612+
10613+ error = pack_lkb_queues(rsb, fill);
10614+ if (error)
10615+ goto out;
10616+
10617+ up_write(&rsb->res_lock);
10618+
10619+ /*
10620+ * Pack subrsb/lkb's under the root rsb.
10621+ */
10622+
10623+ error = pack_subrsbs(rsb, NULL, fill);
10624+
10625+ return error;
10626+
10627+ out:
10628+ up_write(&rsb->res_lock);
10629+ return error;
10630+}
10631+
10632+/*
10633+ * Given an RSB, return the next RSB that should be sent to a new master.
10634+ */
10635+
10636+static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb)
10637+{
10638+ struct list_head *tmp, *start, *end;
10639+ gd_res_t *r;
10640+
10641+ if (!rsb)
10642+ start = ls->ls_rootres.next;
10643+ else
10644+ start = rsb->res_rootlist.next;
10645+
10646+ end = &ls->ls_rootres;
10647+
10648+ for (tmp = start; tmp != end; tmp = tmp->next) {
10649+ r = list_entry(tmp, gd_res_t, res_rootlist);
10650+
10651+ if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
10652+ if (r->res_nodeid && lkbs_to_remaster(r)) {
10653+ expect_new_lkids(r);
10654+ return r;
10655+ } else
10656+ clear_bit(RESFL_NEW_MASTER, &r->res_flags);
10657+ }
10658+ }
10659+
10660+ return NULL;
10661+}
10662+
10663+/*
10664+ * Given an rcom buffer, fill it with RSB's that need to be sent to a single
10665+ * new master node. In the case where all the data to send to one node
10666+ * requires multiple messages, this function needs to resume filling each
10667+ * successive buffer from the point where it left off when the previous buffer
10668+ * filled up.
10669+ */
10670+
10671+static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid)
10672+{
10673+ gd_res_t *rsb, *prev_rsb = fill->rsb;
10674+ int error;
10675+
10676+ fill->offset = 0;
10677+
10678+ if (!prev_rsb) {
10679+
10680+ /*
10681+ * The first time this function is called.
10682+ */
10683+
10684+ rsb = next_remastered_rsb(ls, NULL);
10685+ if (!rsb)
10686+ goto no_more;
10687+
10688+ } else if (fill->subrsb || fill->lkb) {
10689+
10690+ /*
10691+ * Continue packing an rsb tree that was partially packed last
10692+ * time (fill->subrsb/lkb indicates where packing of last block
10693+ * left off)
10694+ */
10695+
10696+ rsb = prev_rsb;
10697+ *nodeid = rsb->res_nodeid;
10698+
10699+ error = pack_rsb_tree_remaining(ls, rsb, fill);
10700+ if (error == -ENOSPC)
10701+ goto more;
10702+
10703+ rsb = next_remastered_rsb(ls, prev_rsb);
10704+ if (!rsb)
10705+ goto no_more;
10706+
10707+ if (rsb->res_nodeid != prev_rsb->res_nodeid)
10708+ goto more;
10709+ } else {
10710+ rsb = prev_rsb;
10711+ }
10712+
10713+ /*
10714+ * Pack rsb trees into the buffer until we run out of space, run out of
10715+ * new rsb's or hit a new nodeid.
10716+ */
10717+
10718+ *nodeid = rsb->res_nodeid;
10719+
10720+ for (;;) {
10721+ error = pack_rsb_tree(ls, rsb, fill);
10722+ if (error == -ENOSPC)
10723+ goto more;
10724+
10725+ prev_rsb = rsb;
10726+
10727+ rsb = next_remastered_rsb(ls, prev_rsb);
10728+ if (!rsb)
10729+ goto no_more;
10730+
10731+ if (rsb->res_nodeid != prev_rsb->res_nodeid)
10732+ goto more;
10733+ }
10734+
10735+ more:
10736+ fill->more = 1;
10737+ fill->rsb = rsb;
10738+ return;
10739+
10740+ no_more:
10741+ fill->more = 0;
10742+}
10743+
10744+/*
10745+ * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
10746+ */
10747+
10748+int rebuild_rsbs_send(gd_ls_t *ls)
10749+{
10750+ gd_rcom_t *rc;
10751+ rcom_fill_t fill;
10752+ uint32_t nodeid;
10753+ int error;
10754+
10755+ GDLM_ASSERT(recover_list_empty(ls),);
10756+
10757+ log_all(ls, "rebuild locks");
10758+
10759+ error = -ENOMEM;
10760+ rc = allocate_rcom_buffer(ls);
10761+ if (!rc)
10762+ goto ret;
10763+
10764+ error = 0;
10765+ memset(&fill, 0, sizeof(rcom_fill_t));
10766+ fill.outbuf = rc->rc_buf;
10767+ fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
10768+
10769+ do {
10770+ fill_rcom_buffer(ls, &fill, &nodeid);
10771+ if (!fill.offset)
10772+ break;
10773+
10774+ rc->rc_datalen = fill.offset;
10775+ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
10776+ if (error)
10777+ goto out;
10778+
10779+ schedule();
10780+ error = gdlm_recovery_stopped(ls);
10781+ if (error)
10782+ goto out;
10783+ }
10784+ while (fill.more);
10785+
10786+ error = gdlm_wait_function(ls, &recover_list_empty);
10787+
10788+ log_all(ls, "rebuilt %d locks", fill.count);
10789+
10790+ out:
10791+ rebuild_freemem(ls);
10792+ free_rcom_buffer(rc);
10793+
10794+ ret:
10795+ return error;
10796+}
10797+
10798+static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid,
10799+ gd_res_t *rootrsb)
10800+{
10801+ gd_res_t *rsb;
10802+
10803+ GDLM_ASSERT(rootrsb,);
10804+
10805+ if (rootrsb->res_remasterid == remasterid) {
10806+ rsb = rootrsb;
10807+ goto out;
10808+ }
10809+
10810+ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10811+ if (rsb->res_remasterid == remasterid)
10812+ goto out;
10813+ }
10814+ rsb = NULL;
10815+
10816+ out:
10817+ return rsb;
10818+}
10819+
10820+/*
10821+ * Search a queue for the given remote lock id (remlkid).
10822+ */
10823+
10824+static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid,
10825+ int remid)
10826+{
10827+ gd_lkb_t *lkb;
10828+
10829+ list_for_each_entry(lkb, statequeue, lkb_statequeue) {
10830+ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
10831+ return lkb;
10832+ }
10833+ }
10834+
10835+ return NULL;
10836+}
10837+
10838+/*
10839+ * Given a remote lock ID (and a parent resource), return the local LKB for it
 10840+ * Hopefully we don't need to do this too often on deep lock trees. This is
10841+ * VERY suboptimal for anything but the smallest lock trees. It searches the
10842+ * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
10843+ * returns the LKB address. OPTIMISATION: we should keep a list of these while
10844+ * we are building up the remastered LKBs
10845+ */
10846+
10847+static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid)
10848+{
10849+ gd_lkb_t *lkb;
10850+ gd_res_t *rsb;
10851+
10852+ lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
10853+ if (lkb)
10854+ goto out;
10855+
10856+ lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
10857+ if (lkb)
10858+ goto out;
10859+
10860+ lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
10861+ if (lkb)
10862+ goto out;
10863+
10864+ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10865+ lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
10866+ if (lkb)
10867+ goto out;
10868+
10869+ lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
10870+ if (lkb)
10871+ goto out;
10872+
10873+ lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
10874+ if (lkb)
10875+ goto out;
10876+ }
10877+ lkb = NULL;
10878+
10879+ out:
10880+ return lkb;
10881+}
10882+
10883+/*
10884+ * Unpack an LKB from a remaster operation
10885+ */
10886+
10887+static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb,
10888+ char *buf, int *ptr, char *outbuf, int *outoffp)
10889+{
10890+ gd_lkb_t *lkb;
10891+ gd_res_t *rsb;
10892+ int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
10893+
10894+ remote_lkid = get_int(buf, ptr);
10895+
10896+ rsb_rmid = get_int(buf, ptr);
10897+ rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
10898+ GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
10899+
10900+ /*
10901+ * We could have received this lkb already from a previous recovery
10902+ * that was interrupted. If so, just return the lkid to the remote
10903+ * node.
10904+ */
10905+ lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
10906+ if (lkb)
10907+ goto put_lkid;
10908+
10909+ lkb = create_lkb(rsb->res_ls);
10910+ if (!lkb)
10911+ goto out;
10912+
10913+ lkb->lkb_remid = remote_lkid;
10914+ lkb->lkb_flags = get_int(buf, ptr);
10915+ status = get_int(buf, ptr);
10916+ lkb->lkb_rqmode = get_char(buf, ptr);
10917+ lkb->lkb_grmode = get_char(buf, ptr);
10918+ atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
10919+
10920+ parentid = get_int(buf, ptr);
10921+ lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
10922+
10923+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10924+ lkb->lkb_lvbptr = allocate_lvb(ls);
10925+ if (!lkb->lkb_lvbptr)
10926+ goto out;
10927+ get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
10928+ }
10929+
10930+ if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
10931+ uint64_t start, end;
10932+
10933+ /* Don't need to keep the range flag, for comms use only */
10934+ lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
10935+ start = get_int64(buf, ptr);
10936+ end = get_int64(buf, ptr);
10937+
10938+ lkb->lkb_range = allocate_range(rsb->res_ls);
10939+ if (!lkb->lkb_range)
10940+ goto out;
10941+
10942+ switch (status) {
10943+ case GDLM_LKSTS_CONVERT:
10944+ lkb->lkb_range[RQ_RANGE_START] = start;
10945+ lkb->lkb_range[RQ_RANGE_END] = end;
10946+ start = get_int64(buf, ptr);
10947+ end = get_int64(buf, ptr);
10948+ lkb->lkb_range[GR_RANGE_START] = start;
10949+ lkb->lkb_range[GR_RANGE_END] = end;
 10950+ break;
10951+ case GDLM_LKSTS_WAITING:
10952+ lkb->lkb_range[RQ_RANGE_START] = start;
10953+ lkb->lkb_range[RQ_RANGE_END] = end;
10954+ break;
10955+
10956+ case GDLM_LKSTS_GRANTED:
10957+ lkb->lkb_range[GR_RANGE_START] = start;
10958+ lkb->lkb_range[GR_RANGE_END] = end;
10959+ break;
10960+ default:
10961+ GDLM_ASSERT(0,);
10962+ }
10963+ }
10964+
10965+ /* Resolve local lock LKB address from parent ID */
10966+ if (parentid)
10967+ lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
10968+ parentid);
10969+
10970+ atomic_inc(&rsb->res_ref);
10971+ lkb->lkb_resource = rsb;
10972+
10973+ lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
10974+ lkb->lkb_nodeid = rem_nodeid;
10975+
10976+ /*
10977+ * Put the lkb on an RSB queue. An lkb that's in the midst of a
10978+ * conversion request (on the requesting node's lockqueue and has
10979+ * LQCONVERT set) should be put on the granted queue. The convert
10980+ * request will be resent by the requesting node.
10981+ */
10982+
10983+ if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
10984+ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
10985+ GDLM_ASSERT(status == GDLM_LKSTS_CONVERT,
10986+ printk("status=%d\n", status););
10987+ lkb->lkb_rqmode = DLM_LOCK_IV;
10988+ status = GDLM_LKSTS_GRANTED;
10989+ }
10990+
10991+ lkb_enqueue(rsb, lkb, status);
10992+
10993+ /*
10994+ * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
10995+ */
10996+
10997+ if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
10998+ && lkb->lkb_grmode > DLM_LOCK_NL) {
10999+ if (!rsb->res_lvbptr)
11000+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
11001+ if (!rsb->res_lvbptr)
11002+ goto out;
11003+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11004+ }
11005+
11006+ /*
11007+ * Clear flags that may have been sent over that are only relevant in
11008+ * the context of the sender.
11009+ */
11010+
11011+ lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11012+ GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
4bf12011 11013+
11014+ put_lkid:
11015+ /* Return the new LKID to the caller's buffer */
11016+ put_int(lkb->lkb_id, outbuf, outoffp);
11017+ put_int(lkb->lkb_remid, outbuf, outoffp);
11018+ error = 0;
11019+
11020+ out:
11021+ return error;
11022+}
11023+
11024+static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb,
11025+ char *buf, int *ptr)
11026+{
11027+ int length;
11028+ int remasterid;
11029+ int parent_remasterid;
11030+ char name[DLM_RESNAME_MAXLEN];
11031+ int error;
11032+ gd_res_t *parent = NULL;
11033+ gd_res_t *rsb;
11034+
11035+ get_bytes(name, &length, buf, ptr);
11036+ remasterid = get_int(buf, ptr);
11037+ parent_remasterid = get_int(buf, ptr);
11038+
11039+ if (parent_remasterid)
11040+ parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11041+
11042+ /*
11043+ * The rsb reference from this find_or_create_rsb() will keep the rsb
11044+ * around while we add new lkb's to it from deserialise_lkb. Each of
11045+ * the lkb's will add an rsb reference. The reference added here is
11046+ * removed by release_rsb() after all lkb's are added.
11047+ */
11048+
11049+ error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
11050+ GDLM_ASSERT(!error,);
11051+
11052+ /* There is a case where the above needs to create the RSB. */
11053+ if (rsb->res_nodeid == -1)
11054+ rsb->res_nodeid = our_nodeid();
11055+
11056+ rsb->res_remasterid = remasterid;
11057+
11058+ return rsb;
11059+}
11060+
11061+/*
11062+ * Processing at the receiving end of a NEWLOCKS message from a node in
11063+ * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11064+ * node whose locks we are now mastering. For a reply we need to send back the
11065+ * new lockids of the remastered locks so that remote ops can find them.
11066+ */
11067+
11068+int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11069+{
11070+ gd_rcom_t *rc;
11071+ gd_res_t *rsb = NULL;
11072+ rebuild_node_t *rnode;
11073+ char *outbuf;
11074+ int outptr, ptr = 0, error = -ENOMEM;
11075+
11076+ rnode = find_rebuild_root(ls, nodeid);
11077+ if (!rnode)
11078+ goto out;
11079+
11080+ /*
11081+ * Allocate a buffer for the reply message which is a list of remote
11082+ * lock IDs and their (new) local lock ids. It will always be big
11083+ * enough to fit <n> ID pairs if it already fit <n> LKBs.
11084+ */
11085+
11086+ rc = allocate_rcom_buffer(ls);
11087+ if (!rc)
11088+ goto out;
11089+ outbuf = rc->rc_buf;
11090+ outptr = 0;
11091+
11092+ /*
11093+ * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
11094+ * created. Each deserialise_rsb adds an rsb reference that must be
11095+ * removed with release_rsb once all new lkb's for an rsb have been
11096+ * added.
11097+ */
11098+
11099+ while (ptr < len) {
11100+ int type;
11101+
11102+ type = get_char(buf, &ptr);
11103+
11104+ switch (type) {
11105+ case REMASTER_ROOTRSB:
11106+ if (rsb)
11107+ release_rsb(rsb);
11108+ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11109+ &ptr);
11110+ rnode->rootrsb = rsb;
11111+ break;
11112+
11113+ case REMASTER_RSB:
11114+ if (rsb)
11115+ release_rsb(rsb);
11116+ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11117+ &ptr);
11118+ break;
11119+
11120+ case REMASTER_LKB:
11121+ deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
11122+ outbuf, &outptr);
11123+ break;
11124+
11125+ default:
11126+ GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
11127+ "len=%d\n", type, nodeid, ptr,
11128+ len););
11129+ }
11130+ }
11131+
11132+ if (rsb)
11133+ release_rsb(rsb);
11134+
11135+ /*
11136+ * Reply with the new lock IDs.
11137+ */
11138+
11139+ rc->rc_datalen = outptr;
11140+ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
11141+
11142+ free_rcom_buffer(rc);
11143+
11144+ out:
11145+ return error;
11146+}
11147+
11148+/*
11149+ * Processing for a NEWLOCKIDS message. Called when we get the reply from the
11150+ * new master telling us what the new remote lock IDs are for the remastered
11151+ * locks
11152+ */
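+/*
+ * The payload is a sequence of little-endian int pairs written by
+ * deserialise_lkb() on the new master: the master's new lock id followed by
+ * our original local lock id, 8 bytes per remastered lock.
+ */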
11153+
11154+int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11155+{
11156+ int offset = 0;
11157+
11158+ if (len == 1)
11159+ len = 0;
11160+
11161+ while (offset < len) {
11162+ int remote_id;
11163+ int local_id;
11164+ gd_lkb_t *lkb;
11165+
11166+ if (offset + 8 > len) {
11167+ log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
11168+ "length nodeid=%d offset=%d len=%d",
11169+ nodeid, offset, len);
11170+ break;
11171+ }
11172+
11173+ remote_id = get_int(buf, &offset);
11174+ local_id = get_int(buf, &offset);
11175+
11176+ lkb = find_lock_by_id(ls, local_id);
11177+ if (lkb) {
11178+ lkb->lkb_remid = remote_id;
11179+ have_new_lkid(lkb);
11180+ } else {
11181+ log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
11182+ "nodeid=%d id=%x remid=%x offset=%d len=%d",
11183+ nodeid, local_id, remote_id, offset, len);
11184+ }
11185+ }
11186+
11187+ if (recover_list_empty(ls))
11188+ wake_up(&ls->ls_wait_general);
11189+
11190+ return 0;
11191+}
11192diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
11193--- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11194+++ linux-patched/cluster/dlm/rebuild.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 11195@@ -0,0 +1,22 @@
11196+/******************************************************************************
11197+*******************************************************************************
11198+**
11199+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11200+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11201+**
11202+** This copyrighted material is made available to anyone wishing to use,
11203+** modify, copy, or redistribute it subject to the terms and conditions
11204+** of the GNU General Public License v.2.
11205+**
11206+*******************************************************************************
11207+******************************************************************************/
11208+
11209+#ifndef __REBUILD_DOT_H__
11210+#define __REBUILD_DOT_H__
11211+
11212+int rebuild_rsbs_send(gd_ls_t * ls);
11213+int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
11214+int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
 11215+void rebuild_freemem(gd_ls_t * ls);
11216+
11217+#endif /* __REBUILD_DOT_H__ */
11218diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
11219--- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11220+++ linux-patched/cluster/dlm/reccomms.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 11221@@ -0,0 +1,502 @@
11222+/******************************************************************************
11223+*******************************************************************************
11224+**
11225+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11226+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11227+**
11228+** This copyrighted material is made available to anyone wishing to use,
11229+** modify, copy, or redistribute it subject to the terms and conditions
11230+** of the GNU General Public License v.2.
11231+**
11232+*******************************************************************************
11233+******************************************************************************/
11234+
11235+#include "dlm_internal.h"
11236+#include "lowcomms.h"
11237+#include "midcomms.h"
11238+#include "reccomms.h"
11239+#include "nodes.h"
11240+#include "lockspace.h"
11241+#include "recover.h"
11242+#include "dir.h"
11243+#include "config.h"
11244+#include "rebuild.h"
11245+#include "memory.h"
11246+
11247+/* Running on the basis that only a single recovery communication will be done
11248+ * at a time per lockspace */
11249+
11250+static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc);
11251+
11252+/*
11253+ * Track per-node progress/stats during recovery to help debugging.
11254+ */
11255+
11256+void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send)
11257+{
11258+ gd_csb_t *csb;
11259+ int found = 0;
11260+
11261+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11262+ if (csb->csb_node->gn_nodeid == nodeid) {
11263+ found = TRUE;
11264+ break;
11265+ }
11266+ }
11267+
11268+ if (!found)
11269+ return;
11270+
11271+ if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
11272+ if (send) {
11273+ csb->csb_names_send_count++;
11274+ csb->csb_names_send_msgid = rc->rc_msgid;
11275+ } else {
11276+ csb->csb_names_recv_count++;
11277+ csb->csb_names_recv_msgid = rc->rc_msgid;
11278+ }
11279+ } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
11280+ if (send) {
11281+ csb->csb_locks_send_count++;
11282+ csb->csb_locks_send_msgid = rc->rc_msgid;
11283+ } else {
11284+ csb->csb_locks_recv_count++;
11285+ csb->csb_locks_recv_msgid = rc->rc_msgid;
11286+ }
11287+ }
11288+}
11289+
11290+void rcom_log_clear(gd_ls_t *ls)
11291+{
11292+ gd_csb_t *csb;
11293+
11294+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11295+ csb->csb_names_send_count = 0;
11296+ csb->csb_names_send_msgid = 0;
11297+ csb->csb_names_recv_count = 0;
11298+ csb->csb_names_recv_msgid = 0;
11299+ csb->csb_locks_send_count = 0;
11300+ csb->csb_locks_send_msgid = 0;
11301+ csb->csb_locks_recv_count = 0;
11302+ csb->csb_locks_recv_msgid = 0;
11303+ }
11304+}
11305+
11306+static int rcom_response(gd_ls_t *ls)
11307+{
11308+ return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11309+}
11310+
11311+/**
11312+ * rcom_send_message - send or request recovery data
11313+ * @ls: the lockspace
11314+ * @nodeid: node to which the message is sent
11315+ * @type: type of recovery message
11316+ * @rc: the rc buffer to send
11317+ * @need_reply: wait for reply if this is set
11318+ *
11319+ * Using this interface
11320+ * i) Allocate an rc buffer:
11321+ * rc = allocate_rcom_buffer(ls);
11322+ * ii) Copy data to send beginning at rc->rc_buf:
11323+ * memcpy(rc->rc_buf, mybuf, mylen);
11324+ * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
11325+ * rc->rc_datalen = mylen
11326+ * iv) Submit the rc to this function:
 11327+ * rcom_send_message(ls, nodeid, type, rc, need_reply);
11328+ *
11329+ * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t). If
11330+ * more data must be passed in one send, use rcom_expand_buffer() which
11331+ * incrementally increases the size of the rc buffer by dlm_config.buffer_size
11332+ * bytes.
11333+ *
 11334+ * Any data returned for the message (when need_reply is set) will be saved in
11335+ * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
11336+ * number of bytes copied into rc->rc_buf.
11337+ *
11338+ * Returns: 0 on success, -EXXX on failure
11339+ */
11340+
11341+int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc,
11342+ int need_reply)
11343+{
11344+ int error = 0;
11345+
11346+ if (!rc->rc_datalen)
11347+ rc->rc_datalen = 1;
11348+
11349+ /*
11350+ * Fill in the header.
11351+ */
11352+
11353+ rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
11354+ rc->rc_header.rh_lockspace = ls->ls_global_id;
11355+ rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1;
11356+ rc->rc_subcmd = type;
11357+ rc->rc_msgid = ++ls->ls_rcom_msgid;
11358+
11359+ rcom_log(ls, nodeid, rc, 1);
11360+
11361+ /*
11362+ * When a reply is received, the reply data goes back into this buffer.
11363+ * Synchronous rcom requests (need_reply=1) are serialised because of
11364+ * the single ls_rcom.
11365+ */
11366+
11367+ if (need_reply) {
11368+ down(&ls->ls_rcom_lock);
11369+ ls->ls_rcom = rc;
11370+ }
11371+
11372+ /*
11373+ * After sending the message we'll wait at the end of this function to
11374+ * get a reply. The READY flag will be set when the reply has been
11375+ * received and requested data has been copied into
11376+	 * ls->ls_rcom->rc_buf.
11377+ */
11378+
11379+ GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
11380+
11381+ /*
11382+ * The WAIT bit indicates that we're waiting for and willing to accept a
11383+ * reply. Any replies are ignored unless this bit is set.
11384+ */
11385+
11386+ set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11387+
11388+ /*
11389+ * Process the message locally.
11390+ */
11391+
11392+ if (nodeid == our_nodeid()) {
11393+ rcom_process_message(ls, nodeid, rc);
11394+ goto out;
11395+ }
11396+
11397+ /*
11398+ * Send the message.
11399+ */
11400+
11401+ log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
11402+
11403+ error = midcomms_send_message(nodeid, (struct gd_req_header *) rc,
11404+ GFP_KERNEL);
11405+ GDLM_ASSERT(error >= 0, printk("error = %d\n", error););
11406+ error = 0;
11407+
11408+ /*
11409+ * Wait for a reply. Once a reply is processed from midcomms, the
11410+ * READY bit will be set and we'll be awoken (gdlm_wait_function will
11411+ * return 0).
11412+ */
11413+
11414+ if (need_reply) {
11415+ error = gdlm_wait_function(ls, &rcom_response);
11416+ if (error)
11417+ log_debug(ls, "rcom wait error %d", error);
11418+ }
11419+
11420+ out:
11421+ clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11422+ clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11423+
11424+ if (need_reply)
11425+ up(&ls->ls_rcom_lock);
11426+
11427+ return error;
11428+}
11429+
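A minimal usage sketch of the rcom_send_message() interface documented above, following steps (i)-(iv). The caller name and the mybuf/mylen payload are hypothetical, RECCOMM_RECOVERNAMES is just one possible message type, and error handling is kept to the bare minimum.

/* Hypothetical caller illustrating steps (i)-(iv) above. */
static int example_rcom_request(gd_ls_t *ls, uint32_t nodeid,
				char *mybuf, int mylen)
{
	gd_rcom_t *rc;
	int error;

	rc = allocate_rcom_buffer(ls);		/* (i) allocate the rc */
	if (!rc)
		return -ENOMEM;

	/* mylen must fit in dlm_config.buffer_size - sizeof(gd_rcom_t) */
	memcpy(rc->rc_buf, mybuf, mylen);	/* (ii) copy the payload */
	rc->rc_datalen = mylen;			/* (iii) record its length */

	/* (iv) send and wait; the reply lands in rc->rc_buf / rc_datalen */
	error = rcom_send_message(ls, nodeid, RECCOMM_RECOVERNAMES, rc, 1);

	free_rcom_buffer(rc);
	return error;
}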
11430+/*
11431+ * Runs in same context as midcomms.
11432+ */
11433+
11434+static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc)
11435+{
11436+ gd_rcom_t rc_stack;
11437+ gd_rcom_t *reply = NULL;
11438+ gd_resdata_t *rd;
11439+ int status, datalen, maxlen;
11440+ uint32_t be_nodeid;
11441+
11442+ if (!ls)
11443+ return;
11444+
11445+ rcom_log(ls, nodeid, rc, 0);
11446+
11447+ if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
11448+ log_error(ls, "ignoring recovery message %x from %u",
11449+ rc->rc_subcmd, nodeid);
11450+ return;
11451+ }
11452+
11453+ switch (rc->rc_subcmd) {
11454+
11455+ case RECCOMM_STATUS:
11456+
11457+ memset(&rc_stack, 0, sizeof(gd_rcom_t));
11458+ reply = &rc_stack;
11459+
11460+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11461+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11462+ reply->rc_subcmd = rc->rc_subcmd;
11463+ reply->rc_msgid = rc->rc_msgid;
11464+ reply->rc_buf[0] = 0;
11465+
11466+ if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
11467+ reply->rc_buf[0] |= RESDIR_VALID;
11468+
11469+ if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
11470+ reply->rc_buf[0] |= RESDIR_ALL_VALID;
11471+
11472+ if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
11473+ reply->rc_buf[0] |= NODES_VALID;
11474+
11475+ if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
11476+ reply->rc_buf[0] |= NODES_ALL_VALID;
11477+
11478+ reply->rc_datalen = 1;
11479+ reply->rc_header.rh_length =
11480+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11481+
11482+ log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
11483+ break;
11484+
11485+ case RECCOMM_RECOVERNAMES:
11486+
11487+ reply = allocate_rcom_buffer(ls);
11488+ GDLM_ASSERT(reply,);
11489+ maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
11490+
11491+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11492+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11493+ reply->rc_subcmd = rc->rc_subcmd;
11494+ reply->rc_msgid = rc->rc_msgid;
11495+
11496+ /*
11497+ * The other node wants a bunch of resource names. The name of
11498+ * the resource to begin with is in rc->rc_buf.
11499+ */
11500+
11501+ datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
11502+ reply->rc_buf, maxlen, nodeid);
11503+
11504+ reply->rc_datalen = datalen;
11505+ reply->rc_header.rh_length =
11506+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11507+
11508+ log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
11509+ reply->rc_msgid);
11510+ break;
11511+
11512+ case RECCOMM_GETMASTER:
11513+
11514+ reply = allocate_rcom_buffer(ls);
11515+ GDLM_ASSERT(reply,);
11516+
11517+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11518+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11519+ reply->rc_subcmd = rc->rc_subcmd;
11520+ reply->rc_msgid = rc->rc_msgid;
11521+
11522+ /*
11523+ * The other node wants to know the master of a named resource.
11524+ */
11525+
11526+ status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen,
11527+ &rd, 1);
11528+ if (status != 0) {
11529+ free_rcom_buffer(reply);
11530+ reply = NULL;
11531+ return;
11532+ }
11533+ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
11534+ memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
11535+ reply->rc_datalen = sizeof(uint32_t);
11536+ reply->rc_header.rh_length =
11537+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11538+ break;
11539+
11540+ case RECCOMM_BULKLOOKUP:
11541+
11542+ reply = allocate_rcom_buffer(ls);
11543+ GDLM_ASSERT(reply,);
11544+
11545+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11546+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11547+ reply->rc_subcmd = rc->rc_subcmd;
11548+ reply->rc_msgid = rc->rc_msgid;
11549+
11550+ /*
11551+ * This is a bulk version of the above and just returns a
11552+		 * buffer full of node ids to match the resources.
11553+ */
11554+
11555+ datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
11556+ rc->rc_datalen, reply->rc_buf);
11557+ if (datalen < 0) {
11558+ free_rcom_buffer(reply);
11559+ reply = NULL;
11560+ return;
11561+ }
11562+
11563+ reply->rc_datalen = datalen;
11564+ reply->rc_header.rh_length =
11565+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11566+ break;
11567+
11568+ /*
11569+ * These RECCOMM messages don't need replies.
11570+ */
11571+
11572+ case RECCOMM_NEWLOCKS:
11573+ rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11574+ break;
11575+
11576+ case RECCOMM_NEWLOCKIDS:
11577+ rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11578+ break;
11579+
11580+ case RECCOMM_REMRESDATA:
11581+ remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1);
11582+ break;
11583+
11584+ default:
11585+ GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
11586+ }
11587+
11588+ if (reply) {
11589+ if (nodeid == our_nodeid()) {
11590+ GDLM_ASSERT(rc == ls->ls_rcom,);
11591+ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11592+ rc->rc_datalen = reply->rc_datalen;
11593+ } else {
11594+ midcomms_send_message(nodeid,
11595+ (struct gd_req_header *) reply,
11596+ GFP_KERNEL);
11597+ }
11598+
11599+ if (reply != &rc_stack)
11600+ free_rcom_buffer(reply);
11601+ }
11602+}
11603+
11604+static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11605+{
11606+ gd_rcom_t *rc = ls->ls_rcom;
11607+
11608+ if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
11609+ log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
11610+ return;
11611+ }
11612+
11613+ if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
11614+ log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
11615+ reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
11616+ return;
11617+ }
11618+
11619+ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11620+ rc->rc_datalen = reply->rc_datalen;
11621+
11622+ /*
11623+ * Tell the thread waiting in rcom_send_message() that it can go ahead.
11624+ */
11625+
11626+ set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11627+ wake_up(&ls->ls_wait_general);
11628+}
11629+
11630+static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11631+{
11632+ restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
11633+ reply->rc_msgid);
11634+}
11635+
11636+/*
11637+ * Runs in same context as midcomms.
11638+ */
11639+
11640+static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11641+{
11642+ if (gdlm_recovery_stopped(ls)) {
11643+ log_error(ls, "ignoring recovery reply %x from %u",
11644+ reply->rc_subcmd, nodeid);
11645+ return;
11646+ }
11647+
11648+ switch (reply->rc_subcmd) {
11649+ case RECCOMM_GETMASTER:
11650+ process_reply_async(ls, nodeid, reply);
11651+ break;
11652+ case RECCOMM_STATUS:
11653+ case RECCOMM_NEWLOCKS:
11654+ case RECCOMM_NEWLOCKIDS:
11655+ case RECCOMM_RECOVERNAMES:
11656+ process_reply_sync(ls, nodeid, reply);
11657+ break;
11658+ default:
11659+ log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
11660+ reply->rc_subcmd, nodeid);
11661+ }
11662+}
11663+
11664+
11665+static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header)
11666+{
11667+ struct writequeue_entry *wq;
11668+ gd_rcom_t *rc = (gd_rcom_t *) header;
11669+ gd_rcom_t *reply;
11670+
11671+ wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL,
11672+ (char **)&reply);
11673+ if (!wq)
11674+ return -ENOMEM;
11675+
11676+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11677+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11678+ reply->rc_subcmd = rc->rc_subcmd;
11679+ reply->rc_msgid = rc->rc_msgid;
11680+ reply->rc_buf[0] = 0;
11681+
11682+ reply->rc_datalen = 1;
11683+ reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11684+
11685+ midcomms_send_buffer((struct gd_req_header *)reply, wq);
11686+ return 0;
11687+}
11688+
11689+
11690+/*
11691+ * Runs in same context as midcomms. Both recovery requests and recovery
11692+ * replies come through this function.
11693+ */
11694+
11695+void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header)
11696+{
11697+ gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace);
11698+ gd_rcom_t *rc = (gd_rcom_t *) header;
11699+
11700+	/* If the lockspace doesn't exist then still send a status message
11701+	   back; it's possible that it just doesn't have its global_id
11702+	   yet. */
11703+ if (!ls) {
11704+ send_ls_not_ready(nodeid, header);
11705+ return;
11706+ }
11707+
11708+ switch (header->rh_cmd) {
11709+ case GDLM_REMCMD_RECOVERMESSAGE:
11710+ down_read(&ls->ls_rec_rsblist);
11711+ rcom_process_message(ls, nodeid, rc);
11712+ up_read(&ls->ls_rec_rsblist);
11713+ break;
11714+
11715+ case GDLM_REMCMD_RECOVERREPLY:
11716+ rcom_process_reply(ls, nodeid, rc);
11717+ break;
11718+
11719+ default:
11720+ GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
11721+ }
11722+}
11723+
11724diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
11725--- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11726+++ linux-patched/cluster/dlm/reccomms.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 11727@@ -0,0 +1,37 @@
11728+/******************************************************************************
11729+*******************************************************************************
11730+**
11731+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11732+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11733+**
11734+** This copyrighted material is made available to anyone wishing to use,
11735+** modify, copy, or redistribute it subject to the terms and conditions
11736+** of the GNU General Public License v.2.
11737+**
11738+*******************************************************************************
11739+******************************************************************************/
11740+
11741+#ifndef __RECCOMMS_DOT_H__
11742+#define __RECCOMMS_DOT_H__
11743+
11744+/* Bit flags */
11745+
11746+#define RESDIR_VALID (1)
11747+#define RESDIR_ALL_VALID (2)
11748+#define NODES_VALID (4)
11749+#define NODES_ALL_VALID (8)
11750+
11751+#define RECCOMM_STATUS (1)
11752+#define RECCOMM_RECOVERNAMES (2)
11753+#define RECCOMM_GETMASTER (3)
11754+#define RECCOMM_BULKLOOKUP (4)
11755+#define RECCOMM_NEWLOCKS (5)
11756+#define RECCOMM_NEWLOCKIDS (6)
11757+#define RECCOMM_REMRESDATA (7)
11758+
11759+int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc,
11760+ int need_reply);
11761+void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header);
11762+void rcom_log_clear(gd_ls_t *ls);
11763+
11764+#endif
11765diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
11766--- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11767+++ linux-patched/cluster/dlm/recover.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 11768@@ -0,0 +1,632 @@
11769+/******************************************************************************
11770+*******************************************************************************
11771+**
11772+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11773+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11774+**
11775+** This copyrighted material is made available to anyone wishing to use,
11776+** modify, copy, or redistribute it subject to the terms and conditions
11777+** of the GNU General Public License v.2.
11778+**
11779+*******************************************************************************
11780+******************************************************************************/
11781+
11782+#include "dlm_internal.h"
11783+#include "reccomms.h"
11784+#include "dir.h"
11785+#include "locking.h"
11786+#include "rsb.h"
11787+#include "lockspace.h"
11788+#include "lkb.h"
11789+#include "nodes.h"
11790+#include "config.h"
11791+#include "ast.h"
11792+#include "memory.h"
11793+
11794+/*
11795+ * Called in recovery routines to check whether the recovery process has been
11796+ * interrupted/stopped by another transition. A recovery in-process will abort
11797+ * if the lockspace is "stopped" so that a new recovery process can start from
11798+ * the beginning when the lockspace is "started" again.
11799+ */
11800+
11801+int gdlm_recovery_stopped(gd_ls_t *ls)
11802+{
11803+ return test_bit(LSFL_LS_STOP, &ls->ls_flags);
11804+}
11805+
11806+static void gdlm_wait_timer_fn(unsigned long data)
11807+{
11808+ gd_ls_t *ls = (gd_ls_t *) data;
11809+
11810+ wake_up(&ls->ls_wait_general);
11811+}
11812+
11813+/*
11814+ * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
11815+ * set due to failure of a node in ls_nodes). When another function thinks it
11816+ * could have completed the waited-on task, it should wake up ls_wait_general
11817+ * to get an immediate response rather than waiting for the timer to detect the
11818+ * result. A timer wakes us up periodically while waiting to see if we should
11819+ * abort due to a node failure.
11820+ */
11821+
11822+int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls))
11823+{
11824+ struct timer_list timer;
11825+ int error = 0;
11826+
11827+ init_timer(&timer);
11828+ timer.function = gdlm_wait_timer_fn;
11829+ timer.data = (long) ls;
11830+
11831+ for (;;) {
11832+ mod_timer(&timer, jiffies + (5 * HZ));
11833+
11834+ wchan_cond_sleep_intr(ls->ls_wait_general,
11835+ !testfn(ls) &&
11836+ !test_bit(LSFL_LS_STOP, &ls->ls_flags));
11837+
11838+ if (timer_pending(&timer))
11839+ del_timer(&timer);
11840+
11841+ if (testfn(ls))
11842+ break;
11843+
11844+ if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
11845+ error = -1;
11846+ break;
11847+ }
11848+ }
11849+
11850+ return error;
11851+}
11852+
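As a sketch of the wait pattern described above, a caller supplies a test function over the lockspace flags and blocks in gdlm_wait_function() until the condition becomes true or the lockspace is stopped. The function names below are hypothetical; the condition bit shown (LSFL_RECCOMM_READY) is the one reccomms.c actually waits on.

/* Hypothetical test function: done once the awaited reply has arrived. */
static int example_condition(gd_ls_t *ls)
{
	return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
}

/* Returns 0 once example_condition() is true, -1 if the lockspace stops;
 * other code wakes ls_wait_general to get the condition re-tested early. */
static int example_wait(gd_ls_t *ls)
{
	return gdlm_wait_function(ls, &example_condition);
}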
11853+int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status)
11854+{
11855+ gd_rcom_t rc_stack, *rc;
11856+ gd_csb_t *csb;
11857+ int status;
11858+ int error = 0;
11859+
11860+ memset(&rc_stack, 0, sizeof(gd_rcom_t));
11861+ rc = &rc_stack;
11862+ rc->rc_datalen = 0;
11863+
11864+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11865+ for (;;) {
11866+ error = gdlm_recovery_stopped(ls);
11867+ if (error)
11868+ goto out;
11869+
11870+ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
11871+ RECCOMM_STATUS, rc, 1);
11872+ if (error)
11873+ goto out;
11874+
11875+ status = rc->rc_buf[0];
11876+ if (status & wait_status)
11877+ break;
11878+ else {
11879+ set_current_state(TASK_INTERRUPTIBLE);
11880+ schedule_timeout(HZ >> 1);
11881+ }
11882+ }
11883+ }
11884+
11885+ out:
11886+ return error;
11887+}
11888+
11889+int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status)
11890+{
11891+ gd_rcom_t rc_stack, *rc;
11892+ uint32_t nodeid = ls->ls_low_nodeid;
11893+ int status;
11894+ int error = 0;
11895+
11896+ memset(&rc_stack, 0, sizeof(gd_rcom_t));
11897+ rc = &rc_stack;
11898+ rc->rc_datalen = 0;
11899+
11900+ for (;;) {
11901+ error = gdlm_recovery_stopped(ls);
11902+ if (error)
11903+ goto out;
11904+
11905+ error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
11906+ if (error)
11907+ break;
11908+
11909+ status = rc->rc_buf[0];
11910+ if (status & wait_status)
11911+ break;
11912+ else {
11913+ set_current_state(TASK_INTERRUPTIBLE);
11914+ schedule_timeout(HZ >> 1);
11915+ }
11916+ }
11917+
11918+ out:
11919+ return error;
11920+}
11921+
11922+static int purge_queue(gd_ls_t *ls, struct list_head *queue)
11923+{
11924+ gd_lkb_t *lkb, *safe;
11925+ gd_res_t *rsb;
11926+ int count = 0;
11927+
11928+ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
11929+ if (!lkb->lkb_nodeid)
11930+ continue;
11931+
11932+ GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
11933+
11934+ if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
11935+ list_del(&lkb->lkb_statequeue);
11936+
11937+ rsb = lkb->lkb_resource;
11938+			/* check the pre-purge status before clearing it */
11939+			if (lkb->lkb_status == GDLM_LKSTS_CONVERT)
11940+				remove_from_deadlockqueue(lkb);
11941+
11942+			lkb->lkb_status = 0;
11943+
11944+ release_lkb(ls, lkb);
11945+ release_rsb(rsb);
11946+ count++;
11947+ }
11948+ }
11949+
11950+ return count;
11951+}
11952+
11953+/*
11954+ * Go through local restbl and for each rsb we're master of, clear out any
11955+ * lkb's held by departed nodes.
11956+ */
11957+
11958+int restbl_lkb_purge(gd_ls_t *ls)
11959+{
11960+ struct list_head *tmp2, *safe2;
11961+ int count = 0;
11962+ gd_res_t *rootrsb, *safe, *rsb;
11963+
11964+ log_all(ls, "purge locks of departed nodes");
11965+
11966+ list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
11967+
11968+ rootrsb->res_resdir_seq = 1;
11969+
11970+ if (rootrsb->res_nodeid)
11971+ continue;
11972+
11973+ hold_rsb(rootrsb);
11974+ down_write(&rootrsb->res_lock);
11975+
11976+ /* This traverses the subreslist in reverse order so we purge
11977+ * the children before their parents. */
11978+
11979+ for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
11980+ tmp2 != &rootrsb->res_subreslist;
11981+ tmp2 = safe2, safe2 = safe2->prev) {
11982+ rsb = list_entry(tmp2, gd_res_t, res_subreslist);
11983+
11984+ hold_rsb(rsb);
11985+ purge_queue(ls, &rsb->res_grantqueue);
11986+ purge_queue(ls, &rsb->res_convertqueue);
11987+ purge_queue(ls, &rsb->res_waitqueue);
11988+ release_rsb(rsb);
11989+ }
11990+ count += purge_queue(ls, &rootrsb->res_grantqueue);
11991+ count += purge_queue(ls, &rootrsb->res_convertqueue);
11992+ count += purge_queue(ls, &rootrsb->res_waitqueue);
11993+
11994+ up_write(&rootrsb->res_lock);
11995+ release_rsb(rootrsb);
11996+ }
11997+
11998+ log_all(ls, "purged %d locks", count);
11999+
12000+ return 0;
12001+}
12002+
12003+/*
12004+ * Grant any locks that have become grantable after a purge
12005+ */
12006+
12007+int restbl_grant_after_purge(gd_ls_t *ls)
12008+{
12009+ gd_res_t *root, *rsb, *safe;
12010+ int error = 0;
12011+
12012+ down_write(&ls->ls_gap_rsblist);
12013+
12014+ list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12015+ /* only the rsb master grants locks */
12016+ if (root->res_nodeid)
12017+ continue;
12018+
12019+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12020+ log_debug(ls, "restbl_grant_after_purge aborted");
12021+ error = -EINTR;
12022+ up_write(&ls->ls_gap_rsblist);
12023+ goto out;
12024+ }
12025+
12026+ down_write(&root->res_lock);
12027+ grant_pending_locks(root);
12028+ up_write(&root->res_lock);
12029+
12030+ list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12031+ down_write(&rsb->res_lock);
12032+ grant_pending_locks(rsb);
12033+ up_write(&rsb->res_lock);
12034+ }
12035+ }
12036+ up_write(&ls->ls_gap_rsblist);
12037+ wake_astd();
12038+ out:
12039+ return error;
12040+}
12041+
12042+/*
12043+ * Set the lock master for all LKBs in a lock queue
12044+ */
12045+
12046+static void set_lock_master(struct list_head *queue, int nodeid)
12047+{
12048+ gd_lkb_t *lkb;
12049+
12050+ list_for_each_entry(lkb, queue, lkb_statequeue) {
12051+		/* Don't muck around with pre-existing sublocks */
12052+ if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12053+ lkb->lkb_nodeid = nodeid;
12054+ }
12055+}
12056+
12057+static void set_master_lkbs(gd_res_t *rsb)
12058+{
12059+ set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12060+ set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12061+ set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12062+}
12063+
12064+/*
12065+ * This rsb struct is now the master, so it is responsible for keeping the
12066+ * latest lvb. Find whether any current lkb's have an up-to-date copy of the
12067+ * lvb to be used as the rsb's copy. An equivalent step occurs as new lkb's
12068+ * arrive for this rsb in deserialise_lkb.
12069+ */
12070+
12071+static void set_rsb_lvb(gd_res_t *rsb)
12072+{
12073+ gd_lkb_t *lkb;
12074+
12075+ list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12076+
12077+ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12078+ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12079+ (lkb->lkb_grmode > DLM_LOCK_NL))
12080+ {
12081+ if (!rsb->res_lvbptr)
12082+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12083+
12084+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12085+ return;
12086+ }
12087+ }
12088+
12089+ list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12090+
12091+ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12092+ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12093+ (lkb->lkb_grmode > DLM_LOCK_NL))
12094+ {
12095+ if (!rsb->res_lvbptr)
12096+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12097+
12098+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12099+ return;
12100+ }
12101+ }
12102+}
12103+
12104+/*
12105+ * Propagate the new master nodeid to locks, subrsbs, sublocks.
12106+ * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12107+ */
12108+
12109+static void set_new_master(gd_res_t *rsb)
12110+{
12111+ gd_res_t *subrsb;
12112+
12113+ down_write(&rsb->res_lock);
12114+
12115+ if (rsb->res_nodeid == our_nodeid()) {
12116+ rsb->res_nodeid = 0;
12117+ set_rsb_lvb(rsb);
12118+ }
12119+
12120+ set_master_lkbs(rsb);
12121+
12122+ list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
12123+ subrsb->res_nodeid = rsb->res_nodeid;
12124+ set_master_lkbs(subrsb);
12125+ }
12126+
12127+ up_write(&rsb->res_lock);
12128+
12129+ set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
12130+}
12131+
12132+/*
12133+ * The recover_list contains all the rsb's for which we've requested the new
12134+ * master nodeid. As replies are returned from the resource directories the
12135+ * rsb's are removed from the list. When the list is empty we're done.
12136+ *
12137+ * The recover_list is later similarly used for all rsb's for which we've sent
12138+ * new lkb's and need to receive new corresponding lkid's.
12139+ */
12140+
12141+int recover_list_empty(gd_ls_t *ls)
12142+{
12143+ int empty;
12144+
12145+ spin_lock(&ls->ls_recover_list_lock);
12146+ empty = list_empty(&ls->ls_recover_list);
12147+ spin_unlock(&ls->ls_recover_list_lock);
12148+
12149+ return empty;
12150+}
12151+
12152+int recover_list_count(gd_ls_t *ls)
12153+{
12154+ int count;
12155+
12156+ spin_lock(&ls->ls_recover_list_lock);
12157+ count = ls->ls_recover_list_count;
12158+ spin_unlock(&ls->ls_recover_list_lock);
12159+
12160+ return count;
12161+}
12162+
12163+void recover_list_add(gd_res_t *rsb)
12164+{
12165+ gd_ls_t *ls = rsb->res_ls;
12166+
12167+ spin_lock(&ls->ls_recover_list_lock);
12168+ if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
12169+ list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
12170+ ls->ls_recover_list_count++;
12171+ hold_rsb(rsb);
12172+ }
12173+ spin_unlock(&ls->ls_recover_list_lock);
12174+}
12175+
12176+void recover_list_del(gd_res_t *rsb)
12177+{
12178+ gd_ls_t *ls = rsb->res_ls;
12179+
12180+ spin_lock(&ls->ls_recover_list_lock);
12181+ clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
12182+ list_del(&rsb->res_recover_list);
12183+ ls->ls_recover_list_count--;
12184+ spin_unlock(&ls->ls_recover_list_lock);
12185+
12186+ release_rsb(rsb);
12187+}
12188+
12189+static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid)
12190+{
12191+ gd_res_t *rsb = NULL;
12192+
12193+ spin_lock(&ls->ls_recover_list_lock);
12194+
12195+ list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
12196+ if (rsb->res_recover_msgid == msgid)
12197+ goto rec_found;
12198+ }
12199+ rsb = NULL;
12200+
12201+ rec_found:
12202+ spin_unlock(&ls->ls_recover_list_lock);
12203+ return rsb;
12204+}
12205+
12206+#if 0
12207+static void recover_list_clear(gd_ls_t *ls)
12208+{
12209+ gd_res_t *rsb;
12210+
12211+
12212+ spin_lock(&ls->ls_recover_list_lock);
12213+
12214+ while (!list_empty(&ls->ls_recover_list)) {
12215+ rsb = list_entry(ls->ls_recover_list.next, gd_res_t,
12216+ res_recover_list);
12217+ list_del(&rsb->res_recover_list);
12218+ ls->ls_recover_list_count--;
12219+ }
12220+ spin_unlock(&ls->ls_recover_list_lock);
12221+
12222+}
12223+#endif
12224+
12225+#if 0
12226+void recover_list_dump(gd_ls_t *ls)
12227+{
12228+ struct list_head *tmp;
12229+ gd_res_t *rsb;
12230+
12231+ spin_lock(&ls->ls_recover_list_lock);
12232+
12233+ printk("recover_list_count=%d\n", ls->ls_recover_list_count);
12234+
12235+ list_for_each(tmp, &ls->ls_recover_list) {
12236+ rsb = list_entry(tmp, gd_res_t, res_recover_list);
12237+ gdlm_res_dbprint(rsb);
12238+ }
12239+ spin_unlock(&ls->ls_recover_list_lock);
12240+}
12241+#endif
12242+
12243+static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc)
12244+{
12245+ gd_ls_t *ls = rsb->res_ls;
12246+ gd_resdata_t *rd;
12247+ uint32_t dir_nodeid;
12248+ int error;
12249+
12250+ dir_nodeid = get_directory_nodeid(rsb);
12251+
12252+ if (dir_nodeid == our_nodeid()) {
12253+ error = get_resdata(ls, dir_nodeid, rsb->res_name,
12254+ rsb->res_length, &rd, 1);
12255+ if (error)
12256+ goto fail;
12257+
12258+ rsb->res_nodeid = rd->rd_master_nodeid;
12259+ set_new_master(rsb);
12260+ } else {
12261+		/* As we are the only thread doing recovery this
12262+		   should be safe; if not, then we need to use a different
12263+		   ID somehow. We must set it in the rsb before
12264+		   rcom_send_message() completes because we may get a reply
12265+		   quite quickly. */
12266+ rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
12267+
12268+ recover_list_add(rsb);
12269+
12270+ memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
12271+ rc->rc_datalen = rsb->res_length;
12272+
12273+ error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
12274+ rc, 0);
12275+ if (error)
12276+ goto fail;
12277+ }
12278+
12279+ fail:
12280+ return error;
12281+}
12282+
12283+/*
12284+ * Go through local root resources and for each rsb which has a master which
12285+ * has departed, get the new master nodeid from the resdir. The resdir will
12286+ * assign mastery to the first node to look up the new master. That means
12287+ * we'll discover in this lookup if we're the new master of any rsb's.
12288+ *
12289+ * We fire off all the resdir requests individually and asynchronously to the
12290+ * correct resdir node. The replies are processed in rsb_master_recv().
12291+ */
12292+
12293+int restbl_rsb_update(gd_ls_t *ls)
12294+{
12295+ gd_res_t *rsb, *safe;
12296+ gd_rcom_t *rc;
12297+ int error = -ENOMEM;
12298+ int count = 0;
12299+
12300+ log_all(ls, "update remastered resources");
12301+
12302+ rc = allocate_rcom_buffer(ls);
12303+ if (!rc)
12304+ goto out;
12305+
12306+ list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
12307+ if (!rsb->res_nodeid)
12308+ continue;
12309+
12310+ error = gdlm_recovery_stopped(ls);
12311+ if (error)
12312+ goto out_free;
12313+
12314+ if (in_nodes_gone(ls, rsb->res_nodeid)) {
12315+ error = rsb_master_lookup(rsb, rc);
12316+ if (error)
12317+ goto out_free;
12318+ count++;
12319+ }
12320+ }
12321+
12322+ error = gdlm_wait_function(ls, &recover_list_empty);
12323+
12324+ log_all(ls, "updated %d resources", count);
12325+
12326+ out_free:
12327+ free_rcom_buffer(rc);
12328+
12329+ out:
12330+ return error;
12331+}
12332+
12333+int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length,
12334+ int msgid)
12335+{
12336+ gd_res_t *rsb;
12337+ uint32_t be_nodeid;
12338+
12339+ rsb = recover_list_find(ls, msgid);
12340+ if (!rsb) {
12341+ log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
12342+ goto out;
12343+ }
12344+
12345+ memcpy(&be_nodeid, buf, sizeof(uint32_t));
12346+ rsb->res_nodeid = be32_to_cpu(be_nodeid);
12347+ set_new_master(rsb);
12348+ recover_list_del(rsb);
12349+
12350+ if (recover_list_empty(ls))
12351+ wake_up(&ls->ls_wait_general);
12352+
12353+ out:
12354+ return 0;
12355+}
12356+
12357+/*
12358+ * This function is no longer used.
12359+ */
12360+
12361+int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen,
12362+ char *outbuf)
12363+{
12364+ char *inbufptr, *outbufptr;
12365+
12366+ /*
12367+ * The other node wants nodeids matching the resource names in inbuf.
12368+ * The resource names are packed into inbuf as
12369+ * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
12370+ * lenX bytes. Matching nodeids are packed into outbuf in order
12371+ * [nodeid1][nodeid2]...
12372+ */
12373+
12374+ inbufptr = inbuf;
12375+ outbufptr = outbuf;
12376+
12377+ while (inbufptr < inbuf + inlen) {
12378+ gd_resdata_t *rd;
12379+ uint32_t be_nodeid;
12380+ int status;
12381+
12382+ status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr,
12383+ &rd, 1);
12384+ if (status != 0)
12385+ goto fail;
12386+
12387+ inbufptr += *inbufptr + 1;
12388+
12389+ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
12390+ memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
12391+ outbufptr += sizeof(uint32_t);
12392+
12393+ /* add assertion that outbufptr - outbuf is not > than ... */
12394+ }
12395+
12396+ return (outbufptr - outbuf);
12397+
12398+ fail:
12399+ return -1;
12400+}
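Although the bulk lookup path above is marked as no longer used, the [len1][name1][len2][name2]... layout it consumes illustrates how such recovery payloads are packed: one length byte followed by that many name bytes, repeated. The helpers and resource names below are hypothetical, for illustration only.

/* Pack one name as a single length byte followed by the name bytes. */
static int example_pack_name(char *buf, char *name, uint8_t len)
{
	buf[0] = len;
	memcpy(buf + 1, name, len);
	return len + 1;			/* bytes consumed in buf */
}

/* Pack two made-up resource names; the result would become rc->rc_datalen. */
static int example_pack_lookup(char *buf)
{
	int offset = 0;

	offset += example_pack_name(buf + offset, "resourceA", 9);
	offset += example_pack_name(buf + offset, "resourceB", 9);
	return offset;
}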
12401diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
12402--- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12403+++ linux-patched/cluster/dlm/recover.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 12404@@ -0,0 +1,34 @@
12405+/******************************************************************************
12406+*******************************************************************************
12407+**
12408+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12409+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12410+**
12411+** This copyrighted material is made available to anyone wishing to use,
12412+** modify, copy, or redistribute it subject to the terms and conditions
12413+** of the GNU General Public License v.2.
12414+**
12415+*******************************************************************************
12416+******************************************************************************/
12417+
12418+#ifndef __RECOVER_DOT_H__
12419+#define __RECOVER_DOT_H__
12420+
12421+int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls));
12422+int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status);
12423+int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status);
12424+int gdlm_recovery_stopped(gd_ls_t * ls);
12425+int recover_list_empty(gd_ls_t * ls);
12426+int recover_list_count(gd_ls_t * ls);
12427+void recover_list_add(gd_res_t * rsb);
12428+void recover_list_del(gd_res_t * rsb);
12429+void recover_list_dump(gd_ls_t * ls);
12430+int restbl_lkb_purge(gd_ls_t * ls);
12431+void restbl_grant_after_purge(gd_ls_t * ls);
12432+int restbl_rsb_update(gd_ls_t * ls);
12433+int restbl_rsb_update_recv(gd_ls_t * ls, int nodeid, char *buf, int len,
12434+ int msgid);
12435+int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen,
12436+ char *outbuf);
12437+
12438+#endif /* __RECOVER_DOT_H__ */
12439diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
12440--- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12441+++ linux-patched/cluster/dlm/recoverd.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 12442@@ -0,0 +1,692 @@
12443+/******************************************************************************
12444+*******************************************************************************
12445+**
12446+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12447+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12448+**
12449+** This copyrighted material is made available to anyone wishing to use,
12450+** modify, copy, or redistribute it subject to the terms and conditions
12451+** of the GNU General Public License v.2.
12452+**
12453+*******************************************************************************
12454+******************************************************************************/
12455+
12456+#include "dlm_internal.h"
12457+#include "nodes.h"
12458+#include "dir.h"
12459+#include "ast.h"
12460+#include "recover.h"
12461+#include "lockspace.h"
12462+#include "lowcomms.h"
12463+#include "lockqueue.h"
12464+#include "lkb.h"
12465+#include "rebuild.h"
12466+
12467+/*
12468+ * next_move actions
12469+ */
12470+
12471+#define DO_STOP (1)
12472+#define DO_START (2)
12473+#define DO_FINISH (3)
12474+#define DO_FINISH_STOP (4)
12475+#define DO_FINISH_START (5)
12476+
12477+/*
12478+ * recoverd_flags for thread
12479+ */
12480+
12481+#define THREAD_STOP (0)
12482+
12483+/*
12484+ * local thread variables
12485+ */
12486+
12487+static unsigned long recoverd_flags;
12488+static struct completion recoverd_run;
12489+static wait_queue_head_t recoverd_wait;
12490+static struct task_struct *recoverd_task;
12491+
12492+/*
12493+ * Queue of lockspaces (gd_recover_t structs) which need to be
12494+ * started/recovered
12495+ */
12496+
12497+static struct list_head recoverd_start_queue;
12498+static atomic_t recoverd_start_count;
12499+
12500+extern struct list_head lslist;
12501+extern spinlock_t lslist_lock;
12502+
12503+void dlm_recoverd_init(void)
12504+{
12505+ INIT_LIST_HEAD(&recoverd_start_queue);
12506+ atomic_set(&recoverd_start_count, 0);
12507+
12508+ init_completion(&recoverd_run);
12509+ init_waitqueue_head(&recoverd_wait);
12510+ memset(&recoverd_flags, 0, sizeof(unsigned long));
12511+}
12512+
12513+static int enable_locking(gd_ls_t *ls, int event_id)
12514+{
12515+ int error = 0;
12516+
12517+ spin_lock(&ls->ls_recover_lock);
12518+ if (ls->ls_last_stop < event_id) {
12519+ set_bit(LSFL_LS_RUN, &ls->ls_flags);
12520+ up_write(&ls->ls_in_recovery);
12521+ } else {
12522+ error = -EINTR;
12523+ log_debug(ls, "enable_locking: abort %d", event_id);
12524+ }
12525+ spin_unlock(&ls->ls_recover_lock);
12526+ return error;
12527+}
12528+
12529+static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr)
12530+{
12531+ int error;
12532+
12533+ log_all(ls, "recover event %u (first)", gr->gr_event_id);
12534+
12535+ kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
12536+
12537+ error = ls_nodes_init(ls, gr);
12538+ if (error) {
12539+ log_error(ls, "nodes_init failed %d", error);
12540+ goto out;
12541+ }
12542+
12543+ error = resdir_rebuild_local(ls);
12544+ if (error) {
12545+ log_error(ls, "resdir_rebuild_local failed %d", error);
12546+ goto out;
12547+ }
12548+
12549+ error = resdir_rebuild_wait(ls);
12550+ if (error) {
12551+ log_error(ls, "resdir_rebuild_wait failed %d", error);
12552+ goto out;
12553+ }
12554+
12555+ log_all(ls, "recover event %u done", gr->gr_event_id);
12556+ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12557+
12558+ out:
12559+ return error;
12560+}
12561+
12562+/*
12563+ * We are given here a new group of nodes which are in the lockspace. We first
12564+ * figure out the differences in ls membership from when we were last running.
12565+ * If nodes from before are gone, then there will be some lock recovery to do.
12566+ * If there are only nodes which have joined, then there's no lock recovery.
12567+ *
12568+ * note: cman requires an rc to finish starting on an revent (where nodes die)
12569+ * before it allows an sevent (where nodes join) to be processed. This means
12570+ * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
12571+ * joined.
12572+ */
12573+
12574+static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr)
12575+{
12576+ int error, neg = 0;
12577+
12578+ log_all(ls, "recover event %u", gr->gr_event_id);
12579+
12580+ /*
12581+ * Add or remove nodes from the lockspace's ls_nodes list.
12582+ */
12583+
12584+ error = ls_nodes_reconfig(ls, gr, &neg);
12585+ if (error) {
12586+ log_error(ls, "nodes_reconfig failed %d", error);
12587+ goto fail;
12588+ }
12589+
12590+ /*
12591+ * Rebuild our own share of the resdir by collecting from all other
12592+ * nodes rsb name/master pairs for which the name hashes to us.
12593+ */
12594+
12595+ error = resdir_rebuild_local(ls);
12596+ if (error) {
12597+ log_error(ls, "resdir_rebuild_local failed %d", error);
12598+ goto fail;
12599+ }
12600+
12601+ /*
12602+ * Purge resdir-related requests that are being held in requestqueue.
12603+ * All resdir requests from before recovery started are invalid now due
12604+ * to the resdir rebuild and will be resent by the requesting nodes.
12605+ */
12606+
12607+ purge_requestqueue(ls);
12608+ set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12609+
12610+ /*
12611+ * Wait for all nodes to complete resdir rebuild.
12612+ */
12613+
12614+ error = resdir_rebuild_wait(ls);
12615+ if (error) {
12616+ log_error(ls, "resdir_rebuild_wait failed %d", error);
12617+ goto fail;
12618+ }
12619+
12620+ /*
12621+ * Mark our own lkb's waiting in the lockqueue for remote replies from
12622+ * nodes that are now departed. These will be resent to the new
12623+ * masters in resend_cluster_requests. Also mark resdir lookup
12624+ * requests for resending.
12625+ */
12626+
12627+ lockqueue_lkb_mark(ls);
12628+
12629+ error = gdlm_recovery_stopped(ls);
12630+ if (error)
12631+ goto fail;
12632+
12633+ if (neg) {
12634+ /*
12635+ * Clear lkb's for departed nodes. This can't fail since it
12636+ * doesn't involve communicating with other nodes.
12637+ */
12638+
12639+ down_write(&ls->ls_rec_rsblist);
12640+ restbl_lkb_purge(ls);
12641+ up_write(&ls->ls_rec_rsblist);
12642+
12643+ down_read(&ls->ls_rec_rsblist);
12644+
12645+ /*
12646+ * Get new master id's for rsb's of departed nodes. This fails
12647+ * if we can't communicate with other nodes.
12648+ */
12649+
12650+ error = restbl_rsb_update(ls);
12651+ if (error) {
12652+ log_error(ls, "restbl_rsb_update failed %d", error);
12653+ goto fail_up;
12654+ }
12655+
12656+ /*
12657+ * Send our lkb info to new masters. This fails if we can't
12658+ * communicate with a node.
12659+ */
12660+
12661+ error = rebuild_rsbs_send(ls);
12662+ if (error) {
12663+ log_error(ls, "rebuild_rsbs_send failed %d", error);
12664+ goto fail_up;
12665+ }
12666+ up_read(&ls->ls_rec_rsblist);
12667+ }
12668+
12669+ clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12670+
12671+ log_all(ls, "recover event %u done", gr->gr_event_id);
12672+ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12673+ return 0;
12674+
12675+ fail_up:
12676+ up_read(&ls->ls_rec_rsblist);
12677+ fail:
12678+ log_all(ls, "recover event %d error %d", gr->gr_event_id, error);
12679+ return error;
12680+}
12681+
12682+static void clear_finished_nodes(gd_ls_t *ls, int finish_event)
12683+{
12684+ gd_csb_t *csb, *safe;
12685+
12686+ list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) {
12687+ if (csb->csb_gone_event <= finish_event) {
12688+ list_del(&csb->csb_list);
12689+ release_csb(csb);
12690+ }
12691+ }
12692+}
12693+
12694+/*
12695+ * Between calls to this routine for a ls, there can be multiple stop/start
12696+ * events from cman where every start but the latest is cancelled by stops.
12697+ * There can only be a single finish from cman because every finish requires us
12698+ * to call start_done. A single finish event could be followed by multiple
12699+ * stop/start events. This routine takes any combination of events from cman
12700+ * and boils them down to one course of action.
12701+ */
12702+
12703+int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out)
12704+{
12705+ LIST_HEAD(events);
12706+ unsigned int cmd = 0, stop, start, finish;
12707+ unsigned int last_stop, last_start, last_finish;
12708+ gd_recover_t *gr = NULL, *start_gr = NULL;
12709+
12710+ /*
12711+ * Grab the current state of cman/sm events.
12712+ */
12713+
12714+ spin_lock(&ls->ls_recover_lock);
12715+
12716+ stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
12717+ start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
12718+ finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
12719+
12720+ last_stop = ls->ls_last_stop;
12721+ last_start = ls->ls_last_start;
12722+ last_finish = ls->ls_last_finish;
12723+
12724+ while (!list_empty(&ls->ls_recover)) {
12725+ gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list);
12726+ list_del(&gr->gr_list);
12727+ list_add_tail(&gr->gr_list, &events);
12728+ }
12729+ spin_unlock(&ls->ls_recover_lock);
12730+
12731+ log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
12732+ last_stop, last_start, last_finish);
12733+
12734+ /*
12735+ * Toss start events which have since been cancelled.
12736+ */
12737+
12738+ while (!list_empty(&events)) {
12739+ GDLM_ASSERT(start,);
12740+ gr = list_entry(events.next, gd_recover_t, gr_list);
12741+ list_del(&gr->gr_list);
12742+
12743+ if (gr->gr_event_id <= last_stop) {
12744+ log_debug(ls, "move skip event %u", gr->gr_event_id);
12745+ kfree(gr->gr_nodeids);
12746+ free_dlm_recover(gr);
12747+ gr = NULL;
12748+ } else {
12749+ log_debug(ls, "move use event %u", gr->gr_event_id);
12750+ GDLM_ASSERT(!start_gr,);
12751+ start_gr = gr;
12752+ }
12753+ }
12754+
12755+ /*
12756+ * Eight possible combinations of events.
12757+ */
12758+
12759+ /* 0 */
12760+ if (!stop && !start && !finish) {
12761+ GDLM_ASSERT(!start_gr,);
12762+ cmd = 0;
12763+ goto out;
12764+ }
12765+
12766+ /* 1 */
12767+ if (!stop && !start && finish) {
12768+ GDLM_ASSERT(!start_gr,);
12769+ GDLM_ASSERT(last_start > last_stop,);
12770+ GDLM_ASSERT(last_finish == last_start,);
12771+ cmd = DO_FINISH;
12772+ *finish_out = last_finish;
12773+ goto out;
12774+ }
12775+
12776+ /* 2 */
12777+ if (!stop && start && !finish) {
12778+ GDLM_ASSERT(start_gr,);
12779+ GDLM_ASSERT(last_start > last_stop,);
12780+ cmd = DO_START;
12781+ *gr_out = start_gr;
12782+ goto out;
12783+ }
12784+
12785+ /* 3 */
12786+ if (!stop && start && finish) {
12787+ GDLM_ASSERT(0, printk("finish and start with no stop\n"););
12788+ }
12789+
12790+ /* 4 */
12791+ if (stop && !start && !finish) {
12792+ GDLM_ASSERT(!start_gr,);
12793+ GDLM_ASSERT(last_start == last_stop,);
12794+ cmd = DO_STOP;
12795+ goto out;
12796+ }
12797+
12798+ /* 5 */
12799+ if (stop && !start && finish) {
12800+ GDLM_ASSERT(!start_gr,);
12801+ GDLM_ASSERT(last_finish == last_start,);
12802+ GDLM_ASSERT(last_stop == last_start,);
12803+ cmd = DO_FINISH_STOP;
12804+ *finish_out = last_finish;
12805+ goto out;
12806+ }
12807+
12808+ /* 6 */
12809+ if (stop && start && !finish) {
12810+ if (start_gr) {
12811+ GDLM_ASSERT(last_start > last_stop,);
12812+ cmd = DO_START;
12813+ *gr_out = start_gr;
12814+ } else {
12815+ GDLM_ASSERT(last_stop == last_start,);
12816+ cmd = DO_STOP;
12817+ }
12818+ goto out;
12819+ }
12820+
12821+ /* 7 */
12822+ if (stop && start && finish) {
12823+ if (start_gr) {
12824+ GDLM_ASSERT(last_start > last_stop,);
12825+ GDLM_ASSERT(last_start > last_finish,);
12826+ cmd = DO_FINISH_START;
12827+ *finish_out = last_finish;
12828+ *gr_out = start_gr;
12829+ } else {
12830+ GDLM_ASSERT(last_start == last_stop,);
12831+ GDLM_ASSERT(last_start > last_finish,);
12832+ cmd = DO_FINISH_STOP;
12833+ *finish_out = last_finish;
12834+ }
12835+ goto out;
12836+ }
12837+
12838+ out:
12839+ return cmd;
12840+}
12841+
12842+/*
12843+ * This function decides what to do given every combination of current
12844+ * lockspace state and next lockspace state.
12845+ */
12846+
12847+static void do_ls_recovery(gd_ls_t *ls)
12848+{
12849+ gd_recover_t *gr = NULL;
12850+ int error, cur_state, next_state = 0, do_now, finish_event = 0;
12851+
12852+ do_now = next_move(ls, &gr, &finish_event);
12853+ if (!do_now)
12854+ goto out;
12855+
12856+ cur_state = ls->ls_state;
12857+ next_state = 0;
12858+
12859+ GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
12860+ log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
12861+
12862+ /*
12863+ * LSST_CLEAR - we're not in any recovery state. We can get a stop or
12864+ * a stop and start which equates with a START.
12865+ */
12866+
12867+ if (cur_state == LSST_CLEAR) {
12868+ switch (do_now) {
12869+ case DO_STOP:
12870+ next_state = LSST_WAIT_START;
12871+ break;
12872+
12873+ case DO_START:
12874+ error = ls_reconfig(ls, gr);
12875+ if (error)
12876+ next_state = LSST_WAIT_START;
12877+ else
12878+ next_state = LSST_RECONFIG_DONE;
12879+ break;
12880+
12881+ case DO_FINISH: /* invalid */
12882+ case DO_FINISH_STOP: /* invalid */
12883+ case DO_FINISH_START: /* invalid */
12884+ default:
12885+ GDLM_ASSERT(0,);
12886+ }
12887+ goto out;
12888+ }
12889+
12890+ /*
12891+ * LSST_WAIT_START - we're not running because of getting a stop or
12892+ * failing a start. We wait in this state for another stop/start or
12893+ * just the next start to begin another reconfig attempt.
12894+ */
12895+
12896+ if (cur_state == LSST_WAIT_START) {
12897+ switch (do_now) {
12898+ case DO_STOP:
12899+ break;
12900+
12901+ case DO_START:
12902+ error = ls_reconfig(ls, gr);
12903+ if (error)
12904+ next_state = LSST_WAIT_START;
12905+ else
12906+ next_state = LSST_RECONFIG_DONE;
12907+ break;
12908+
12909+ case DO_FINISH: /* invalid */
12910+ case DO_FINISH_STOP: /* invalid */
12911+ case DO_FINISH_START: /* invalid */
12912+ default:
12913+ GDLM_ASSERT(0,);
12914+ }
12915+ goto out;
12916+ }
12917+
12918+ /*
12919+ * LSST_RECONFIG_DONE - we entered this state after successfully
12920+ * completing ls_reconfig and calling kcl_start_done. We expect to get
12921+ * a finish if everything goes ok. A finish could be followed by stop
12922+ * or stop/start before we get here to check it. Or a finish may never
12923+ * happen, only stop or stop/start.
12924+ */
12925+
12926+ if (cur_state == LSST_RECONFIG_DONE) {
12927+ switch (do_now) {
12928+ case DO_FINISH:
12929+ clear_finished_nodes(ls, finish_event);
12930+ next_state = LSST_CLEAR;
12931+
12932+ error = enable_locking(ls, finish_event);
12933+ if (error)
12934+ break;
12935+
12936+ error = process_requestqueue(ls);
12937+ if (error)
12938+ break;
12939+
12940+ error = resend_cluster_requests(ls);
12941+ if (error)
12942+ break;
12943+
12944+ restbl_grant_after_purge(ls);
12945+
12946+ log_all(ls, "recover event %u finished", finish_event);
12947+ break;
12948+
12949+ case DO_STOP:
12950+ next_state = LSST_WAIT_START;
12951+ break;
12952+
12953+ case DO_FINISH_STOP:
12954+ clear_finished_nodes(ls, finish_event);
12955+ next_state = LSST_WAIT_START;
12956+ break;
12957+
12958+ case DO_FINISH_START:
12959+ clear_finished_nodes(ls, finish_event);
12960+ /* fall into DO_START */
12961+
12962+ case DO_START:
12963+ error = ls_reconfig(ls, gr);
12964+ if (error)
12965+ next_state = LSST_WAIT_START;
12966+ else
12967+ next_state = LSST_RECONFIG_DONE;
12968+ break;
12969+
12970+ default:
12971+ GDLM_ASSERT(0,);
12972+ }
12973+ goto out;
12974+ }
12975+
12976+ /*
12977+ * LSST_INIT - state after ls is created and before it has been
12978+ * started. A start operation will cause the ls to be started for the
12979+ * first time. A failed start will cause it to just wait in INIT for
12980+ * another stop/start.
12981+ */
12982+
12983+ if (cur_state == LSST_INIT) {
12984+ switch (do_now) {
12985+ case DO_START:
12986+ error = ls_first_start(ls, gr);
12987+ if (!error)
12988+ next_state = LSST_INIT_DONE;
12989+ break;
12990+
12991+ case DO_STOP:
12992+ break;
12993+
12994+ case DO_FINISH: /* invalid */
12995+ case DO_FINISH_STOP: /* invalid */
12996+ case DO_FINISH_START: /* invalid */
12997+ default:
12998+ GDLM_ASSERT(0,);
12999+ }
13000+ goto out;
13001+ }
13002+
13003+ /*
13004+ * LSST_INIT_DONE - after the first start operation is completed
13005+ * successfully and kcl_start_done() called. If there are no errors, a
13006+ * finish will arrive next and we'll move to LSST_CLEAR.
13007+ */
13008+
13009+ if (cur_state == LSST_INIT_DONE) {
13010+ switch (do_now) {
13011+ case DO_STOP:
13012+ case DO_FINISH_STOP:
13013+ next_state = LSST_WAIT_START;
13014+ break;
13015+
13016+ case DO_START:
13017+ case DO_FINISH_START:
13018+ error = ls_reconfig(ls, gr);
13019+ if (error)
13020+ next_state = LSST_WAIT_START;
13021+ else
13022+ next_state = LSST_RECONFIG_DONE;
13023+ break;
13024+
13025+ case DO_FINISH:
13026+ next_state = LSST_CLEAR;
13027+ enable_locking(ls, finish_event);
13028+ log_all(ls, "recover event %u finished", finish_event);
13029+ break;
13030+
13031+ default:
13032+ GDLM_ASSERT(0,);
13033+ }
13034+ goto out;
13035+ }
13036+
13037+ out:
13038+ if (next_state)
13039+ ls->ls_state = next_state;
13040+
13041+ if (gr) {
13042+ kfree(gr->gr_nodeids);
13043+ free_dlm_recover(gr);
13044+ }
13045+}
13046+
13047+static __inline__ gd_ls_t *get_work(int clear)
13048+{
13049+ gd_ls_t *ls;
13050+
13051+ spin_lock(&lslist_lock);
13052+
13053+ list_for_each_entry(ls, &lslist, ls_list) {
13054+ if (clear) {
13055+ if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
13056+ goto got_work;
13057+
13058+ } else {
13059+ if (test_bit(LSFL_WORK, &ls->ls_flags))
13060+ goto got_work;
13061+ }
13062+ }
13063+ ls = NULL;
13064+
13065+ got_work:
13066+ spin_unlock(&lslist_lock);
13067+
13068+ return ls;
13069+}
13070+
13071+/*
13072+ * Thread which does recovery for all lockspaces.
13073+ */
13074+
13075+static int dlm_recoverd(void *arg)
13076+{
13077+ gd_ls_t *ls;
13078+
13079+ daemonize("dlm_recoverd");
13080+ recoverd_task = current;
13081+ complete(&recoverd_run);
13082+
13083+ while (!test_bit(THREAD_STOP, &recoverd_flags)) {
13084+ wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
13085+ if ((ls = get_work(1)))
13086+ do_ls_recovery(ls);
13087+ }
13088+
13089+ complete(&recoverd_run);
13090+ return 0;
13091+}
13092+
13093+/*
13094+ * Mark a specific lockspace as needing work and wake up the thread to do it.
13095+ */
13096+
13097+void recoverd_kick(gd_ls_t *ls)
13098+{
13099+ set_bit(LSFL_WORK, &ls->ls_flags);
13100+ wake_up(&recoverd_wait);
13101+}
13102+
13103+/*
13104+ * Start the recoverd thread when gdlm is started (before any lockspaces).
13105+ */
13106+
13107+int recoverd_start(void)
13108+{
13109+ int error;
13110+
13111+ clear_bit(THREAD_STOP, &recoverd_flags);
13112+ error = kernel_thread(dlm_recoverd, NULL, 0);
13113+ if (error < 0)
13114+ goto out;
13115+
13116+ error = 0;
13117+ wait_for_completion(&recoverd_run);
13118+
13119+ out:
13120+ return error;
13121+}
13122+
13123+/*
13124+ * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone).
13125+ */
13126+
13127+int recoverd_stop(void)
13128+{
13129+ set_bit(THREAD_STOP, &recoverd_flags);
13130+ wake_up(&recoverd_wait);
13131+ wait_for_completion(&recoverd_run);
13132+
13133+ return 0;
13134+}
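A sketch of the intended lifecycle of the recovery daemon. The calling context below (a single function standing in for module init/exit and a lockspace that needs work) is hypothetical and shown only to make the ordering of the four entry points explicit.

/* Hypothetical caller showing the intended order of the recoverd API. */
static int example_recoverd_lifecycle(gd_ls_t *ls)
{
	int error;

	dlm_recoverd_init();		/* once, before the thread is started */

	error = recoverd_start();	/* at gdlm startup, before any lockspaces */
	if (error)
		return error;

	recoverd_kick(ls);		/* whenever a lockspace needs recovery work */

	recoverd_stop();		/* at shutdown, after all lockspaces are gone */
	return 0;
}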
13135diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
13136--- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13137+++ linux-patched/cluster/dlm/recoverd.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13138@@ -0,0 +1,22 @@
13139+/******************************************************************************
13140+*******************************************************************************
13141+**
13142+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13143+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13144+**
13145+** This copyrighted material is made available to anyone wishing to use,
13146+** modify, copy, or redistribute it subject to the terms and conditions
13147+** of the GNU General Public License v.2.
13148+**
13149+*******************************************************************************
13150+******************************************************************************/
13151+
13152+#ifndef __RECOVERD_DOT_H__
13153+#define __RECOVERD_DOT_H__
13154+
13155+void dlm_recoverd_init(void);
13156+void recoverd_kick(gd_ls_t * ls);
13157+int recoverd_start(void);
13158+int recoverd_stop(void);
13159+
13160+#endif /* __RECOVERD_DOT_H__ */
13161diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
13162--- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13163+++ linux-patched/cluster/dlm/rsb.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 13164@@ -0,0 +1,307 @@
13165+/******************************************************************************
13166+*******************************************************************************
13167+**
13168+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13169+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13170+**
13171+** This copyrighted material is made available to anyone wishing to use,
13172+** modify, copy, or redistribute it subject to the terms and conditions
13173+** of the GNU General Public License v.2.
13174+**
13175+*******************************************************************************
13176+******************************************************************************/
13177+
13178+#include "dlm_internal.h"
13179+#include "locking.h"
13180+#include "memory.h"
13181+#include "lockqueue.h"
13182+#include "nodes.h"
13183+#include "dir.h"
13184+#include "util.h"
13185+
13186+static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent,
13187+ char *name, int namelen)
13188+{
13189+ gd_res_t *r;
13190+
13191+ list_for_each_entry(r, head, res_hashchain) {
13192+ if ((parent == r->res_parent) && (namelen == r->res_length) &&
13193+ (memcmp(name, r->res_name, namelen) == 0)) {
13194+ atomic_inc(&r->res_ref);
13195+ return r;
13196+ }
13197+ }
13198+
13199+ return NULL;
13200+}
13201+
13202+/*
13203+ * A way to arbitrarily hold onto an rsb which we already have a reference to,
13204+ * to make sure it doesn't go away.  Opposite of release_rsb().
13205+ */
13206+
13207+void hold_rsb(gd_res_t *r)
13208+{
13209+ atomic_inc(&r->res_ref);
13210+}
13211+
13212+/*
13213+ * release_rsb() - Decrement reference count on rsb struct. Free the rsb
13214+ * struct when there are zero references. Every lkb for the rsb adds a
13215+ * reference. When ref is zero there can be no more lkb's for the rsb, on the
13216+ * queue's or anywhere else.
13217+ */
13218+
13219+void release_rsb(gd_res_t *r)
13220+{
13221+ gd_ls_t *ls = r->res_ls;
13222+ int removed = FALSE;
13223+
13224+ write_lock(&ls->ls_reshash_lock);
13225+ atomic_dec(&r->res_ref);
13226+
13227+ if (!atomic_read(&r->res_ref)) {
13228+ GDLM_ASSERT(list_empty(&r->res_grantqueue),);
13229+ GDLM_ASSERT(list_empty(&r->res_waitqueue),);
13230+ GDLM_ASSERT(list_empty(&r->res_convertqueue),);
13231+ removed = TRUE;
13232+ list_del(&r->res_hashchain);
13233+ }
13234+ write_unlock(&ls->ls_reshash_lock);
13235+
13236+ if (removed) {
13237+ down_read(&ls->ls_gap_rsblist);
13238+ if (r->res_parent)
13239+ list_del(&r->res_subreslist);
13240+ else
13241+ list_del(&r->res_rootlist);
13242+ up_read(&ls->ls_gap_rsblist);
13243+
13244+ /*
13245+ * Remove resdir entry if this was a locally mastered root rsb.
13246+ */
13247+ if (!r->res_parent && !r->res_nodeid) {
13248+ if (get_directory_nodeid(r) != our_nodeid())
13249+ remote_remove_resdata(r->res_ls,
13250+ get_directory_nodeid(r),
13251+ r->res_name,
13252+ r->res_length,
13253+ r->res_resdir_seq);
13254+ else
13255+ remove_resdata(r->res_ls, our_nodeid(),
13256+ r->res_name, r->res_length,
13257+ r->res_resdir_seq);
13258+ }
13259+
13260+ if (r->res_lvbptr)
13261+ free_lvb(r->res_lvbptr);
13262+
13263+ free_rsb(r);
13264+ }
13265+}
13266+
13267+/*
13268+ * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
13269+ * If the rsb exists, its ref count is incremented by this function. If it
13270+ * doesn't exist, it's created with a ref count of one.
13271+ */
13272+
13273+int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen,
13274+ int create, gd_res_t **rp)
13275+{
13276+ uint32_t hash;
13277+ gd_res_t *r, *tmp;
13278+ int error = -ENOMEM;
13279+
13280+ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
13281+
13282+ hash = gdlm_hash(name, namelen);
13283+ hash &= ls->ls_hashmask;
13284+
13285+ read_lock(&ls->ls_reshash_lock);
13286+ r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13287+ read_unlock(&ls->ls_reshash_lock);
13288+
13289+ if (r)
13290+ goto out_set;
13291+ if (!create) {
13292+ *rp = NULL;
13293+ goto out;
13294+ }
13295+
13296+ r = allocate_rsb(ls, namelen);
13297+ if (!r)
13298+ goto fail;
13299+
13300+ INIT_LIST_HEAD(&r->res_subreslist);
13301+ INIT_LIST_HEAD(&r->res_grantqueue);
13302+ INIT_LIST_HEAD(&r->res_convertqueue);
13303+ INIT_LIST_HEAD(&r->res_waitqueue);
13304+
13305+ memcpy(r->res_name, name, namelen);
13306+ r->res_length = namelen;
13307+ r->res_ls = ls;
13308+ init_rwsem(&r->res_lock);
13309+ atomic_set(&r->res_ref, 1);
13310+
13311+ if (parent) {
13312+ r->res_parent = parent;
13313+ r->res_depth = parent->res_depth + 1;
13314+ r->res_root = parent->res_root;
13315+ r->res_nodeid = parent->res_nodeid;
13316+ } else {
13317+ r->res_parent = NULL;
13318+ r->res_depth = 1;
13319+ r->res_root = r;
13320+ r->res_nodeid = -1;
13321+ }
13322+
13323+ write_lock(&ls->ls_reshash_lock);
13324+ tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13325+ if (tmp) {
13326+ write_unlock(&ls->ls_reshash_lock);
13327+ free_rsb(r);
13328+ r = tmp;
13329+ } else {
13330+ list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]);
13331+ write_unlock(&ls->ls_reshash_lock);
13332+
13333+ down_read(&ls->ls_gap_rsblist);
13334+ if (parent)
13335+ list_add_tail(&r->res_subreslist,
13336+ &r->res_root->res_subreslist);
13337+ else
13338+ list_add(&r->res_rootlist, &ls->ls_rootres);
13339+ up_read(&ls->ls_gap_rsblist);
13340+ }
13341+
13342+ out_set:
13343+ *rp = r;
13344+
13345+ out:
13346+ error = 0;
13347+
13348+ fail:
13349+ return error;
13350+}
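
The reference-counting contract above can be illustrated with a minimal sketch (not part of the patch); the lockspace pointer, the resource name and the reduced error handling are assumptions for illustration only.

	/* Sketch only: look up (or create) a root rsb, pin it briefly, then
	 * drop the references.  "ls" and the name "myres" are illustrative. */
	static int example_touch_resource(gd_ls_t *ls)
	{
		gd_res_t *r;
		int error;

		/* on success the rsb is returned with one reference held */
		error = find_or_create_rsb(ls, NULL, "myres", 5, 1, &r);
		if (error)
			return error;

		hold_rsb(r);		/* extra reference while r is handed around */
		/* ... use r ... */
		release_rsb(r);		/* drop the extra reference */

		release_rsb(r);		/* drop the lookup reference; may free r */
		return 0;
	}
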
13351+
13352+/*
 13353+ * Add an lkb to a resource's grant/convert/wait queue, keeping the queue
 13354+ * ordered by lock mode.
13354+ */
13355+
13356+void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
13357+{
 13358+	struct list_head *tmp;
 13359+
 13360+	list_for_each(tmp, head) {
 13361+		gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
 13362+		if (lkb->lkb_rqmode < mode)
 13363+			break;
 13364+	}
 13365+
 13366+	/*
 13367+	 * If no entry with a lower requested mode was found, tmp is the list
 13368+	 * head itself and the insert degenerates to a tail add.
 13369+	 */
 13370+	__list_add(new, tmp->prev, tmp);
13371+}
13372+
13373+/*
13374+ * The rsb res_lock must be held in write when this function is called.
13375+ */
13376+
13377+void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13378+{
13379+
13380+ GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status););
13381+
13382+ lkb->lkb_status = type;
13383+
13384+ switch (type) {
13385+ case GDLM_LKSTS_WAITING:
13386+ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
13387+ break;
13388+
13389+ case GDLM_LKSTS_GRANTED:
13390+ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
13391+ lkb->lkb_grmode);
13392+ break;
13393+
13394+ case GDLM_LKSTS_CONVERT:
13395+ if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
13396+ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
 13397+		else if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
13400+ list_add_tail(&lkb->lkb_statequeue,
13401+ &r->res_convertqueue);
13402+ else
13403+ lkb_add_ordered(&lkb->lkb_statequeue,
13404+ &r->res_convertqueue, lkb->lkb_rqmode);
13405+ break;
13406+
13407+ default:
13408+ GDLM_ASSERT(0,);
13409+ }
13410+}
13411+
13412+void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13413+{
13414+ down_write(&r->res_lock);
13415+ lkb_enqueue(r, lkb, type);
13416+ up_write(&r->res_lock);
13417+}
13418+
13419+/*
13420+ * The rsb res_lock must be held in write when this function is called.
13421+ */
13422+
13423+int lkb_dequeue(gd_lkb_t *lkb)
13424+{
13425+ int status = lkb->lkb_status;
13426+
13427+ if (!status)
13428+ goto out;
13429+
13430+ lkb->lkb_status = 0;
13431+ list_del(&lkb->lkb_statequeue);
13432+
13433+ out:
13434+ return status;
13435+}
13436+
13437+int res_lkb_dequeue(gd_lkb_t *lkb)
13438+{
13439+ int status;
13440+
13441+ down_write(&lkb->lkb_resource->res_lock);
13442+ status = lkb_dequeue(lkb);
13443+ up_write(&lkb->lkb_resource->res_lock);
13444+
13445+ return status;
13446+}
13447+
13448+/*
13449+ * The rsb res_lock must be held in write when this function is called.
13450+ */
13451+
13452+int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13453+{
13454+ int status;
13455+
13456+ status = lkb_dequeue(lkb);
13457+ lkb_enqueue(r, lkb, type);
13458+
13459+ return status;
13460+}
13461+
13462+int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13463+{
13464+ int status;
13465+
13466+ down_write(&r->res_lock);
13467+ status = lkb_swqueue(r, lkb, type);
13468+ up_write(&r->res_lock);
13469+
13470+ return status;
13471+}
13472diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
13473--- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13474+++ linux-patched/cluster/dlm/rsb.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13475@@ -0,0 +1,30 @@
13476+/******************************************************************************
13477+*******************************************************************************
13478+**
13479+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13480+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13481+**
13482+** This copyrighted material is made available to anyone wishing to use,
13483+** modify, copy, or redistribute it subject to the terms and conditions
13484+** of the GNU General Public License v.2.
13485+**
13486+*******************************************************************************
13487+******************************************************************************/
13488+
13489+#ifndef __RSB_DOT_H__
13490+#define __RSB_DOT_H__
13491+
13492+void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
13493+void _release_rsb(gd_res_t * r);
13494+void release_rsb(gd_res_t * r);
13495+void hold_rsb(gd_res_t * r);
13496+int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen,
13497+ int create, gd_res_t ** rp);
13498+void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13499+void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13500+int lkb_dequeue(gd_lkb_t * lkb);
13501+int res_lkb_dequeue(gd_lkb_t * lkb);
13502+int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13503+int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13504+
13505+#endif /* __RSB_DOT_H__ */
13506diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
13507--- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13508+++ linux-patched/cluster/dlm/util.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 13509@@ -0,0 +1,130 @@
13510+/******************************************************************************
13511+*******************************************************************************
13512+**
13513+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13514+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13515+**
13516+** This copyrighted material is made available to anyone wishing to use,
13517+** modify, copy, or redistribute it subject to the terms and conditions
13518+** of the GNU General Public License v.2.
13519+**
13520+*******************************************************************************
13521+******************************************************************************/
13522+
13523+#include "dlm_internal.h"
13524+
13525+static const uint32_t crc_32_tab[] = {
13526+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
13527+ 0xe963a535, 0x9e6495a3,
13528+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
13529+ 0xe7b82d07, 0x90bf1d91,
13530+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
13531+ 0xf4d4b551, 0x83d385c7,
13532+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
13533+ 0xfa0f3d63, 0x8d080df5,
13534+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
13535+ 0xd20d85fd, 0xa50ab56b,
13536+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
13537+ 0xdcd60dcf, 0xabd13d59,
13538+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
13539+ 0xcfba9599, 0xb8bda50f,
13540+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
13541+ 0xc1611dab, 0xb6662d3d,
13542+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
13543+ 0x9fbfe4a5, 0xe8b8d433,
13544+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
13545+ 0x91646c97, 0xe6635c01,
13546+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
13547+ 0x8208f4c1, 0xf50fc457,
13548+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
13549+ 0x8cd37cf3, 0xfbd44c65,
13550+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
13551+ 0xa4d1c46d, 0xd3d6f4fb,
13552+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
13553+ 0xaa0a4c5f, 0xdd0d7cc9,
13554+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
13555+ 0xb966d409, 0xce61e49f,
13556+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
13557+ 0xb7bd5c3b, 0xc0ba6cad,
13558+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
13559+ 0x04db2615, 0x73dc1683,
13560+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
13561+ 0x0a00ae27, 0x7d079eb1,
13562+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
13563+ 0x196c3671, 0x6e6b06e7,
13564+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
13565+ 0x17b7be43, 0x60b08ed5,
13566+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
13567+ 0x3fb506dd, 0x48b2364b,
13568+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
13569+ 0x316e8eef, 0x4669be79,
13570+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
13571+ 0x220216b9, 0x5505262f,
13572+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
13573+ 0x2cd99e8b, 0x5bdeae1d,
13574+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
13575+ 0x72076785, 0x05005713,
13576+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
13577+ 0x7cdcefb7, 0x0bdbdf21,
13578+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
13579+ 0x6fb077e1, 0x18b74777,
13580+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
13581+ 0x616bffd3, 0x166ccf45,
13582+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
13583+ 0x4969474d, 0x3e6e77db,
13584+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
13585+ 0x47b2cf7f, 0x30b5ffe9,
13586+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
13587+ 0x54de5729, 0x23d967bf,
13588+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
13589+ 0x5a05df1b, 0x2d02ef8d
13590+};
13591+
13592+/**
13593+ * gdlm_hash - hash an array of data
13594+ * @data: the data to be hashed
13595+ * @len: the length of data to be hashed
13596+ *
13597+ * Copied from GFS.
13598+ *
13599+ * Take some data and convert it to a 32-bit hash.
13600+ *
13601+ * The hash function is a 32-bit CRC of the data. The algorithm uses
13602+ * the crc_32_tab table above.
13603+ *
13604+ * This may not be the fastest hash function, but it does a fair bit better
13605+ * at providing uniform results than the others I've looked at. That's
13606+ * really important for efficient directories.
13607+ *
13608+ * Returns: the hash
13609+ */
13610+
13611+uint32_t gdlm_hash(const char *data, int len)
13612+{
13613+ uint32_t hash = 0xFFFFFFFF;
13614+
13615+ for (; len--; data++)
13616+ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
13617+
13618+ hash = ~hash;
13619+
13620+ return hash;
13621+}
13622+
13623+uint32_t gdlm_next_power2(uint32_t val)
13624+{
13625+ uint32_t x;
13626+
13627+ for (x = 1; x < val; x <<= 1) ;
13628+
13629+ return x;
13630+}
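
A short sketch (not part of the patch) of how gdlm_hash() and gdlm_next_power2() combine when sizing and indexing the resource hash table; the requested size of 200 and the local names are assumptions, the real sizing lives in the lockspace setup code.

	/* Sketch only: derive a power-of-two table size and mask, then bucket a
	 * resource name the same way find_or_create_rsb() does. */
	static uint32_t example_bucket(const char *name, int namelen)
	{
		uint32_t tablesize = gdlm_next_power2(200);	/* rounds up to 256 */
		uint32_t hashmask = tablesize - 1;		/* valid since tablesize is 2^n */
		uint32_t hash = gdlm_hash(name, namelen);	/* 32-bit CRC of the name */

		return hash & hashmask;				/* bucket index */
	}
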
13631+
13632+void print_lkb(gd_lkb_t *lkb)
13633+{
13634+ printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d "
13635+ "nodeid=%u lqstate=%x lqflags=%x\n",
13636+ lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status,
13637+ lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid,
13638+ lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags);
13639+}
13640diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
13641--- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13642+++ linux-patched/cluster/dlm/util.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13643@@ -0,0 +1,22 @@
13644+/******************************************************************************
13645+*******************************************************************************
13646+**
13647+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13648+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13649+**
13650+** This copyrighted material is made available to anyone wishing to use,
13651+** modify, copy, or redistribute it subject to the terms and conditions
13652+** of the GNU General Public License v.2.
13653+**
13654+*******************************************************************************
13655+******************************************************************************/
13656+
13657+#ifndef __UTIL_DOT_H__
13658+#define __UTIL_DOT_H__
13659+
13660+uint32_t gdlm_hash(const char *data, int len);
13661+uint32_t gdlm_next_power2(uint32_t val);
13662+
13663+void print_lkb(gd_lkb_t *lkb);
13664+
13665+#endif
13666diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
13667--- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13668+++ linux-patched/include/cluster/dlm.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13669@@ -0,0 +1,404 @@
13670+/******************************************************************************
13671+*******************************************************************************
13672+**
13673+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13674+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13675+**
13676+** This copyrighted material is made available to anyone wishing to use,
13677+** modify, copy, or redistribute it subject to the terms and conditions
13678+** of the GNU General Public License v.2.
13679+**
13680+*******************************************************************************
13681+******************************************************************************/
13682+
13683+#ifndef __DLM_DOT_H__
13684+#define __DLM_DOT_H__
13685+
13686+/*
13687+ * Interface to DLM - routines and structures to use DLM lockspaces.
13688+ */
13689+
13690+/*
13691+ * Lock Modes
13692+ */
13693+
13694+#define DLM_LOCK_IV (-1) /* invalid */
13695+#define DLM_LOCK_NL (0) /* null */
13696+#define DLM_LOCK_CR (1) /* concurrent read */
13697+#define DLM_LOCK_CW (2) /* concurrent write */
13698+#define DLM_LOCK_PR (3) /* protected read */
13699+#define DLM_LOCK_PW (4) /* protected write */
13700+#define DLM_LOCK_EX (5) /* exclusive */
13701+
13702+/*
13703+ * Maximum size in bytes of a dlm_lock name
13704+ */
13705+
13706+#define DLM_RESNAME_MAXLEN (64)
13707+
13708+/*
13709+ * Size in bytes of Lock Value Block
13710+ */
13711+
13712+#define DLM_LVB_LEN (32)
13713+
13714+/*
13715+ * Flags to dlm_new_lockspace
13716+ *
13717+ * DLM_LSF_NOTIMERS
13718+ *
13719+ * Do not subject locks in this lockspace to time-outs.
13720+ *
13721+ */
13722+
13723+#define DLM_LSF_NOTIMERS (1)
13724+
13725+/*
13726+ * Flags to dlm_lock
13727+ *
13728+ * DLM_LKF_NOQUEUE
13729+ *
13730+ * Do not queue the lock request on the wait queue if it cannot be granted
13731+ * immediately. If the lock cannot be granted because of this flag, DLM will
13732+ * either return -EAGAIN from the dlm_lock call or will return 0 from
13733+ * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
13734+ *
13735+ * DLM_LKF_CONVERT
13736+ *
13737+ * Indicates a lock conversion request. For conversions the name and namelen
13738+ * are ignored and the lock ID in the LKSB is used to identify the lock.
13739+ *
13740+ * DLM_LKF_VALBLK
13741+ *
13742+ * Requests DLM to return the current contents of the lock value block in the
13743+ * lock status block. When this flag is set in a lock conversion from PW or EX
13744+ * modes, DLM assigns the value specified in the lock status block to the lock
13745+ * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
13746+ * containing application-specific information.
13747+ *
13748+ * DLM_LKF_QUECVT
13749+ *
13750+ * Force a conversion lock request to the back of the convert queue. All other
13751+ * conversion requests ahead of it must be granted before it can be granted.
13752+ * This enforces a FIFO ordering on the convert queue. When this flag is set,
13753+ * indefinite postponement is averted. This flag is allowed only when
13754+ * converting a lock to a more restrictive mode.
13755+ *
13756+ * DLM_LKF_CANCEL
13757+ *
13758+ * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
13759+ * previously granted mode.
13760+ *
13761+ * DLM_LKF_IVVALBLK
13762+ *
13763+ * Invalidate/clear the lock value block.
13764+ *
13765+ * DLM_LKF_CONVDEADLK
13766+ *
13767+ * The granted mode of a lock being converted (from a non-NL mode) can be
13768+ * changed to NL in the process of acquiring the requested mode to avoid
13769+ * conversion deadlock.
13770+ *
13771+ * DLM_LKF_PERSISTENT
13772+ *
13773+ * Only relevant to locks originating in userspace. Signals to the ioctl.c code
13774+ * that this lock should not be unlocked when the process exits.
13775+ *
 13776+ * DLM_LKF_NODLCKWT
13777+ *
13778+ * This lock is not to be checked for conversion deadlocks.
13779+ *
13780+ * DLM_LKF_NODLCKBLK
13781+ *
13782+ * not yet implemented
13783+ *
13784+ * DLM_LKF_EXPEDITE
13785+ *
13786+ * If this lock conversion cannot be granted immediately it is to go to the
13787+ * head of the conversion queue regardless of its requested lock mode.
13788+ *
13789+ * DLM_LKF_NOQUEUEBAST
13790+ *
 13791+ * Send blocking ASTs before returning -EAGAIN to the caller. It is only
 13792+ * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
 13793+ * NOQUEUE requests otherwise.
13794+ *
13795+ */
13796+
13797+#define DLM_LKF_NOQUEUE (0x00000001)
13798+#define DLM_LKF_CANCEL (0x00000002)
13799+#define DLM_LKF_CONVERT (0x00000004)
13800+#define DLM_LKF_VALBLK (0x00000008)
13801+#define DLM_LKF_QUECVT (0x00000010)
13802+#define DLM_LKF_IVVALBLK (0x00000020)
13803+#define DLM_LKF_CONVDEADLK (0x00000040)
13804+#define DLM_LKF_PERSISTENT (0x00000080)
13805+#define DLM_LKF_NODLCKWT (0x00000100)
13806+#define DLM_LKF_NODLCKBLK (0x00000200)
13807+#define DLM_LKF_EXPEDITE (0x00000400)
13808+#define DLM_LKF_NOQUEUEBAST (0x00000800)
13809+
13810+/*
 13811+ * Some return codes that are not in errno.h
13812+ */
13813+
13814+#define DLM_ECANCEL (0x10001)
13815+#define DLM_EUNLOCK (0x10002)
13816+
13817+typedef void dlm_lockspace_t;
13818+
13819+/*
13820+ * Lock range structure
13821+ */
13822+
13823+struct dlm_range {
13824+ uint64_t ra_start;
13825+ uint64_t ra_end;
13826+};
13827+
13828+/*
13829+ * Lock status block
13830+ *
13831+ * Use this structure to specify the contents of the lock value block. For a
13832+ * conversion request, this structure is used to specify the lock ID of the
13833+ * lock. DLM writes the status of the lock request and the lock ID assigned
13834+ * to the request in the lock status block.
13835+ *
13836+ * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
13837+ * It is available when dlm_lock returns.
13838+ *
13839+ * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
13840+ * shown for the DLM_LKF_VALBLK flag.
13841+ *
13842+ * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
13843+ * it was first demoted to NL to avoid conversion deadlock.
13844+ *
13845+ * sb_status: the returned status of the lock request set prior to AST
13846+ * execution. Possible return values:
13847+ *
13848+ * 0 if lock request was successful
13849+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
13850+ * -ENOMEM if there is no memory to process request
13851+ * -EINVAL if there are invalid parameters
13852+ * -DLM_EUNLOCK if unlock request was successful
13853+ * -DLM_ECANCEL ?
13854+ */
13855+
13856+#define DLM_SBF_DEMOTED (0x01)
13857+
13858+struct dlm_lksb {
13859+ int sb_status;
13860+ uint32_t sb_lkid;
13861+ char sb_flags;
13862+ char * sb_lvbptr;
13863+};
13864+
13865+/*
13866+ * These defines are the bits that make up the
13867+ * query code.
13868+ */
13869+
13870+/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
 13871+ * dlm.h. Ignored for DLM_QUERY_LOCKS_ALL */
13872+#define DLM_LOCK_THIS 0x0007
13873+#define DLM_QUERY_MODE_MASK 0x0007
13874+
13875+/* Bits 3, 4, 5 bitmap of queue(s) to query */
13876+#define DLM_QUERY_QUEUE_WAIT 0x0008
13877+#define DLM_QUERY_QUEUE_CONVERT 0x0010
13878+#define DLM_QUERY_QUEUE_GRANT 0x0020
13879+#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
13880+#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
13881+
13882+/* Bit 6, Return only the information that can be established without a network
13883+ * round-trip. The caller must be aware of the implications of this. Useful for
13884+ * just getting the master node id or resource name. */
13885+#define DLM_QUERY_LOCAL 0x0040
13886+
13887+/* Bits 8 up, query type */
13888+#define DLM_QUERY_LOCKS_HIGHER 0x0100
13889+#define DLM_QUERY_LOCKS_LOWER 0x0200
13890+#define DLM_QUERY_LOCKS_EQUAL 0x0300
13891+#define DLM_QUERY_LOCKS_BLOCKING 0x0400
13892+#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
13893+#define DLM_QUERY_LOCKS_ALL 0x0600
13894+#define DLM_QUERY_MASK 0x0F00
13895+
13896+/* GRMODE is the default for mode comparisons,
13897+ RQMODE might also be handy */
13898+#define DLM_QUERY_GRMODE 0x0000
13899+#define DLM_QUERY_RQMODE 0x1000
13900+
13901+/* Structures passed into and out of the query */
13902+
13903+struct dlm_lockinfo {
13904+ int lki_lkid; /* Lock ID on originating node */
13905+ int lki_mstlkid; /* Lock ID on master node */
13906+ int lki_parent;
13907+ int lki_node; /* Originating node (not master) */
13908+ uint8_t lki_state; /* Queue the lock is on */
13909+ uint8_t lki_grmode; /* Granted mode */
13910+ uint8_t lki_rqmode; /* Requested mode */
13911+ struct dlm_range lki_grrange; /* Granted range, if applicable */
13912+ struct dlm_range lki_rqrange; /* Requested range, if applicable */
13913+};
13914+
13915+struct dlm_resinfo {
13916+ int rsi_length;
13917+ int rsi_grantcount; /* No. of nodes on grant queue */
13918+ int rsi_convcount; /* No. of nodes on convert queue */
13919+ int rsi_waitcount; /* No. of nodes on wait queue */
13920+ int rsi_masternode; /* Master for this resource */
13921+ char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
 13922+	char rsi_valblk[DLM_LVB_LEN];	/* Master's LVB contents,
 13923+					   if applicable */
13924+};
13925+
13926+struct dlm_queryinfo {
13927+ struct dlm_resinfo *gqi_resinfo;
13928+ struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
13929+ * of structs */
13930+ int gqi_locksize; /* input */
13931+ int gqi_lockcount; /* output */
13932+};
13933+
13934+#ifdef __KERNEL__
13935+/*
13936+ * dlm_init
13937+ *
13938+ * Starts and initializes DLM threads and structures. Creation of the first
13939+ * lockspace will call this if it has not been called already.
13940+ *
13941+ * Returns: 0 if successful, -EXXX on error
13942+ */
13943+
13944+int dlm_init(void);
13945+
13946+/*
13947+ * dlm_release
13948+ *
13949+ * Stops DLM threads.
13950+ *
13951+ * Returns: 0 if successful, -EXXX on error
13952+ */
13953+
13954+int dlm_release(void);
13955+
13956+/*
13957+ * dlm_new_lockspace
13958+ *
13959+ * Starts a lockspace with the given name. If the named lockspace exists in
13960+ * the cluster, the calling node joins it.
13961+ */
13962+
13963+int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
13964+ int flags);
13965+
13966+/*
13967+ * dlm_release_lockspace
13968+ *
13969+ * Stop a lockspace.
13970+ */
13971+
13972+int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
13973+
13974+/*
13975+ * dlm_lock
13976+ *
 13977+ * Make an asynchronous request to acquire or convert a lock on a named
13978+ * resource.
13979+ *
13980+ * lockspace: context for the request
13981+ * mode: the requested mode of the lock (DLM_LOCK_)
13982+ * lksb: lock status block for input and async return values
13983+ * flags: input flags (DLM_LKF_)
13984+ * name: name of the resource to lock, can be binary
 13985+ * namelen: the length in bytes of the resource name (up to DLM_RESNAME_MAXLEN)
13986+ * parent: the lock ID of a parent lock or 0 if none
13987+ * lockast: function DLM executes when it completes processing the request
13988+ * astarg: argument passed to lockast and bast functions
13989+ * bast: function DLM executes when this lock later blocks another request
13990+ *
13991+ * Returns:
13992+ * 0 if request is successfully queued for processing
13993+ * -EINVAL if any input parameters are invalid
13994+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
13995+ * -ENOMEM if there is no memory to process request
13996+ * -ENOTCONN if there is a communication error
13997+ *
13998+ * If the call to dlm_lock returns an error then the operation has failed and
13999+ * the AST routine will not be called. If dlm_lock returns 0 it is still
14000+ * possible that the lock operation will fail. The AST routine will be called
14001+ * when the locking is complete and the status is returned in the lksb.
14002+ *
 14003+ * If AST routines or an astarg are passed to a conversion operation, they
 14004+ * overwrite the values that were passed to the previous dlm_lock call for
 14005+ * that lock.
14006+ *
14007+ * AST routines should not block (at least not for long), but may make
14008+ * any locking calls they please.
14009+ */
14010+
14011+int dlm_lock(dlm_lockspace_t *lockspace,
14012+ uint32_t mode,
14013+ struct dlm_lksb *lksb,
14014+ uint32_t flags,
14015+ void *name,
14016+ unsigned int namelen,
14017+ uint32_t parent,
14018+ void (*lockast) (void *astarg),
14019+ void *astarg,
14020+ void (*bast) (void *astarg, int mode),
14021+ struct dlm_range *range);
14022+
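A hedged kernel-side sketch (not part of the patch) of the calling convention documented above; the completion-based wait, the resource name and the helper names are assumptions for illustration only.

	/* Sketch only: acquire an EX lock and wait for the AST.  Assumes
	 * <linux/completion.h>; the structure and names are illustrative. */
	struct example_lock {
		struct dlm_lksb lksb;
		struct completion done;
	};

	static void example_ast(void *astarg)
	{
		struct example_lock *el = astarg;
		/* final status is in el->lksb.sb_status, lock ID in sb_lkid */
		complete(&el->done);
	}

	static int example_acquire(dlm_lockspace_t *ls, struct example_lock *el)
	{
		int error;

		init_completion(&el->done);
		error = dlm_lock(ls, DLM_LOCK_EX, &el->lksb, DLM_LKF_NOQUEUE,
				 "example-resource", 16, 0, example_ast, el,
				 NULL, NULL);
		if (error)
			return error;		/* request failed; AST will not be called */

		wait_for_completion(&el->done);
		return el->lksb.sb_status;	/* 0, or -EAGAIN with NOQUEUE */
	}
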
14023+/*
14024+ * dlm_unlock
14025+ *
14026+ * Asynchronously release a lock on a resource. The AST routine is called
14027+ * when the resource is successfully unlocked.
14028+ *
14029+ * lockspace: context for the request
14030+ * lkid: the lock ID as returned in the lksb
14031+ * flags: input flags (DLM_LKF_)
14032+ * lksb: if NULL the lksb parameter passed to last lock request is used
14033+ * astarg: if NULL, astarg in last lock request is used
14034+ *
14035+ * Returns:
14036+ * 0 if request is successfully queued for processing
14037+ * -EINVAL if any input parameters are invalid
14038+ * -ENOTEMPTY if the lock still has sublocks
14039+ * -EBUSY if the lock is waiting for a remote lock operation
14040+ * -ENOTCONN if there is a communication error
14041+ */
14042+
14043+extern int dlm_unlock(dlm_lockspace_t *lockspace,
14044+ uint32_t lkid,
14045+ uint32_t flags,
14046+ struct dlm_lksb *lksb,
14047+ void *astarg);
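
The matching release, again as a sketch under the same illustrative assumptions as the dlm_lock example above.

	/* Sketch only: release the lock acquired in the previous sketch.  The
	 * same AST fires when the unlock completes; per the status codes above,
	 * sb_status is then expected to be -DLM_EUNLOCK. */
	static int example_release(dlm_lockspace_t *ls, struct example_lock *el)
	{
		int error;

		init_completion(&el->done);
		error = dlm_unlock(ls, el->lksb.sb_lkid, 0, &el->lksb, el);
		if (error)
			return error;

		wait_for_completion(&el->done);
		return el->lksb.sb_status;
	}
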
14048+
14049+/* Query interface
14050+ *
14051+ * Query the other holders of a resource, given a known lock ID
14052+ *
14053+ * lockspace: context for the request
14054+ * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
14055+ * on the resource. sb_status will contain the status
14056+ * of the request on completion.
14057+ * query: query bitmap see DLM_QUERY_* above
14058+ * qinfo: pointer to dlm_queryinfo structure
14059+ * ast_routine: AST routine to call on completion
 14060+ * astarg: argument to AST routine. It is "traditional"
14061+ * to put the qinfo pointer into lksb->sb_lvbptr
14062+ * and pass the lksb in here.
14063+ */
14064+extern int dlm_query(dlm_lockspace_t *lockspace,
14065+ struct dlm_lksb *lksb,
14066+ int query,
14067+ struct dlm_queryinfo *qinfo,
14068+ void (ast_routine(void *)),
14069+ void *astarg);
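
A sketch (not part of the patch) of building a query request; the array capacity of 16 and the wrapper function are illustrative assumptions, and the qinfo pointer is stashed in sb_lvbptr as the comment above suggests.

	/* Sketch only: ask for every granted lock on the resource behind a
	 * known lock ID in lksb->sb_lkid. */
	static int example_query(dlm_lockspace_t *ls, struct dlm_lksb *lksb,
				 struct dlm_resinfo *resinfo,
				 struct dlm_lockinfo *lockinfo,	/* array of 16 */
				 struct dlm_queryinfo *qinfo,
				 void (*query_ast)(void *))
	{
		int query = DLM_QUERY_LOCKS_ALL | DLM_QUERY_QUEUE_GRANTED;

		qinfo->gqi_resinfo = resinfo;
		qinfo->gqi_lockinfo = lockinfo;
		qinfo->gqi_locksize = 16;		/* input: capacity of lockinfo[] */
		qinfo->gqi_lockcount = 0;		/* output: filled in on completion */

		lksb->sb_lvbptr = (char *) qinfo;	/* the "traditional" place for it */
		return dlm_query(ls, lksb, query, qinfo, query_ast, lksb);
	}
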
14070+
14071+#endif /* __KERNEL__ */
14072+
14073+#endif /* __DLM_DOT_H__ */
14074diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
14075--- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14076+++ linux-patched/include/cluster/dlm_device.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 14077@@ -0,0 +1,63 @@
14078+/******************************************************************************
14079+*******************************************************************************
14080+**
14081+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14082+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14083+**
14084+** This copyrighted material is made available to anyone wishing to use,
14085+** modify, copy, or redistribute it subject to the terms and conditions
14086+** of the GNU General Public License v.2.
14087+**
14088+*******************************************************************************
14089+******************************************************************************/
14090+
14091+/* This is the device interface for dlm, most users will use a library
14092+ * interface.
14093+ */
14094+
14095+/* Version of the device interface */
14096+#define DLM_DEVICE_VERSION_MAJOR 2
14097+#define DLM_DEVICE_VERSION_MINOR 0
14098+#define DLM_DEVICE_VERSION_PATCH 0
14099+
14100+/* struct passed to the lock write */
14101+struct dlm_lock_params {
14102+ uint32_t version[3];
14103+ uint8_t cmd;
14104+ uint8_t mode;
14105+ uint16_t flags;
14106+ uint32_t lkid;
14107+ uint32_t parent;
14108+ struct dlm_range range;
14109+ uint8_t namelen;
14110+ void *astparam;
14111+ void *astaddr;
14112+ void *bastaddr;
14113+ struct dlm_lksb *lksb;
14114+ char name[1];
14115+};
14116+
14117+
14118+/* struct read from the "device" fd,
14119+ consists mainly of userspace pointers for the library to use */
14120+struct dlm_lock_result {
14121+ uint8_t cmd;
14122+ void *astparam;
14123+ void (*astaddr)(void *astparam);
14124+ struct dlm_lksb *user_lksb;
14125+ struct dlm_lksb lksb; /* But this has real data in it */
14126+ uint8_t bast_mode; /* Not yet used */
14127+};
14128+
14129+/* commands passed to the device */
14130+#define DLM_USER_LOCK 1
14131+#define DLM_USER_UNLOCK 2
14132+#define DLM_USER_QUERY 3
14133+
14134+/* Arbitrary length restriction */
14135+#define MAX_LS_NAME_LEN 64
14136+
14137+/* ioctls on the device */
14138+#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
14139+#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
14140+#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
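
For completeness, a hedged userspace sketch of the ioctl side of this interface; the device node path and the meaning of the ioctl's return value are determined by device.c elsewhere in the patch, so both are left as assumptions here.

	/* Sketch only: create a lockspace through the device interface.  The
	 * device path is a parameter because its name is defined by device.c,
	 * not by this header; most programs would use a library wrapper. */
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <cluster/dlm.h>
	#include <cluster/dlm_device.h>

	static int example_create_lockspace(const char *devpath, const char *lsname)
	{
		int fd, error;

		fd = open(devpath, O_RDWR);
		if (fd < 0)
			return -1;

		/* DLM_CREATE_LOCKSPACE takes the lockspace name (MAX_LS_NAME_LEN);
		 * what it returns on success is defined by device.c, not here */
		error = ioctl(fd, DLM_CREATE_LOCKSPACE, lsname);

		close(fd);
		return error;
	}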