# Add DLM to the build system

diff -urN -p linux-2.6.8.1/cluster/Kconfig linux/cluster/Kconfig
--- linux-2.6.8.1/cluster/Kconfig	2004-08-24 13:23:09.000000000 +0800
+++ linux/cluster/Kconfig	2004-08-24 13:23:32.000000000 +0800
@@ -10,4 +10,22 @@ config CLUSTER
 	needed by all the other components.  It provides membership services
 	for those other subsystems.
 
+config CLUSTER_DLM
+	tristate "Distributed Lock Manager"
+	depends on CLUSTER
+	---help---
+	A fully distributed lock manager, providing cluster-wide locking services
+	and protected lock namespaces for kernel and userland applications.
+
+config CLUSTER_DLM_PROCLOCKS
+	boolean "/proc/locks support for DLM"
+	depends on CLUSTER_DLM
+	depends on PROC_FS
+	---help---
+	If this option is enabled a file will appear in /proc/cluster/dlm_locks.
+	Write the name of a lockspace known to the DLM into this file, then
+	read back a list of all the resources and locks in that lockspace that
+	are known to the local node.  Note that because the DLM is distributed,
+	this may not be the full lock picture.
+
 endmenu
diff -urN -p linux-2.6.8.1/cluster/Makefile linux/cluster/Makefile
--- linux-2.6.8.1/cluster/Makefile	2004-08-24 13:23:09.000000000 +0800
+++ linux/cluster/Makefile	2004-08-24 13:23:32.000000000 +0800
@@ -1,3 +1,4 @@
 obj-y := nocluster.o
 
 obj-$(CONFIG_CLUSTER) += cman/
+obj-$(CONFIG_CLUSTER_DLM) += dlm/
diff -urN -p linux-2.6.8.1/cluster/dlm/Makefile linux/cluster/dlm/Makefile
--- linux-2.6.8.1/cluster/dlm/Makefile	1970-01-01 07:30:00.000000000 +0730
+++ linux/cluster/dlm/Makefile	2004-08-24 13:23:32.000000000 +0800
@@ -0,0 +1,23 @@
+dlm-objs :=	ast.o \
+		config.o \
+		device.o \
+		dir.o \
+		lkb.o \
+		locking.o \
+		lockqueue.o \
+		lockspace.o \
+		lowcomms.o \
+		main.o \
+		memory.o \
+		midcomms.o \
+		nodes.o \
+		proc.o \
+		queries.o \
+		rebuild.o \
+		reccomms.o \
+		recover.o \
+		recoverd.o \
+		rsb.o \
+		util.o \
+
+obj-$(CONFIG_CLUSTER_DLM) += dlm.o
diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
--- linux-orig/cluster/dlm/ast.c	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/dlm/ast.c	2004-11-03 11:31:56.000000000 +0800
@@ -0,0 +1,618 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * This delivers ASTs and checks for dead remote requests and deadlocks.
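+ *
+ * A single kernel thread, dlm_astd, does the work: queue_ast() places an
+ * lkb on ast_queue and wake_astd() sets WAKE_ASTS to have it delivered,
+ * while a periodic timer sets WAKE_TIMER so the same thread also scans
+ * the lockqueue for requests that have waited too long and the
+ * deadlockqueue for conversion deadlocks.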
+ */ + +#include + +#include "dlm_internal.h" +#include "rsb.h" +#include "lockqueue.h" +#include "dir.h" +#include "locking.h" +#include "lkb.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "ast.h" +#include "nodes.h" +#include "config.h" +#include "util.h" + +/* Wake up flags for astd */ +#define WAKE_ASTS 1 +#define WAKE_TIMER 2 + +static struct list_head ast_queue; +static struct semaphore ast_queue_lock; +static wait_queue_head_t astd_waitchan; +struct task_struct * astd_task; +static unsigned long astd_wakeflags; + +static struct list_head _deadlockqueue; +static struct semaphore _deadlockqueue_lock; +static struct list_head _lockqueue; +static struct semaphore _lockqueue_lock; +static struct timer_list _lockqueue_timer; + +void add_to_lockqueue(struct dlm_lkb *lkb) +{ + /* Time stamp the entry so we know if it's been waiting too long */ + lkb->lkb_lockqueue_time = jiffies; + + down(&_lockqueue_lock); + list_add(&lkb->lkb_lockqueue, &_lockqueue); + up(&_lockqueue_lock); +} + +void remove_from_lockqueue(struct dlm_lkb *lkb) +{ + down(&_lockqueue_lock); + list_del(&lkb->lkb_lockqueue); + up(&_lockqueue_lock); + +#ifdef CONFIG_DLM_STATS + dlm_stats.lockqueue_time[lkb->lkb_lockqueue_state] += (jiffies - lkb->lkb_lockqueue_time); + dlm_stats.lockqueue_locks[lkb->lkb_lockqueue_state]++; +#endif + lkb->lkb_lockqueue_state = 0; +} + +void add_to_deadlockqueue(struct dlm_lkb *lkb) +{ + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags)) + return; + lkb->lkb_duetime = jiffies; + down(&_deadlockqueue_lock); + list_add(&lkb->lkb_deadlockq, &_deadlockqueue); + up(&_deadlockqueue_lock); +} + +void remove_from_deadlockqueue(struct dlm_lkb *lkb) +{ + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags)) + return; + + down(&_deadlockqueue_lock); + list_del(&lkb->lkb_deadlockq); + up(&_deadlockqueue_lock); + + /* Invalidate the due time */ + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime)); +} + +/* + * Queue an AST for delivery, this will only deal with + * kernel ASTs, usermode API will piggyback on top of this. + * + * This can be called in either the user or DLM context. + * ASTs are queued EVEN IF we are already running in dlm_astd + * context as we don't know what other locks are held (eg we could + * be being called from a lock operation that was called from + * another AST! + * If the AST is to be queued remotely then a message is sent to + * the target system via midcomms. + */ + +void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode) +{ + struct dlm_request req; + + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + /* + * Send a message to have an ast queued remotely. Note: we do + * not send remote completion asts, they are handled as part of + * remote lock granting. + */ + if (flags & AST_BAST) { + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST; + req.rr_header.rh_length = sizeof(req); + req.rr_header.rh_flags = 0; + req.rr_header.rh_lkid = lkb->lkb_id; + req.rr_header.rh_lockspace = + lkb->lkb_resource->res_ls->ls_global_id; + req.rr_status = lkb->lkb_retstatus; + req.rr_remlkid = lkb->lkb_remid; + req.rr_rqmode = rqmode; + + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header, + lkb->lkb_resource->res_ls->ls_allocation); + } else if (lkb->lkb_retstatus == -EDEADLOCK) { + /* + * We only queue remote Completion ASTs here for error + * completions that happen out of band. + * DEADLOCK is one such. 
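+ * Normal (successful) completions are never sent from here;
+ * they reach the owning node as part of the master's reply
+ * to the original request.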
+ */ + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST; + req.rr_header.rh_length = sizeof(req); + req.rr_header.rh_flags = 0; + req.rr_header.rh_lkid = lkb->lkb_id; + req.rr_header.rh_lockspace = + lkb->lkb_resource->res_ls->ls_global_id; + req.rr_status = lkb->lkb_retstatus; + req.rr_remlkid = lkb->lkb_remid; + req.rr_rqmode = rqmode; + + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header, + lkb->lkb_resource->res_ls->ls_allocation); + } + } else { + /* + * Prepare info that will be returned in ast/bast. + */ + + if (flags & AST_BAST) { + lkb->lkb_bastmode = rqmode; + } else { + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus; + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED; + else + lkb->lkb_lksb->sb_flags = 0; + } + + down(&ast_queue_lock); + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST))) + list_add_tail(&lkb->lkb_astqueue, &ast_queue); + lkb->lkb_astflags |= flags; + up(&ast_queue_lock); + + /* It is the responsibility of the caller to call wake_astd() + * after it has finished other locking operations that request + * the ASTs to be delivered after */ + } +} + +/* + * Process any LKBs on the AST queue. + */ + +static void process_asts(void) +{ + struct dlm_ls *ls; + struct dlm_rsb *rsb; + struct dlm_lkb *lkb; + void (*cast) (long param); + void (*bast) (long param, int mode); + long astparam; + uint16_t flags; + + for (;;) { + down(&ast_queue_lock); + if (list_empty(&ast_queue)) { + up(&ast_queue_lock); + break; + } + + lkb = list_entry(ast_queue.next, struct dlm_lkb, lkb_astqueue); + list_del(&lkb->lkb_astqueue); + flags = lkb->lkb_astflags; + lkb->lkb_astflags = 0; + up(&ast_queue_lock); + + cast = lkb->lkb_astaddr; + bast = lkb->lkb_bastaddr; + astparam = lkb->lkb_astparam; + rsb = lkb->lkb_resource; + ls = rsb->res_ls; + + if (flags & AST_COMP) { + if (flags & AST_DEL) { + DLM_ASSERT(lkb->lkb_astflags == 0,); + + /* FIXME: we don't want to block asts for other + lockspaces while one is being recovered */ + + down_read(&ls->ls_in_recovery); + release_lkb(ls, lkb); + release_rsb(rsb); + up_read(&ls->ls_in_recovery); + } + + if (cast) { +#ifdef CONFIG_DLM_STATS + dlm_stats.cast++; +#endif + cast(astparam); + } + } + + if (flags & AST_BAST && !(flags & AST_DEL)) { + int bmode = lkb->lkb_bastmode; + + /* gr or rq mode of the lock may have changed since the + ast was queued making the delivery unnecessary */ + + if (!bast || dlm_modes_compat(lkb->lkb_grmode, bmode)) + continue; + + if (lkb->lkb_rqmode == DLM_LOCK_IV || + !dlm_modes_compat(lkb->lkb_rqmode, bmode)) { + bast(astparam, bmode); +#ifdef CONFIG_DLM_STATS + dlm_stats.bast++; +#endif + } + } + + schedule(); + } +} + +void lockqueue_lkb_mark(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + int count = 0; + + log_all(ls, "mark waiting requests"); + + down(&_lockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { + + if (lkb->lkb_resource->res_ls != ls) + continue; + + log_debug(ls, "mark %x lq %d nodeid %d", lkb->lkb_id, + lkb->lkb_lockqueue_state, lkb->lkb_nodeid); + + /* + * These lkb's are new and the master is being looked up. Mark + * the lkb request to be resent. Even if the destination node + * for the request is still living and has our request, it will + * purge all resdir requests in purge_requestqueue. If there's + * a reply to the LOOKUP request in our requestqueue (the reply + * arrived after ls_stop), it is invalid and will be discarded + * in purge_requestqueue, too. 
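+ * The lookup itself is then resent to the (possibly new)
+ * resource directory node in resend_cluster_requests().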
+ */ + + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) { + DLM_ASSERT(lkb->lkb_nodeid == -1, + print_lkb(lkb); + print_rsb(lkb->lkb_resource);); + + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND; + count++; + continue; + } + + /* + * We're waiting for an unlock reply and the master node from + * whom we're expecting the reply has failed. If there's a + * reply in the requestqueue do nothing and process it later in + * process_requestqueue. If there's no reply, don't rebuild + * the lkb on a new master, but just assume we've gotten an + * unlock completion reply from the prev master (this also + * means not resending the unlock request). If the unlock is + * for the last lkb on the rsb, the rsb has nodeid of -1 and + * the rsb won't be rebuilt on the new master either. + * + * If we're waiting for an unlock reply and the master node is + * still alive, we should either have a reply in the + * requestqueue from the master already, or we should get one + * from the master once recovery is complete. There is no + * rebuilding of the rsb/lkb in this case and no resending of + * the request. + */ + + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) { + if (in_nodes_gone(ls, lkb->lkb_nodeid)) { + if (reply_in_requestqueue(ls, lkb->lkb_id)) { + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; + log_debug(ls, "mark %x unlock have rep", + lkb->lkb_id); + } else { + /* assume we got reply fr old master */ + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; + lkb->lkb_flags |= GDLM_LKFLG_UNLOCKDONE; + log_debug(ls, "mark %x unlock no rep", + lkb->lkb_id); + } + } + count++; + continue; + } + + /* + * These lkb's have an outstanding request to a bygone node. + * The request will be redirected to the new master node in + * resend_cluster_requests(). Don't mark the request for + * resending if there's a reply for it saved in the + * requestqueue. + */ + + if (in_nodes_gone(ls, lkb->lkb_nodeid) && + !reply_in_requestqueue(ls, lkb->lkb_id)) { + + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND; + + /* + * Don't rebuild this lkb on a new rsb in + * rebuild_rsbs_send(). + */ + + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) { + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING, + print_lkb(lkb); + print_rsb(lkb->lkb_resource);); + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD; + } + + /* + * This flag indicates to the new master that his lkb + * is in the midst of a convert request and should be + * placed on the granted queue rather than the convert + * queue. We will resend this convert request to the + * new master. 
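+ * Such an lkb is still on its local convert queue, hence
+ * the GDLM_LKSTS_CONVERT assertion below.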
+ */ + + else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) { + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT, + print_lkb(lkb); + print_rsb(lkb->lkb_resource);); + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT; + } + + count++; + } + } + up(&_lockqueue_lock); + + log_all(ls, "marked %d requests", count); +} + +int resend_cluster_requests(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_rsb *r; + int error = 0, state, count = 0; + + log_all(ls, "resend marked requests"); + + down(&_lockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + log_debug(ls, "resend_cluster_requests: aborted"); + error = -EINTR; + break; + } + + r = lkb->lkb_resource; + + if (r->res_ls != ls) + continue; + + log_debug(ls, "resend %x lq %d flg %x node %d/%d \"%s\"", + lkb->lkb_id, lkb->lkb_lockqueue_state, lkb->lkb_flags, + lkb->lkb_nodeid, r->res_nodeid, r->res_name); + + if (lkb->lkb_flags & GDLM_LKFLG_UNLOCKDONE) { + log_debug(ls, "unlock done %x", lkb->lkb_id); + list_del(&lkb->lkb_lockqueue); + res_lkb_dequeue(lkb); + lkb->lkb_retstatus = -DLM_EUNLOCK; + queue_ast(lkb, AST_COMP | AST_DEL, 0); + count++; + continue; + } + + /* + * Resend/process the lockqueue lkb's (in-progres requests) + * that were flagged at the start of recovery in + * lockqueue_lkb_mark(). + */ + + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) { + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND; + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD; + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT; + + if (lkb->lkb_nodeid == -1) { + /* + * Send lookup to new resdir node. + */ + lkb->lkb_lockqueue_time = jiffies; + send_cluster_request(lkb, + lkb->lkb_lockqueue_state); + } + + else if (lkb->lkb_nodeid != 0) { + /* + * There's a new RSB master (that's not us.) + */ + lkb->lkb_lockqueue_time = jiffies; + send_cluster_request(lkb, + lkb->lkb_lockqueue_state); + } + + else { + /* + * We are the new RSB master for this lkb + * request. + */ + state = lkb->lkb_lockqueue_state; + lkb->lkb_lockqueue_state = 0; + /* list_del equals remove_from_lockqueue() */ + list_del(&lkb->lkb_lockqueue); + process_remastered_lkb(ls, lkb, state); + } + + count++; + } + } + up(&_lockqueue_lock); + + log_all(ls, "resent %d requests", count); + return error; +} + +/* + * Process any LKBs on the Lock queue, this + * just looks at the entries to see if they have been + * on the queue too long and fails the requests if so. 
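+ *
+ * "Too long" means longer than dlm_config.lock_timeout; such requests
+ * are cancelled with -ETIMEDOUT. Lockspaces flagged LSFL_NOTIMERS, or
+ * not currently in LSFL_LS_RUN, are skipped.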
+ */ + +static void process_lockqueue(void) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_ls *ls; + int count = 0; + + down(&_lockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) { + ls = lkb->lkb_resource->res_ls; + + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags)) + continue; + + /* Don't time out locks that are in transition */ + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) + continue; + + if (check_timeout(lkb->lkb_lockqueue_time, + dlm_config.lock_timeout)) { + count++; + list_del(&lkb->lkb_lockqueue); + up(&_lockqueue_lock); + cancel_lockop(lkb, -ETIMEDOUT); + down(&_lockqueue_lock); + } + } + up(&_lockqueue_lock); + + if (count) + wake_astd(); + + mod_timer(&_lockqueue_timer, + jiffies + ((dlm_config.lock_timeout >> 1) * HZ)); +} + +/* Look for deadlocks */ +static void process_deadlockqueue(void) +{ + struct dlm_lkb *lkb, *safe; + + down(&_deadlockqueue_lock); + + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) { + struct dlm_lkb *kill_lkb; + + /* Only look at "due" locks */ + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime)) + break; + + /* Don't look at locks that are in transition */ + if (!test_bit(LSFL_LS_RUN, + &lkb->lkb_resource->res_ls->ls_flags)) + continue; + + up(&_deadlockqueue_lock); + + /* Lock has hit due time, check for conversion deadlock */ + kill_lkb = conversion_deadlock_check(lkb); + if (kill_lkb) + cancel_conversion(kill_lkb, -EDEADLOCK); + + down(&_deadlockqueue_lock); + } + up(&_deadlockqueue_lock); +} + +static __inline__ int no_asts(void) +{ + int ret; + + down(&ast_queue_lock); + ret = list_empty(&ast_queue); + up(&ast_queue_lock); + return ret; +} + +static void lockqueue_timer_fn(unsigned long arg) +{ + set_bit(WAKE_TIMER, &astd_wakeflags); + wake_up(&astd_waitchan); +} + +/* + * DLM daemon which delivers asts. + */ + +static int dlm_astd(void *data) +{ + /* + * Set a timer to check the lockqueue for dead locks (and deadlocks). 
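+ * The timer refires every lock_timeout/2 seconds and only sets
+ * WAKE_TIMER; the actual scanning is done from this thread.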
+ */ + INIT_LIST_HEAD(&_lockqueue); + init_MUTEX(&_lockqueue_lock); + INIT_LIST_HEAD(&_deadlockqueue); + init_MUTEX(&_deadlockqueue_lock); + init_timer(&_lockqueue_timer); + _lockqueue_timer.function = lockqueue_timer_fn; + _lockqueue_timer.data = 0; + mod_timer(&_lockqueue_timer, + jiffies + ((dlm_config.lock_timeout >> 1) * HZ)); + + while (!kthread_should_stop()) { + wchan_cond_sleep_intr(astd_waitchan, !test_bit(WAKE_ASTS, &astd_wakeflags)); + + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) + process_asts(); + + if (test_and_clear_bit(WAKE_TIMER, &astd_wakeflags)) { + process_lockqueue(); + if (dlm_config.deadlocktime) + process_deadlockqueue(); + } + } + + if (timer_pending(&_lockqueue_timer)) + del_timer(&_lockqueue_timer); + + return 0; +} + +void wake_astd(void) +{ + if (!no_asts()) { + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up(&astd_waitchan); + } +} + +int astd_start(void) +{ + struct task_struct *p; + int error = 0; + + INIT_LIST_HEAD(&ast_queue); + init_MUTEX(&ast_queue_lock); + init_waitqueue_head(&astd_waitchan); + + p = kthread_run(dlm_astd, NULL, 0, "dlm_astd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + astd_task = p; + return error; +} + +void astd_stop(void) +{ + kthread_stop(astd_task); + wake_up(&astd_waitchan); +} diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/ast.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,28 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __AST_DOT_H__ +#define __AST_DOT_H__ + +void lockqueue_lkb_mark(struct dlm_ls *ls); +int resend_cluster_requests(struct dlm_ls *ls); +void add_to_lockqueue(struct dlm_lkb *lkb); +void remove_from_lockqueue(struct dlm_lkb *lkb); +void add_to_deadlockqueue(struct dlm_lkb *lkb); +void remove_from_deadlockqueue(struct dlm_lkb *lkb); +void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode); +void wake_astd(void); +int astd_start(void); +void astd_stop(void); + +#endif /* __AST_DOT_H__ */ diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/config.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,137 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_LOCK_TIMEOUT 30 +#define DEFAULT_BUFFER_SIZE 4096 +#define DEFAULT_RSBTBL_SIZE 256 +#define DEFAULT_LKBTBL_SIZE 1024 +#define DEFAULT_DIRTBL_SIZE 512 +#define DEFAULT_CONN_INCREMENT 32 +#define DEFAULT_DEADLOCKTIME 10 +#define DEFAULT_RECOVER_TIMER 5 + +struct config_info dlm_config = { + .tcp_port = DEFAULT_TCP_PORT, + .lock_timeout = DEFAULT_LOCK_TIMEOUT, + .buffer_size = DEFAULT_BUFFER_SIZE, + .rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .dirtbl_size = DEFAULT_DIRTBL_SIZE, + .conn_increment = DEFAULT_CONN_INCREMENT, + .deadlocktime = DEFAULT_DEADLOCKTIME, + .recover_timer = DEFAULT_RECOVER_TIMER +}; + + +static struct config_proc_info { + char *name; + int *value; +} config_proc[] = { + { + .name = "tcp_port", + .value = &dlm_config.tcp_port, + }, + { + .name = "lock_timeout", + .value = &dlm_config.lock_timeout, + }, + { + .name = "buffer_size", + .value = &dlm_config.buffer_size, + }, + { + .name = "rsbtbl_size", + .value = &dlm_config.rsbtbl_size, + }, + { + .name = "lkbtbl_size", + .value = &dlm_config.lkbtbl_size, + }, + { + .name = "dirtbl_size", + .value = &dlm_config.dirtbl_size, + }, + { + .name = "conn_increment", + .value = &dlm_config.conn_increment, + }, + { + .name = "deadlocktime", + .value = &dlm_config.deadlocktime, + }, + { + .name = "recover_timer", + .value = &dlm_config.recover_timer, + } +}; +static struct proc_dir_entry *dlm_dir; + +static int dlm_config_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct config_proc_info *cinfo = data; + return snprintf(page, count, "%d\n", *cinfo->value); +} + +static int dlm_config_write_proc(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct config_proc_info *cinfo = data; + int value; + char *end; + + value = simple_strtoul(buffer, &end, 10); + if (*end) + *cinfo->value = value; + return count; +} + +int dlm_config_init(void) +{ + int i; + struct proc_dir_entry *pde; + + dlm_dir = proc_mkdir("cluster/config/dlm", 0); + if (!dlm_dir) + return -1; + + dlm_dir->owner = THIS_MODULE; + + for (i=0; idata = &config_proc[i]; + pde->write_proc = dlm_config_write_proc; + pde->read_proc = dlm_config_read_proc; + } + } + return 0; +} + +void dlm_config_exit(void) +{ + int i; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "device.h" + +extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int); +static struct file_operations _dlm_fops; +static const char *name_prefix="dlm"; +static struct list_head user_ls_list; +static struct semaphore user_ls_lock; + +/* Flags in li_flags */ +#define LI_FLAG_COMPLETE 1 +#define LI_FLAG_FIRSTLOCK 2 + +#define LOCKINFO_MAGIC 0x53595324 + +struct lock_info { + uint32_t li_magic; + uint8_t li_cmd; + struct dlm_lksb li_lksb; + wait_queue_head_t li_waitq; + unsigned long li_flags; + void __user *li_castparam; + void __user *li_castaddr; + void __user *li_bastparam; + void __user *li_bastaddr; + void __user *li_pend_bastparam; + void __user *li_pend_bastaddr; + void __user *li_user_lvbptr; + struct list_head li_ownerqueue; + struct file_info *li_file; + struct dlm_lksb __user 
*li_user_lksb; + struct semaphore li_firstlock; + struct dlm_queryinfo *li_queryinfo; + struct dlm_queryinfo __user *li_user_queryinfo; +}; + +/* A queued AST no less */ +struct ast_info { + struct dlm_lock_result result; + struct dlm_queryinfo *queryinfo; + struct dlm_queryinfo __user *user_queryinfo; + struct list_head list; + void __user *user_lvbptr; + uint32_t ast_reason; /* AST_COMP or AST_BAST from dlm_internal.h */ +}; + +/* One of these per userland lockspace */ +struct user_ls { + void *ls_lockspace; + atomic_t ls_refcnt; + long ls_flags; /* bit 1 means LS has been deleted */ + + /* Passed into misc_register() */ + struct miscdevice ls_miscinfo; + struct list_head ls_list; +}; + +/* misc_device info for the control device */ +static struct miscdevice ctl_device; + +/* + * Stuff we hang off the file struct. + * The first two are to cope with unlocking all the + * locks help by a process when it dies. + */ +struct file_info { + struct list_head fi_lkb_list; /* List of active lkbs */ + spinlock_t fi_lkb_lock; + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */ + spinlock_t fi_ast_lock; + wait_queue_head_t fi_wait; + struct user_ls *fi_ls; + atomic_t fi_refcnt; /* Number of users */ + unsigned long fi_flags; /* Bit 1 means the device is open */ +}; + + +/* get and put ops for file_info. + Actually I don't really like "get" and "put", but everyone + else seems to use them and I can't think of anything + nicer at the moment */ +static void get_file_info(struct file_info *f) +{ + atomic_inc(&f->fi_refcnt); +} + +static void put_file_info(struct file_info *f) +{ + if (atomic_dec_and_test(&f->fi_refcnt)) + kfree(f); +} + +static void release_lockinfo(struct lock_info *li) +{ + put_file_info(li->li_file); + if (li->li_lksb.sb_lvbptr && li->li_cmd != DLM_USER_QUERY) + kfree(li->li_lksb.sb_lvbptr); + kfree(li); +} + +static struct user_ls *__find_lockspace(int minor) +{ + struct user_ls *lsinfo; + + list_for_each_entry(lsinfo, &user_ls_list, ls_list) { + + if (lsinfo->ls_miscinfo.minor == minor) + return lsinfo; + } + return NULL; +} + +/* Find a lockspace struct given the device minor number */ +static struct user_ls *find_lockspace(int minor) +{ + struct user_ls *lsinfo; + + down(&user_ls_lock); + lsinfo = __find_lockspace(minor); + up(&user_ls_lock); + + return lsinfo; +} + +static void add_lockspace_to_list(struct user_ls *lsinfo) +{ + down(&user_ls_lock); + list_add(&lsinfo->ls_list, &user_ls_list); + up(&user_ls_lock); +} + +/* Register a lockspace with the DLM and create a misc + device for userland to access it */ +static int register_lockspace(char *name, struct user_ls **ls) +{ + struct user_ls *newls; + int status; + int namelen; + + namelen = strlen(name)+strlen(name_prefix)+2; + + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL); + if (!newls) + return -ENOMEM; + memset(newls, 0, sizeof(struct user_ls)); + + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL); + if (!newls->ls_miscinfo.name) { + kfree(newls); + return -ENOMEM; + } + status = dlm_new_lockspace(name, strlen(name), + &newls->ls_lockspace, 0); + + if (status != 0) { + kfree(newls->ls_miscinfo.name); + kfree(newls); + return status; + } + + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name); + + newls->ls_miscinfo.fops = &_dlm_fops; + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR; + + status = misc_register(&newls->ls_miscinfo); + if (status) { + log_print("failed to register misc device for %s", name); + dlm_release_lockspace(newls->ls_lockspace, 0); + 
kfree(newls->ls_miscinfo.name); + kfree(newls); + return status; + } + + + add_lockspace_to_list(newls); + *ls = newls; + return 0; +} + +/* Called with the user_ls_lock semaphore held */ +static int unregister_lockspace(struct user_ls *lsinfo, int force) +{ + int status; + + status = dlm_release_lockspace(lsinfo->ls_lockspace, force); + if (status) + return status; + + status = misc_deregister(&lsinfo->ls_miscinfo); + if (status) + return status; + + list_del(&lsinfo->ls_list); + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */ + lsinfo->ls_lockspace = NULL; + if (atomic_dec_and_test(&lsinfo->ls_refcnt)) { + kfree(lsinfo->ls_miscinfo.name); + kfree(lsinfo); + } + + return 0; +} + +/* Add it to userland's AST queue */ +static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam, uint32_t reason) +{ + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL); + if (!ast) + return; + + ast->result.astparam = astparam; + ast->result.astaddr = astaddr; + ast->result.user_lksb = li->li_user_lksb; + ast->result.cmd = li->li_cmd; + ast->user_lvbptr = li->li_user_lvbptr; + ast->ast_reason = reason; + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb)); + + /* These two will both be NULL for anything other than queries */ + ast->queryinfo = li->li_queryinfo; + ast->user_queryinfo = li->li_user_queryinfo; + + spin_lock(&li->li_file->fi_ast_lock); + list_add_tail(&ast->list, &li->li_file->fi_ast_list); + spin_unlock(&li->li_file->fi_ast_lock); + wake_up_interruptible(&li->li_file->fi_wait); +} + +static void bast_routine(void *param, int mode) +{ + struct lock_info *li = param; + + if (li && li->li_bastaddr) { + add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, AST_BAST); + } +} + +/* + * This is the kernel's AST routine. + * All lock, unlock & query operations complete here. + * The only syncronous ops are those done during device close. + */ +static void ast_routine(void *param) +{ + struct lock_info *li = param; + + /* Param may be NULL if a persistent lock is unlocked by someone else */ + if (!li) + return; + + /* If this is a succesful conversion then activate the blocking ast + * args from the conversion request */ + if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && + li->li_lksb.sb_status == 0) { + + li->li_bastparam = li->li_pend_bastparam; + li->li_bastaddr = li->li_pend_bastaddr; + li->li_pend_bastaddr = NULL; + } + + /* If it's an async request then post data to the user's AST queue. */ + if (li->li_castaddr) { + + /* Only queue AST if the device is still open */ + if (test_bit(1, &li->li_file->fi_flags)) + add_to_astqueue(li, li->li_castaddr, li->li_castparam, AST_COMP); + + /* If it's a new lock operation that failed, then + * remove it from the owner queue and free the + * lock_info. The DLM will not free the LKB until this + * AST has completed. 
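+ * The li_firstlock semaphore below makes us wait until
+ * do_user_lock() has finished with li before it is freed.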
+ */ + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && + li->li_lksb.sb_status != 0) { + struct dlm_lkb *lkb; + + /* Wait till dlm_lock() has finished */ + down(&li->li_firstlock); + up(&li->li_firstlock); + + /* If the LKB has been freed then we need to tidy up too */ + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid); + if (!lkb) { + spin_lock(&li->li_file->fi_lkb_lock); + list_del(&li->li_ownerqueue); + spin_unlock(&li->li_file->fi_lkb_lock); + + release_lockinfo(li); + } + return; + } + /* Free unlocks & queries */ + if (li->li_lksb.sb_status == -DLM_EUNLOCK || + li->li_cmd == DLM_USER_QUERY) { + release_lockinfo(li); + } + } + else { + /* Synchronous request, just wake up the caller */ + set_bit(LI_FLAG_COMPLETE, &li->li_flags); + wake_up_interruptible(&li->li_waitq); + } +} + +/* + * Wait for the lock op to complete and return the status. + */ +static int wait_for_ast(struct lock_info *li) +{ + /* Wait for the AST routine to complete */ + set_task_state(current, TASK_INTERRUPTIBLE); + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags)) + schedule(); + + set_task_state(current, TASK_RUNNING); + + return li->li_lksb.sb_status; +} + + +/* Open on control device */ +static int dlm_ctl_open(struct inode *inode, struct file *file) +{ + return 0; +} + +/* Close on control device */ +static int dlm_ctl_close(struct inode *inode, struct file *file) +{ + return 0; +} + +/* Open on lockspace device */ +static int dlm_open(struct inode *inode, struct file *file) +{ + struct file_info *f; + struct user_ls *lsinfo; + + lsinfo = find_lockspace(iminor(inode)); + if (!lsinfo) + return -ENOENT; + + f = kmalloc(sizeof(struct file_info), GFP_KERNEL); + if (!f) + return -ENOMEM; + + atomic_inc(&lsinfo->ls_refcnt); + INIT_LIST_HEAD(&f->fi_lkb_list); + INIT_LIST_HEAD(&f->fi_ast_list); + spin_lock_init(&f->fi_ast_lock); + spin_lock_init(&f->fi_lkb_lock); + init_waitqueue_head(&f->fi_wait); + f->fi_ls = lsinfo; + atomic_set(&f->fi_refcnt, 1); + set_bit(1, &f->fi_flags); + + file->private_data = f; + + return 0; +} + +/* Check the user's version matches ours */ +static int check_version(struct dlm_lock_params *params) +{ + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR || + (params->version[0] == DLM_DEVICE_VERSION_MAJOR && + params->version[1] > DLM_DEVICE_VERSION_MINOR)) { + + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)", + params->version[0], + params->version[1], + params->version[2], + DLM_DEVICE_VERSION_MAJOR, + DLM_DEVICE_VERSION_MINOR, + DLM_DEVICE_VERSION_PATCH); + return -EINVAL; + } + return 0; +} + +/* Close on lockspace device */ +static int dlm_close(struct inode *inode, struct file *file) +{ + struct file_info *f = file->private_data; + struct lock_info li; + struct lock_info *old_li, *safe; + sigset_t tmpsig; + sigset_t allsigs; + struct user_ls *lsinfo; + DECLARE_WAITQUEUE(wq, current); + + lsinfo = find_lockspace(iminor(inode)); + if (!lsinfo) + return -ENOENT; + + /* Mark this closed so that ASTs will not be delivered any more */ + clear_bit(1, &f->fi_flags); + + /* Block signals while we are doing this */ + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + /* We use our own lock_info struct here, so that any + * outstanding "real" ASTs will be delivered with the + * corresponding "real" params, thus freeing the lock_info + * that belongs the lock. 
This catches the corner case where + * a lock is BUSY when we try to unlock it here + */ + memset(&li, 0, sizeof(li)); + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); + init_waitqueue_head(&li.li_waitq); + add_wait_queue(&li.li_waitq, &wq); + + /* + * Free any outstanding locks, they are on the + * list in LIFO order so there should be no problems + * about unlocking parents before children. + * Although we don't remove the lkbs from the list here + * (what would be the point?), foreach_safe is needed + * because the lkbs are freed during dlm_unlock operations + */ + list_for_each_entry_safe(old_li, safe, &f->fi_lkb_list, li_ownerqueue) { + int status; + int lock_status; + int flags = 0; + struct dlm_lkb *lkb; + + lkb = dlm_get_lkb(f->fi_ls->ls_lockspace, old_li->li_lksb.sb_lkid); + + /* Don't unlock persistent locks */ + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) { + list_del(&old_li->li_ownerqueue); + + /* Update master copy */ + if (lkb->lkb_resource->res_nodeid) { + li.li_lksb.sb_lkid = lkb->lkb_id; + status = dlm_lock(f->fi_ls->ls_lockspace, + lkb->lkb_grmode, &li.li_lksb, + DLM_LKF_CONVERT|DLM_LKF_ORPHAN, + NULL, 0, 0, ast_routine, &li, + NULL, NULL); + if (status == 0) + wait_for_ast(&li); + } + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN; + + /* But tidy our references in it */ + kfree(old_li); + lkb->lkb_astparam = (long)NULL; + put_file_info(f); + + continue; + } + + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); + + /* If it's not granted then cancel the request. + * If the lock was WAITING then it will be dropped, + * if it was converting then it will be reverted to GRANTED, + * then we will unlock it. + */ + lock_status = lkb->lkb_status; + + if (lock_status != GDLM_LKSTS_GRANTED) + flags = DLM_LKF_CANCEL; + + if (lkb->lkb_grmode >= DLM_LOCK_PW) + flags |= DLM_LKF_IVVALBLK; + + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li); + + /* Must wait for it to complete as the next lock could be its + * parent */ + if (status == 0) + wait_for_ast(&li); + + /* If it was waiting for a conversion, it will + now be granted so we can unlock it properly */ + if (lock_status == GDLM_LKSTS_CONVERT) { + flags &= ~DLM_LKF_CANCEL; + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li); + + if (status == 0) + wait_for_ast(&li); + } + /* Unlock suceeded, free the lock_info struct. */ + if (status == 0) { + kfree(old_li); + put_file_info(f); + } + } + + remove_wait_queue(&li.li_waitq, &wq); + + /* If this is the last reference, and the lockspace has been deleted + then free the struct */ + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) { + kfree(lsinfo->ls_miscinfo.name); + kfree(lsinfo); + } + + /* Restore signals */ + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + recalc_sigpending(); + + return 0; +} + +/* + * ioctls to create/remove lockspaces, and check how many + * outstanding ASTs there are against a particular LS. + */ +static int dlm_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + struct file_info *fi = file->private_data; + int status = -EINVAL; + int count; + struct list_head *tmp_list; + + switch (command) { + + /* Are there any ASTs for us to read? 
+ * Warning, this returns the number of messages (ASTs) + * in the queue, NOT the number of bytes to read + */ + case FIONREAD: + count = 0; + spin_lock(&fi->fi_ast_lock); + list_for_each(tmp_list, &fi->fi_ast_list) + count++; + spin_unlock(&fi->fi_ast_lock); + status = put_user(count, (int *)u); + break; + + default: + return -ENOTTY; + } + + return status; +} + +/* + * ioctls to create/remove lockspaces. + */ +static int dlm_ctl_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + int status = -EINVAL; + char ls_name[MAX_LS_NAME_LEN]; + struct user_ls *lsinfo; + int force = 0; + + switch (command) { + case DLM_CREATE_LOCKSPACE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0) + return -EFAULT; + status = register_lockspace(ls_name, &lsinfo); + + /* If it succeeded then return the minor number */ + if (status == 0) + status = lsinfo->ls_miscinfo.minor; + break; + + case DLM_FORCE_RELEASE_LOCKSPACE: + force = 2; + + case DLM_RELEASE_LOCKSPACE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + down(&user_ls_lock); + lsinfo = __find_lockspace(u); + if (!lsinfo) { + up(&user_ls_lock); + return -EINVAL; + } + + status = unregister_lockspace(lsinfo, force); + up(&user_ls_lock); + break; + + default: + return -ENOTTY; + } + + return status; +} + +/* Deal with the messy stuff of copying a web of structs + from kernel space to userspace */ +static int copy_query_result(struct ast_info *ast) +{ + int status = -EFAULT; + struct dlm_queryinfo qi; + + /* Get the pointers to userspace structs */ + if (copy_from_user(&qi, ast->user_queryinfo, + sizeof(struct dlm_queryinfo))) + goto copy_out; + + if (put_user(ast->queryinfo->gqi_lockcount, + &ast->user_queryinfo->gqi_lockcount)) + goto copy_out; + + if (qi.gqi_resinfo) { + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo, + sizeof(struct dlm_resinfo))) + goto copy_out; + } + + if (qi.gqi_lockinfo) { + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo, + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount)) + goto copy_out; + } + + status = 0; + + if (ast->queryinfo->gqi_lockinfo) + kfree(ast->queryinfo->gqi_lockinfo); + + if (ast->queryinfo->gqi_resinfo) + kfree(ast->queryinfo->gqi_resinfo); + + kfree(ast->queryinfo); + + copy_out: + return status; +} + +/* Read call, might block if no ASTs are waiting. + * It will only ever return one message at a time, regardless + * of how many are pending. + */ +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) +{ + struct file_info *fi = file->private_data; + struct ast_info *ast; + int ret; + DECLARE_WAITQUEUE(wait, current); + + if (count < sizeof(struct dlm_lock_result)) + return -EINVAL; + + spin_lock(&fi->fi_ast_lock); + if (list_empty(&fi->fi_ast_list)) { + + /* No waiting ASTs. + * Return EOF if the lockspace been deleted. 
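+ * Otherwise block (or return -EAGAIN for O_NONBLOCK opens)
+ * until an AST is queued or a signal arrives.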
+ */ + if (test_bit(1, &fi->fi_ls->ls_flags)) + return 0; + + if (file->f_flags & O_NONBLOCK) { + spin_unlock(&fi->fi_ast_lock); + return -EAGAIN; + } + + add_wait_queue(&fi->fi_wait, &wait); + + repeat: + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&fi->fi_ast_list) && + !signal_pending(current)) { + + spin_unlock(&fi->fi_ast_lock); + schedule(); + spin_lock(&fi->fi_ast_lock); + goto repeat; + } + + current->state = TASK_RUNNING; + remove_wait_queue(&fi->fi_wait, &wait); + + if (signal_pending(current)) { + spin_unlock(&fi->fi_ast_lock); + return -ERESTARTSYS; + } + } + + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list); + list_del(&ast->list); + spin_unlock(&fi->fi_ast_lock); + + ret = sizeof(struct dlm_lock_result); + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result))) + ret = -EFAULT; + + if (ast->ast_reason == AST_COMP && + ast->result.cmd == DLM_USER_LOCK && ast->user_lvbptr) { + if (copy_to_user(ast->user_lvbptr, ast->result.lksb.sb_lvbptr, DLM_LVB_LEN)) + ret = -EFAULT; + } + + /* If it was a query then copy the result block back here */ + if (ast->queryinfo) { + int status = copy_query_result(ast); + if (status) + ret = status; + } + + kfree(ast); + return ret; +} + +static unsigned int dlm_poll(struct file *file, poll_table *wait) +{ + struct file_info *fi = file->private_data; + + poll_wait(file, &fi->fi_wait, wait); + + spin_lock(&fi->fi_ast_lock); + if (!list_empty(&fi->fi_ast_list)) { + spin_unlock(&fi->fi_ast_lock); + return POLLIN | POLLRDNORM; + } + + spin_unlock(&fi->fi_ast_lock); + return 0; +} + +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams) +{ + struct lock_info *li; + int status; + + if (!kparams->castaddr) + return -EINVAL; + + if (!kparams->lksb) + return -EINVAL; + + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); + if (!li) + return -ENOMEM; + + get_file_info(fi); + li->li_user_lksb = kparams->lksb; + li->li_bastparam = kparams->bastparam; + li->li_bastaddr = kparams->bastaddr; + li->li_castparam = kparams->castparam; + li->li_castaddr = kparams->castaddr; + li->li_file = fi; + li->li_flags = 0; + li->li_cmd = kparams->cmd; + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); + + if (copy_from_user(&li->li_lksb, kparams->lksb, + sizeof(struct dlm_lksb))) { + kfree(li); + return -EFAULT; + } + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr; + + /* Allocate query structs */ + status = -ENOMEM; + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL); + if (!li->li_queryinfo) + goto out1; + + /* Mainly to get gqi_lock buffer size */ + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr, + sizeof(struct dlm_queryinfo))) { + status = -EFAULT; + goto out1; + } + + /* Overwrite userspace pointers we just copied with kernel space ones */ + if (li->li_queryinfo->gqi_resinfo) { + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL); + if (!li->li_queryinfo->gqi_resinfo) + goto out1; + } + if (li->li_queryinfo->gqi_lockinfo) { + li->li_queryinfo->gqi_lockinfo = + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize, + GFP_KERNEL); + if (!li->li_queryinfo->gqi_lockinfo) + goto out2; + } + + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo; + + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb, + kparams->flags, /* query */ + li->li_queryinfo, + ast_routine, li); + + out2: + kfree(li->li_queryinfo); + + out1: + kfree(li); + return status; +} + +static struct lock_info *allocate_lockinfo(struct file_info *fi, 
struct dlm_lock_params *kparams) +{ + struct lock_info *li; + + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); + if (li) { + li->li_magic = LOCKINFO_MAGIC; + li->li_file = fi; + li->li_cmd = kparams->cmd; + li->li_queryinfo = NULL; + li->li_flags = 0; + li->li_pend_bastparam = NULL; + li->li_pend_bastaddr = NULL; + li->li_lksb.sb_lvbptr = NULL; + li->li_bastaddr = kparams->bastaddr; + li->li_bastparam = kparams->bastparam; + + get_file_info(fi); + } + return li; +} + +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams, + const char *buffer) +{ + struct lock_info *li; + int status; + char name[DLM_RESNAME_MAXLEN]; + void *lvbptr; + + /* + * Validate things that we need to have correct. + */ + if (!kparams->castaddr) + return -EINVAL; + + if (!kparams->lksb) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, kparams->lksb, sizeof(struct dlm_lksb))) + return -EFAULT; + + /* Persistent child locks are not available yet */ + if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent) + return -EINVAL; + + /* For conversions, the lock will already have a lock_info + block squirelled away in astparam */ + if (kparams->flags & DLM_LKF_CONVERT) { + struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid); + if (!lkb) { + return -EINVAL; + } + + li = (struct lock_info *)lkb->lkb_astparam; + + /* li may be NULL if the lock was PERSISTENT and the process went + away, so we need to allocate a new one */ + if (!li) { + li = allocate_lockinfo(fi, kparams); + if (li) { + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + } + else { + return -ENOMEM; + } + } + + if (li->li_magic != LOCKINFO_MAGIC) + return -EINVAL; + + /* For conversions don't overwrite the current blocking AST + info so that: + a) if a blocking AST fires before the conversion is queued + it runs the current handler + b) if the conversion is cancelled, the original blocking AST + declaration is active + The pend_ info is made active when the conversion + completes. + */ + li->li_pend_bastaddr = kparams->bastaddr; + li->li_pend_bastparam = kparams->bastparam; + } + else { + li = allocate_lockinfo(fi, kparams); + if (!li) + return -ENOMEM; + + /* Get the lock name */ + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name), + kparams->namelen)) { + return -EFAULT; + } + + /* semaphore to allow us to complete our work before + the AST routine runs. In fact we only need (and use) this + when the initial lock fails */ + init_MUTEX_LOCKED(&li->li_firstlock); + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); + } + + li->li_user_lksb = kparams->lksb; + li->li_castaddr = kparams->castaddr; + li->li_castparam = kparams->castparam; + + /* Copy the user's LKSB into kernel space, + needed for conversions & value block operations. 
+ Save our kernel-space lvbptr first */ + lvbptr = li->li_lksb.sb_lvbptr; + if (copy_from_user(&li->li_lksb, kparams->lksb, sizeof(struct dlm_lksb))) { + status = -EFAULT; + goto out_err; + } + /* Store new userland LVBptr and restore kernel one */ + li->li_user_lvbptr = li->li_lksb.sb_lvbptr; + li->li_lksb.sb_lvbptr = lvbptr; + + /* Copy in the value block */ + if (kparams->flags & DLM_LKF_VALBLK) { + if (!li->li_lksb.sb_lvbptr) { + li->li_lksb.sb_lvbptr = kmalloc(DLM_LVB_LEN, GFP_KERNEL); + if (!li->li_lksb.sb_lvbptr) { + status = -ENOMEM; + goto out_err; + } + } + + if (copy_from_user(li->li_lksb.sb_lvbptr, kparams->lksb->sb_lvbptr, + DLM_LVB_LEN)) { + status = -EFAULT; + goto out_err; + } + } + else { + li->li_user_lvbptr = NULL; + } + + /* Lock it ... */ + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb, + kparams->flags, name, kparams->namelen, + kparams->parent, + ast_routine, + li, + (li->li_pend_bastaddr || li->li_bastaddr) ? + bast_routine : NULL, + kparams->range.ra_end ? &kparams->range : NULL); + + /* If it succeeded (this far) with a new lock then keep track of + it on the file's lkb list */ + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) { + + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + + up(&li->li_firstlock); + + /* Copy the lkid back to userspace in case they want to cancel. + This address has already been tested so /should/ be OK, if not: + tough - we've taken the lock! */ + copy_to_user(&kparams->lksb->sb_lkid, + &li->li_lksb.sb_lkid, + sizeof(li->li_lksb.sb_lkid)); + } + + return status; + + out_err: + if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) { + + release_lockinfo(li); + } + return status; + +} + +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams) +{ + struct lock_info *li; + struct dlm_lkb *lkb; + int status; + int convert_cancel = 0; + + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid); + if (!lkb) { + return -EINVAL; + } + + /* Cancelling a conversion doesn't remove the lock...*/ + if (kparams->flags & DLM_LKF_CANCEL && + lkb->lkb_status == GDLM_LKSTS_CONVERT) { + convert_cancel = 1; + } + + li = (struct lock_info *)lkb->lkb_astparam; + if (!li) { + li = allocate_lockinfo(fi, kparams); + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + } + if (!li) + return -ENOMEM; + + if (li->li_magic != LOCKINFO_MAGIC) + return -EINVAL; + + li->li_user_lksb = kparams->lksb; + li->li_castparam = kparams->castparam; + li->li_cmd = kparams->cmd; + + /* dlm_unlock() passes a 0 for castaddr which means don't overwrite + the existing li_castaddr as that's the completion routine for + unlocks. dlm_unlock_wait() specifies a new AST routine to be + executed when the unlock completes. 
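+ Either way the completion is queued back to the caller
+ through ast_routine() like any other AST.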
*/ + if (kparams->castaddr) + li->li_castaddr = kparams->castaddr; + + /* Have to do it here cos the lkb may not exist after + * dlm_unlock() */ + if (!convert_cancel) { + spin_lock(&fi->fi_lkb_lock); + list_del(&li->li_ownerqueue); + spin_unlock(&fi->fi_lkb_lock); + } + + /* Use existing lksb & astparams */ + status = dlm_unlock(fi->fi_ls->ls_lockspace, + kparams->lkid, + kparams->flags, &li->li_lksb, li); + if (status && !convert_cancel) { + /* It failed, put it back on the list */ + spin_lock(&fi->fi_lkb_lock); + list_add(&li->li_ownerqueue, &fi->fi_lkb_list); + spin_unlock(&fi->fi_lkb_lock); + } + + return status; +} + +/* Write call, submit a locking request */ +static ssize_t dlm_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct file_info *fi = file->private_data; + struct dlm_lock_params kparams; + sigset_t tmpsig; + sigset_t allsigs; + int status; + + if (count < sizeof(kparams)-1) /* -1 because lock name is optional */ + return -EINVAL; + + /* Has the lockspace been deleted */ + if (test_bit(1, &fi->fi_ls->ls_flags)) + return -ENOENT; + + /* Get the command info */ + if (copy_from_user(&kparams, buffer, sizeof(kparams))) + return -EFAULT; + + if (check_version(&kparams)) + return -EINVAL; + + /* Block signals while we are doing this */ + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + switch (kparams.cmd) + { + case DLM_USER_LOCK: + status = do_user_lock(fi, &kparams, buffer); + break; + + case DLM_USER_UNLOCK: + status = do_user_unlock(fi, &kparams); + break; + + case DLM_USER_QUERY: + status = do_user_query(fi, &kparams); + break; + + default: + status = -EINVAL; + break; + } + /* Restore signals */ + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + recalc_sigpending(); + + if (status == 0) + return count; + else + return status; +} + +/* Called when the cluster is shutdown uncleanly, all lockspaces + have been summarily removed */ +void dlm_device_free_devices() +{ + struct user_ls *tmp; + struct user_ls *lsinfo; + + down(&user_ls_lock); + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) { + misc_deregister(&lsinfo->ls_miscinfo); + + /* Tidy up, but don't delete the lsinfo struct until + all the users have closed their devices */ + list_del(&lsinfo->ls_list); + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */ + lsinfo->ls_lockspace = NULL; + } + up(&user_ls_lock); +} + +static struct file_operations _dlm_fops = { + .open = dlm_open, + .release = dlm_close, + .ioctl = dlm_ioctl, + .read = dlm_read, + .write = dlm_write, + .poll = dlm_poll, + .owner = THIS_MODULE, +}; + +static struct file_operations _dlm_ctl_fops = { + .open = dlm_ctl_open, + .release = dlm_ctl_close, + .ioctl = dlm_ctl_ioctl, + .owner = THIS_MODULE, +}; + +/* + * Create control device + */ +int dlm_device_init(void) +{ + int r; + + INIT_LIST_HEAD(&user_ls_list); + init_MUTEX(&user_ls_lock); + + ctl_device.name = "dlm-control"; + ctl_device.fops = &_dlm_ctl_fops; + ctl_device.minor = MISC_DYNAMIC_MINOR; + + r = misc_register(&ctl_device); + if (r) { + log_print("misc_register failed for DLM control device"); + return r; + } + + return 0; +} + +void dlm_device_exit(void) +{ + misc_deregister(&ctl_device); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/device.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,19 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DEVICE_DOT_H__ +#define __DEVICE_DOT_H__ + +extern void dlm_device_free_devices(void); + +#endif /* __DEVICE_DOT_H__ */ diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/dir.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,471 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "nodes.h" +#include "lockspace.h" +#include "lowcomms.h" +#include "reccomms.h" +#include "rsb.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "util.h" + +struct resmov { + uint32_t rm_nodeid; + uint16_t rm_length; + uint16_t rm_pad; +}; + +void print_name(char *b, int len) +{ + int i; + for (i = 0; i < len; i++) + printk("%c", b[i]); + printk("\n"); +} + +static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de) +{ + spin_lock(&ls->ls_recover_list_lock); + list_add(&de->list, &ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); +} + +static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) +{ + int found = FALSE; + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry(de, &ls->ls_recover_list, list) { + if (de->length == len) { + list_del(&de->list); + de->master_nodeid = 0; + memset(de->name, 0, len); + found = TRUE; + break; + } + } + spin_unlock(&ls->ls_recover_list_lock); + + if (!found) + de = allocate_direntry(ls, len); + return de; +} + +void clear_free_de(struct dlm_ls *ls) +{ + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + while (!list_empty(&ls->ls_recover_list)) { + de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, + list); + list_del(&de->list); + free_direntry(de); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +/* + * We use the upper 16 bits of the hash value to select the directory node. + * Low bits are used for distribution of rsb's among hash buckets on each node. + * + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of + * num_nodes to the hash value. This value in the desired range is used as an + * offset into the sorted list of nodeid's to give the particular nodeid of the + * directory node. 
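+ *
+ * For example, with 3 nodes in the lockspace a name hashing to 0x00070000
+ * gives (0x0007 % 3) = 1, so the second nodeid in the sorted node list is
+ * the directory node for that resource.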
+ */ + +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length) +{ + struct list_head *tmp; + struct dlm_csb *csb = NULL; + uint32_t hash, node, n = 0, nodeid; + + if (ls->ls_num_nodes == 1) { + nodeid = our_nodeid(); + goto out; + } + + hash = dlm_hash(name, length); + node = (hash >> 16) % ls->ls_num_nodes; + + if (ls->ls_node_array) { + nodeid = ls->ls_node_array[node]; + goto out; + } + + list_for_each(tmp, &ls->ls_nodes) { + if (n++ != node) + continue; + csb = list_entry(tmp, struct dlm_csb, list); + break; + } + + DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u\n", + ls->ls_num_nodes, n, node);); + nodeid = csb->node->nodeid; + out: + return nodeid; +} + +uint32_t get_directory_nodeid(struct dlm_rsb *rsb) +{ + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name, + rsb->res_length); +} + +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) +{ + uint32_t val; + + val = dlm_hash(name, len); + val &= (ls->ls_dirtbl_size - 1); + + return val; +} + +static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de) +{ + uint32_t bucket; + + bucket = dir_hash(ls, de->name, de->length); + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); +} + +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name, + int namelen, uint32_t bucket) +{ + struct dlm_direntry *de; + + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) { + if (de->length == namelen && !memcmp(name, de->name, namelen)) + goto out; + } + de = NULL; + out: + return de; +} + +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen) +{ + struct dlm_direntry *de; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + write_lock(&ls->ls_dirtbl[bucket].lock); + + de = search_bucket(ls, name, namelen, bucket); + + if (!de) { + log_all(ls, "remove fr %u none", nodeid); + print_name(name, namelen); + goto out; + } + + if (de->master_nodeid != nodeid) { + log_all(ls, "remove fr %u ID %u", nodeid, de->master_nodeid); + print_name(name, namelen); + goto out; + } + + list_del(&de->list); + free_direntry(de); + out: + write_unlock(&ls->ls_dirtbl[bucket].lock); +} + +void dlm_dir_clear(struct dlm_ls *ls) +{ + struct list_head *head; + struct dlm_direntry *de; + int i; + + for (i = 0; i < ls->ls_dirtbl_size; i++) { + write_lock(&ls->ls_dirtbl[i].lock); + head = &ls->ls_dirtbl[i].list; + while (!list_empty(head)) { + de = list_entry(head->next, struct dlm_direntry, list); + list_del(&de->list); + put_free_de(ls, de); + } + write_unlock(&ls->ls_dirtbl[i].lock); + } +} + +static void resmov_in(struct resmov *rm, char *buf) +{ + struct resmov tmp; + + memcpy(&tmp, buf, sizeof(struct resmov)); + + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid); + rm->rm_length = be16_to_cpu(tmp.rm_length); +} + +int dlm_dir_rebuild_local(struct dlm_ls *ls) +{ + struct dlm_csb *csb; + struct dlm_direntry *de; + struct dlm_rcom *rc; + struct resmov mov, last_mov; + char *b, *last_name; + int error = -ENOMEM, count = 0; + + log_all(ls, "rebuild resource directory"); + + dlm_dir_clear(ls); + + rc = allocate_rcom_buffer(ls); + if (!rc) + goto out; + + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); + if (!last_name) + goto free_rc; + + list_for_each_entry(csb, &ls->ls_nodes, list) { + last_mov.rm_length = 0; + for (;;) { + error = dlm_recovery_stopped(ls); + if (error) + goto free_last; + + memcpy(rc->rc_buf, last_name, last_mov.rm_length); + rc->rc_datalen = last_mov.rm_length; + + error = rcom_send_message(ls, csb->node->nodeid, + 
RECCOMM_RECOVERNAMES, rc, 1); + if (error) + goto free_last; + + schedule(); + + /* + * pick each res out of buffer + */ + + b = rc->rc_buf; + + for (;;) { + resmov_in(&mov, b); + b += sizeof(struct resmov); + + /* Length of 0 with a non-zero nodeid marks the + * end of the list */ + if (!mov.rm_length && mov.rm_nodeid) + goto done; + + /* This is just the end of the block */ + if (!mov.rm_length) + break; + + DLM_ASSERT(mov.rm_nodeid == csb->node->nodeid,); + + error = -ENOMEM; + de = get_free_de(ls, mov.rm_length); + if (!de) + goto free_last; + + de->master_nodeid = mov.rm_nodeid; + de->length = mov.rm_length; + memcpy(de->name, b, mov.rm_length); + b += mov.rm_length; + + add_entry_to_hash(ls, de); + count++; + + last_mov = mov; + memset(last_name, 0, DLM_RESNAME_MAXLEN); + memcpy(last_name, de->name, de->length); + } + } + done: + ; + } + + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags); + error = 0; + + log_all(ls, "rebuilt %d resources", count); + + free_last: + kfree(last_name); + + free_rc: + free_rcom_buffer(rc); + + out: + clear_free_de(ls); + return error; +} + +/* + * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as + * many resource names as can fit in the buffer. + */ + +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, uint32_t nodeid) +{ + struct list_head *list; + struct dlm_rsb *start_rsb = NULL, *rsb; + int offset = 0, start_namelen, error; + char *start_name; + struct resmov tmp; + uint32_t dir_nodeid; + + /* + * Find the rsb where we left off (or start again) + */ + + start_namelen = inlen; + start_name = inbuf; + + if (start_namelen > 1) { + error = find_rsb(ls, NULL, start_name, start_namelen, 0, + &start_rsb); + DLM_ASSERT(!error && start_rsb, printk("error %d\n", error);); + release_rsb(start_rsb); + } + + /* + * Send rsb names for rsb's we're master of and whose directory node + * matches the requesting node. + */ + + down_read(&ls->ls_root_lock); + if (start_rsb) + list = start_rsb->res_rootlist.next; + else + list = ls->ls_rootres.next; + + for (offset = 0; list != &ls->ls_rootres; list = list->next) { + rsb = list_entry(list, struct dlm_rsb, res_rootlist); + if (rsb->res_nodeid) + continue; + + dir_nodeid = get_directory_nodeid(rsb); + if (dir_nodeid != nodeid) + continue; + + if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) { + /* Write end-of-block record */ + memset(&tmp, 0, sizeof(struct resmov)); + memcpy(outbuf + offset, &tmp, sizeof(struct resmov)); + offset += sizeof(struct resmov); + goto out; + } + + memset(&tmp, 0, sizeof(struct resmov)); + tmp.rm_nodeid = cpu_to_be32(our_nodeid()); + tmp.rm_length = cpu_to_be16(rsb->res_length); + + memcpy(outbuf + offset, &tmp, sizeof(struct resmov)); + offset += sizeof(struct resmov); + + memcpy(outbuf + offset, rsb->res_name, rsb->res_length); + offset += rsb->res_length; + } + + /* + * If we've reached the end of the list (and there's room) write a + * terminating record. 
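+ *
+ * On the wire each entry is a struct resmov (big-endian nodeid and name
+ * length) followed by the name bytes, which is exactly what
+ * dlm_dir_rebuild_local() parses above: a zero length ends a block, and
+ * a zero length with a non-zero nodeid ends the whole listing.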
+ */ + + if ((list == &ls->ls_rootres) && + (offset + sizeof(struct resmov) <= outlen)) { + + memset(&tmp, 0, sizeof(struct resmov)); + /* This only needs to be non-zero */ + tmp.rm_nodeid = cpu_to_be32(1); + /* and this must be zero */ + tmp.rm_length = 0; + memcpy(outbuf + offset, &tmp, sizeof(struct resmov)); + offset += sizeof(struct resmov); + } + + out: + up_read(&ls->ls_root_lock); + return offset; +} + +static int get_entry(struct dlm_ls *ls, uint32_t nodeid, char *name, + int namelen, uint32_t *r_nodeid) +{ + struct dlm_direntry *de, *tmp; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + write_lock(&ls->ls_dirtbl[bucket].lock); + de = search_bucket(ls, name, namelen, bucket); + if (de) { + *r_nodeid = de->master_nodeid; + write_unlock(&ls->ls_dirtbl[bucket].lock); + if (*r_nodeid == nodeid) + return -EEXIST; + return 0; + } + + write_unlock(&ls->ls_dirtbl[bucket].lock); + + de = allocate_direntry(ls, namelen); + if (!de) + return -ENOMEM; + + de->master_nodeid = nodeid; + de->length = namelen; + memcpy(de->name, name, namelen); + + write_lock(&ls->ls_dirtbl[bucket].lock); + tmp = search_bucket(ls, name, namelen, bucket); + if (tmp) { + free_direntry(de); + de = tmp; + } else { + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); + } + *r_nodeid = de->master_nodeid; + write_unlock(&ls->ls_dirtbl[bucket].lock); + return 0; +} + +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen, + uint32_t *r_nodeid) +{ + return get_entry(ls, nodeid, name, namelen, r_nodeid); +} + +/* + * The node with lowest id queries all nodes to determine when all are done. + * All other nodes query the low nodeid for this. + */ + +int dlm_dir_rebuild_wait(struct dlm_ls *ls) +{ + int error; + + if (ls->ls_low_nodeid == our_nodeid()) { + error = dlm_wait_status_all(ls, RESDIR_VALID); + if (!error) + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags); + } else + error = dlm_wait_status_low(ls, RESDIR_ALL_VALID); + + return error; +} diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/dir.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +void print_name(char *b, int len); +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length); +uint32_t get_directory_nodeid(struct dlm_rsb *rsb); + +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen, + uint32_t *r_nodeid); +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, + int namelen); +int dlm_dir_rebuild_local(struct dlm_ls *ls); +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, uint32_t nodeid); +int dlm_dir_rebuild_wait(struct dlm_ls * ls); +void dlm_dir_clear(struct dlm_ls *ls); +void dlm_dir_dump(struct dlm_ls *ls); +void clear_free_de(struct dlm_ls *ls); + +#endif /* __DIR_DOT_H__ */ diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,612 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_INTERNAL_DOT_H__ +#define __DLM_INTERNAL_DOT_H__ + +/* + * This is the main header file to be included in each DLM source file. + */ + +#define DLM_RELEASE_NAME "" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef TRUE +#define TRUE (1) +#endif + +#ifndef FALSE +#define FALSE (0) +#endif + +#if (BITS_PER_LONG == 64) +#define PRIu64 "lu" +#define PRId64 "ld" +#define PRIo64 "lo" +#define PRIx64 "lx" +#define PRIX64 "lX" +#define SCNu64 "lu" +#define SCNd64 "ld" +#define SCNo64 "lo" +#define SCNx64 "lx" +#define SCNX64 "lX" +#else +#define PRIu64 "Lu" +#define PRId64 "Ld" +#define PRIo64 "Lo" +#define PRIx64 "Lx" +#define PRIX64 "LX" +#define SCNu64 "Lu" +#define SCNd64 "Ld" +#define SCNo64 "Lo" +#define SCNx64 "Lx" +#define SCNX64 "LX" +#endif + +#define wchan_cond_sleep_intr(chan, sleep_cond) \ +do \ +{ \ + DECLARE_WAITQUEUE(__wait_chan, current); \ + current->state = TASK_INTERRUPTIBLE; \ + add_wait_queue(&chan, &__wait_chan); \ + if ((sleep_cond)) \ + schedule(); \ + remove_wait_queue(&chan, &__wait_chan); \ + current->state = TASK_RUNNING; \ +} \ +while (0) + +static inline int check_timeout(unsigned long stamp, unsigned int seconds) +{ + return time_after(jiffies, stamp + seconds * HZ); +} + + +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args) + +#define log_all(ls, fmt, args...) 
\ + do { \ + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \ + dlm_debug_log(ls, fmt, ##args); \ + } while (0) + +#define log_error log_all + +#if defined(DLM_DEBUG2) +int nibbler_printf(const char *fmt, ...); +#define log_debug2(fmt, args...) nibbler_printf(fmt"\n", ##args) +#else +#define log_debug2(fmt, args...) +#endif + +#define DLM_DEBUG +#if defined(DLM_DEBUG) +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args) +#else +#define log_debug(ls, fmt, args...) +#endif + +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL) +#undef log_debug +#define log_debug log_all +#endif + + +#define DLM_ASSERT(x, do) \ +{ \ + if (!(x)) \ + { \ + dlm_locks_dump(); \ + dlm_debug_dump(); \ + printk("\nDLM: Assertion failed on line %d of file %s\n" \ + "DLM: assertion: \"%s\"\n" \ + "DLM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + BUG(); \ + panic("DLM: Record message above and reboot.\n"); \ + } \ +} + + +struct dlm_ls; +struct dlm_lkb; +struct dlm_rsb; +struct dlm_csb; +struct dlm_node; +struct dlm_lkbtable; +struct dlm_rsbtable; +struct dlm_dirtable; +struct dlm_direntry; +struct dlm_recover; +struct dlm_header; +struct dlm_request; +struct dlm_reply; +struct dlm_rcom; +struct dlm_query_request; +struct dlm_query_reply; + + +struct dlm_direntry { + struct list_head list; + uint32_t master_nodeid; + uint16_t length; + char name[1]; +}; + +struct dlm_dirtable { + struct list_head list; + rwlock_t lock; +}; + +struct dlm_rsbtable { + struct list_head list; + rwlock_t lock; +}; + +struct dlm_lkbtable { + struct list_head list; + rwlock_t lock; + uint16_t counter; +}; + +/* + * Cluster node (per node in cluster) + */ + +struct dlm_node { + struct list_head list; + uint32_t nodeid; + atomic_t refcount; /* num csb's referencing */ +}; + +/* + * Cluster System Block (per node in a ls) + */ + +struct dlm_csb { + struct list_head list; /* per-lockspace node list */ + struct dlm_node * node; /* global node structure */ + int gone_event; /* event id when node removed */ +}; + +/* + * Used to save and manage recovery state for a lockspace. 
+ */ + +struct dlm_recover { + struct list_head list; + uint32_t * nodeids; + int node_count; + int event_id; +}; + +/* + * Elements in the range array + */ + +#define GR_RANGE_START (0) +#define GR_RANGE_END (1) +#define RQ_RANGE_START (2) +#define RQ_RANGE_END (3) + +/* + * Lockspace structure + */ + +#define LSFL_WORK (0) +#define LSFL_LS_RUN (1) +#define LSFL_LS_STOP (2) +#define LSFL_LS_START (3) +#define LSFL_LS_FINISH (4) +#define LSFL_RECCOMM_WAIT (5) +#define LSFL_RECCOMM_READY (6) +#define LSFL_NOTIMERS (7) +#define LSFL_FINISH_RECOVERY (8) +#define LSFL_RESDIR_VALID (9) +#define LSFL_ALL_RESDIR_VALID (10) +#define LSFL_NODES_VALID (11) +#define LSFL_ALL_NODES_VALID (12) +#define LSFL_REQUEST_WARN (13) +#define LSFL_RECOVERD_EXIT (14) + +#define LSST_NONE (0) +#define LSST_INIT (1) +#define LSST_INIT_DONE (2) +#define LSST_CLEAR (3) +#define LSST_WAIT_START (4) +#define LSST_RECONFIG_DONE (5) + +struct dlm_ls { + struct list_head ls_list; /* list of lockspaces */ + uint32_t ls_local_id; /* local unique lockspace ID */ + uint32_t ls_global_id; /* global unique lockspace ID */ + int ls_allocation; /* Memory allocation policy */ + int ls_count; /* reference count */ + unsigned long ls_flags; /* LSFL_ */ + + struct dlm_rsbtable * ls_rsbtbl; + uint32_t ls_rsbtbl_size; + + struct dlm_lkbtable * ls_lkbtbl; + uint32_t ls_lkbtbl_size; + + struct dlm_dirtable * ls_dirtbl; + uint32_t ls_dirtbl_size; + + struct list_head ls_nodes; /* current nodes in ls */ + struct list_head ls_nodes_gone; /* dead node list, recovery */ + uint32_t ls_num_nodes; /* number of nodes in ls */ + uint32_t ls_low_nodeid; + uint32_t * ls_node_array; + + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a + parent lock racing with a + new child lock */ + + struct list_head ls_deadlockq; /* List of locks in conversion + ordered by duetime. 
for + deadlock detection */ + + /* recovery related */ + + struct task_struct * ls_recoverd_task; + struct semaphore ls_recoverd_lock; + struct list_head ls_recover; /* dlm_recover structs */ + spinlock_t ls_recover_lock; + int ls_last_stop; + int ls_last_start; + int ls_last_finish; + int ls_state; /* recovery states */ + + struct rw_semaphore ls_in_recovery; /* block local requests */ + struct list_head ls_requestqueue;/* queue remote requests */ + struct semaphore ls_requestqueue_lock; + + struct dlm_rcom * ls_rcom; /* recovery comms */ + uint32_t ls_rcom_msgid; + struct semaphore ls_rcom_lock; + + struct list_head ls_recover_list; + spinlock_t ls_recover_list_lock; + int ls_recover_list_count; + wait_queue_head_t ls_wait_general; + + struct list_head ls_rootres; /* root resources */ + struct rw_semaphore ls_root_lock; /* protect rootres list */ + + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees + we're deserialising */ + int ls_namelen; + char ls_name[1]; +}; + +/* + * Resource block + */ + +#define RESFL_NEW_MASTER (0) +#define RESFL_RECOVER_LIST (1) +#define RESFL_MASTER (2) + +struct dlm_rsb { + struct list_head res_hashchain; + uint32_t res_bucket; + + struct dlm_ls * res_ls; /* The owning lockspace */ + + struct list_head res_rootlist; /* List of root rsb's */ + + struct list_head res_subreslist; /* List of all sub-resources + for this root rsb */ + + uint8_t res_depth; /* Depth in resource tree */ + unsigned long res_flags; /* Flags, RESFL_ */ + + struct list_head res_grantqueue; + struct list_head res_convertqueue; + struct list_head res_waitqueue; + + uint32_t res_nodeid; /* nodeid of master node */ + + struct dlm_rsb * res_root; /* root rsb if a subresource */ + struct dlm_rsb * res_parent; /* parent rsb (if any) */ + + atomic_t res_ref; /* Number of lkb's */ + uint16_t res_remasterid; /* ID used during remaster */ + + struct list_head res_recover_list; /* General list for use + during recovery */ + int res_recover_msgid; + int res_newlkid_expect; + + struct rw_semaphore res_lock; + + char * res_lvbptr; /* Lock value block */ + + uint8_t res_length; + char res_name[1]; /* bytes */ +}; + +/* + * Lock block. To avoid confusion, where flags mirror the public flags, they + * should have the same value. + * + * In general, DLM_LKF flags from dlm.h apply only to lkb_lockqueue_flags + * and GDLM_LKFLG flags from dlm_internal.h apply only to lkb_flags. + * The rr_flags field in the request struct is a copy of lkb_lockqueue_flags. + * There is one dangerous exception: GDLM_LKFLG_RANGE is set in rr_flags + * when sending a remote range lock request. This value is then copied into + * the remote lkb_lockqueue_flags field. This means GDLM_LKFLG_RANGE must + * not have the same value as any external DLM_LKF flag. 
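+ *
+ * (As the definitions below show, the internal-only flags start at
+ * 0x00010000 while the mirrored external values stop at 0x00004000, so
+ * the two ranges cannot collide.)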
+ */ + +#define GDLM_LKSTS_NEW (0) +#define GDLM_LKSTS_WAITING (1) +#define GDLM_LKSTS_GRANTED (2) +#define GDLM_LKSTS_CONVERT (3) + +/* mirror external flags */ +#define GDLM_LKFLG_VALBLK (0x00000008) +#define GDLM_LKFLG_PERSISTENT (0x00000080) +#define GDLM_LKFLG_NODLCKWT (0x00000100) +#define GDLM_LKFLG_EXPEDITE (0x00000400) +#define GDLM_LKFLG_ORPHAN (0x00004000) +/* external flags now go up to: (0x00004000) : DLM_LKF_ORPHAN */ + +/* internal-only flags */ +#define GDLM_LKFLG_RANGE (0x00010000) +#define GDLM_LKFLG_MSTCPY (0x00020000) +#define GDLM_LKFLG_DELETED (0x00040000) +#define GDLM_LKFLG_LQCONVERT (0x00080000) +#define GDLM_LKFLG_LQRESEND (0x00100000) +#define GDLM_LKFLG_DEMOTED (0x00200000) +#define GDLM_LKFLG_RESENT (0x00400000) +#define GDLM_LKFLG_NOREBUILD (0x00800000) +#define GDLM_LKFLG_UNLOCKDONE (0x01000000) + +#define AST_COMP (1) +#define AST_BAST (2) +#define AST_DEL (4) + +struct dlm_lkb { + uint32_t lkb_flags; + uint16_t lkb_status; /* grant, wait, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ + uint32_t lkb_retstatus; /* status to return in lksb */ + uint32_t lkb_id; /* our lock ID */ + struct dlm_lksb * lkb_lksb; /* status block of caller */ + struct list_head lkb_idtbl_list; /* lockidtbl */ + struct list_head lkb_statequeue; /* rsb's g/c/w queue */ + struct dlm_rsb * lkb_resource; + struct dlm_lkb * lkb_parent; /* parent lock if any */ + atomic_t lkb_childcnt; /* number of children */ + + struct list_head lkb_lockqueue; /* queue of locks waiting + for remote reply */ + int lkb_lockqueue_state; /* reason on lockqueue */ + uint32_t lkb_lockqueue_flags; /* as passed into + lock/unlock */ + int lkb_ownpid; /* pid of lock owner */ + unsigned long lkb_lockqueue_time; /* time lkb went on the + lockqueue */ + unsigned long lkb_duetime; /* for deadlock detection */ + + uint32_t lkb_remid; /* id on remote partner */ + uint32_t lkb_nodeid; /* id of remote partner */ + void * lkb_astaddr; + void * lkb_bastaddr; + long lkb_astparam; + struct list_head lkb_astqueue; /* locks with asts to deliver */ + uint16_t lkb_astflags; /* COMP, BAST, DEL */ + uint8_t lkb_bastmode; /* requested mode */ + uint8_t lkb_highbast; /* highest mode bast sent for */ + + struct dlm_request * lkb_request; + + struct list_head lkb_deadlockq; /* ls_deadlockq list */ + + char * lkb_lvbptr; /* points to lksb lvb on local + lock, allocated lvb on + on remote lock */ + uint64_t * lkb_range; /* Points to an array of 64 bit + numbers that represent the + requested and granted ranges + of the lock. NULL implies + 0-ffffffffffffffff */ +}; + +/* + * Header part of the mid-level comms system. All packets start with + * this header so we can identify them. The comms packet can + * contain many of these structs but the are split into individual + * work units before being passed to the lockqueue routines. + * below this are the structs that this is a header for + */ + +struct dlm_header { + uint8_t rh_cmd; /* What we are */ + uint8_t rh_flags; /* maybe just a pad */ + uint16_t rh_length; /* Length of struct (so we can + send many in 1 message) */ + uint32_t rh_lkid; /* Lock ID tag: ie the local + (requesting) lock ID */ + uint32_t rh_lockspace; /* Lockspace ID */ +} __attribute__((packed)); + +/* + * This is the struct used in a remote lock/unlock/convert request + * The mid-level comms API should turn this into native byte order. + * Most "normal" lock operations will use these two structs for + * communications. 
Recovery operations use their own structs + * but still with the gd_req_header on the front. + */ + +struct dlm_request { + struct dlm_header rr_header; + uint32_t rr_remlkid; /* Remote lock ID */ + uint32_t rr_remparid; /* Parent's remote lock ID */ + uint32_t rr_flags; /* Flags from lock/convert req*/ + uint64_t rr_range_start; /* Yes, these are in the right + place... */ + uint64_t rr_range_end; + uint32_t rr_status; /* Status to return if this is + an AST request */ + uint32_t rr_pid; /* Owner PID of lock */ + uint8_t rr_rqmode; /* Requested lock mode */ + uint8_t rr_asts; /* Whether the LKB has ASTs */ + char rr_lvb[DLM_LVB_LEN]; + char rr_name[1]; /* As long as needs be. Only + used for directory lookups. + The length of this can be + worked out from the packet + length */ +} __attribute__((packed)); + +/* + * This is the struct returned by a remote lock/unlock/convert request + * The mid-level comms API should turn this into native byte order. + */ + +struct dlm_reply { + struct dlm_header rl_header; + uint32_t rl_lockstate; /* Whether request was + queued/granted/waiting */ + uint32_t rl_nodeid; /* nodeid of lock master */ + uint32_t rl_status; /* Status to return to caller */ + uint32_t rl_lkid; /* Remote lkid */ + char rl_lvb[DLM_LVB_LEN]; +} __attribute__((packed)); + +/* + * Recovery comms message + */ + +struct dlm_rcom { + struct dlm_header rc_header; /* 32 byte aligned */ + uint32_t rc_msgid; + uint16_t rc_datalen; + uint8_t rc_expanded; + uint8_t rc_subcmd; /* secondary command */ + char rc_buf[1]; /* first byte of data goes here + and extends beyond here for + another datalen - 1 bytes. + rh_length is set to sizeof + dlm_rcom + datalen - 1 */ +} __attribute__((packed)); + + +/* A remote query: GDLM_REMCMD_QUERY */ + +struct dlm_query_request { + struct dlm_header rq_header; + uint32_t rq_mstlkid; /* LockID on master node */ + uint32_t rq_query; /* query from the user */ + uint32_t rq_maxlocks; /* max number of locks we can + cope with */ +} __attribute__((packed)); + +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */ +/* There may be subsequent blocks of + lock info in GDLM_REMCMD_QUERYCONT messages which just have + a normal header. The last of these will have rh_flags set to + GDLM_REMFLAG_ENDQUERY + */ + +struct dlm_query_reply { + struct dlm_header rq_header; + uint32_t rq_numlocks; /* Number of locks in reply */ + uint32_t rq_startlock; /* Which lock this block starts + at (for multi-block replies) */ + uint32_t rq_status; + + /* Resource information */ + uint32_t rq_grantcount; /* No. of nodes on grantqueue */ + uint32_t rq_convcount; /* No. of nodes on convertq */ + uint32_t rq_waitcount; /* No. 
of nodes on waitqueue */ + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB + contents, if + applicable */ +} __attribute__((packed)); + +/* + * Lockqueue wait lock states + */ + +#define GDLM_LQSTATE_WAIT_RSB 1 +#define GDLM_LQSTATE_WAIT_CONVERT 2 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3 +#define GDLM_LQSTATE_WAIT_UNLOCK 4 + +/* Commands sent across the comms link */ +#define GDLM_REMCMD_LOOKUP 1 +#define GDLM_REMCMD_LOCKREQUEST 2 +#define GDLM_REMCMD_UNLOCKREQUEST 3 +#define GDLM_REMCMD_CONVREQUEST 4 +#define GDLM_REMCMD_LOCKREPLY 5 +#define GDLM_REMCMD_LOCKGRANT 6 +#define GDLM_REMCMD_SENDBAST 7 +#define GDLM_REMCMD_SENDCAST 8 +#define GDLM_REMCMD_REM_RESDATA 9 +#define GDLM_REMCMD_RECOVERMESSAGE 20 +#define GDLM_REMCMD_RECOVERREPLY 21 +#define GDLM_REMCMD_QUERY 30 +#define GDLM_REMCMD_QUERYREPLY 31 + +/* Set in rh_flags when this is the last block of + query information. Note this could also be the first + block */ +#define GDLM_REMFLAG_ENDQUERY 1 + +#ifdef CONFIG_DLM_STATS +struct dlm_statinfo +{ + unsigned int cast; + unsigned int bast; + unsigned int lockops; + unsigned int unlockops; + unsigned int convertops; + unsigned long lockqueue_time[5]; + unsigned long lockqueue_locks[5]; +}; +extern struct dlm_statinfo dlm_stats; +#endif + +#ifndef BUG_ON +#define BUG_ON(x) +#endif + +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...); +void dlm_debug_dump(void); +void dlm_locks_dump(void); + +#endif /* __DLM_INTERNAL_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lkb.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,183 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lkb.c + * + * Allocate and free locks on the lock ID table. + * + * This is slightly naff but I don't really like the + * VMS lockidtbl stuff as it uses a realloced array + * to hold the locks in. I think this is slightly better + * in some ways. + * + * Any better suggestions gratefully received. Patrick + * + */ + +#include "dlm_internal.h" +#include "lockqueue.h" +#include "lkb.h" +#include "config.h" +#include "rsb.h" +#include "memory.h" +#include "lockspace.h" +#include "util.h" + +/* + * Internal find lock by ID. Must be called with the lockidtbl spinlock held. + */ + +static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid) +{ + uint16_t bucket = lkid & 0xFFFF; + struct dlm_lkb *lkb; + + if (bucket >= ls->ls_lkbtbl_size) + goto out; + + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){ + if (lkb->lkb_id == lkid) + return lkb; + } + out: + return NULL; +} + +/* + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a + * random number between 0 and lockidtbl_size-1. This random number specifies + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially + * assigned per-bucket id. 
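+ * (Illustrative example: an lkid of 0x0007000c decodes to bucket 0x000c
+ * and per-bucket sequence 0x0007, matching how create_lkb() builds ids
+ * below: lkid = bucket | (counter++ << 16).)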
+ * + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked + * against the lkid of all lkb's in the bucket to avoid duplication. + * + */ + +struct dlm_lkb *create_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + uint32_t lkid; + uint16_t bucket; + + lkb = allocate_lkb(ls); + if (!lkb) + goto out; + + retry: + get_random_bytes(&bucket, sizeof(bucket)); + bucket &= (ls->ls_lkbtbl_size - 1); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16); + + if (__find_lock_by_id(ls, lkid)) { + write_unlock(&ls->ls_lkbtbl[bucket].lock); + goto retry; + } + + lkb->lkb_id = lkid; + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + out: + return lkb; +} + +/* + * Free LKB and remove it from the lockidtbl. + * NB - this always frees the lkb whereas release_rsb doesn't free an + * rsb unless its reference count is zero. + */ + +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + uint16_t bucket = lkb->lkb_id & 0xFFFF; + + if (lkb->lkb_status) { + log_error(ls, "release lkb with status %u", lkb->lkb_status); + print_lkb(lkb); + return; + } + + if (lkb->lkb_parent) + atomic_dec(&lkb->lkb_parent->lkb_childcnt); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + list_del(&lkb->lkb_idtbl_list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + + /* if this is not a master copy then lvbptr points into the user's + * lksb, so don't free it */ + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY) + free_lvb(lkb->lkb_lvbptr); + + if (lkb->lkb_range) + free_range(lkb->lkb_range); + + free_lkb(lkb); +} + +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid) +{ + struct dlm_lkb *lkb; + uint16_t bucket = lkid & 0xFFFF; + + read_lock(&ls->ls_lkbtbl[bucket].lock); + lkb = __find_lock_by_id(ls, lkid); + read_unlock(&ls->ls_lkbtbl[bucket].lock); + + return lkb; +} + +struct dlm_lkb *dlm_get_lkb(void *lockspace, uint32_t lkid) +{ + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace); + struct dlm_lkb *lkb = find_lock_by_id(ls, lkid); + put_lockspace(ls); + return lkb; +} + +/* + * Initialise the range parts of an LKB. + */ + +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end) +{ + int ret = -ENOMEM; + + /* + * if this wasn't already a range lock, make it one + */ + if (!lkb->lkb_range) { + lkb->lkb_range = allocate_range(lspace); + if (!lkb->lkb_range) + goto out; + + /* + * This is needed for conversions that contain ranges where the + * original lock didn't but it's harmless for new locks too. + */ + lkb->lkb_range[GR_RANGE_START] = 0LL; + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL; + } + + lkb->lkb_range[RQ_RANGE_START] = start; + lkb->lkb_range[RQ_RANGE_END] = end; + + ret = 0; + + out: + return ret; +} diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lkb.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,23 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LKB_DOT_H__ +#define __LKB_DOT_H__ + +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid); +struct dlm_lkb *create_lkb(struct dlm_ls *ls); +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb); +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid); +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end); + +#endif /* __LKB_DOT_H__ */ diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/locking.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1378 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * locking.c + * + * This is where the main work of the DLM goes on + * + */ + +#include "dlm_internal.h" +#include "lockqueue.h" +#include "locking.h" +#include "lockspace.h" +#include "lkb.h" +#include "nodes.h" +#include "dir.h" +#include "ast.h" +#include "memory.h" +#include "rsb.h" +#include "util.h" +#include "lowcomms.h" + +extern struct list_head lslist; + +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + +/* + * Lock compatibilty matrix - thanks Steve + * UN = Unlocked state. Not really a state, used as a flag + * PD = Padding. Used to make the matrix a nice power of two in size + * Other states are the same as the VMS DLM. + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) + */ + +#define modes_compat(gr, rq) \ + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] + +const int __dlm_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * Compatibility matrix for conversions with QUECVT set. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + */ + +const int __quecvt_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * This defines the direction of transfer of LVB data. + * Granted mode is the row; requested mode is the column. 
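+ * (For example, per the table below, a PR lock converting up to EX has
+ * the resource LVB returned to the caller, while an EX lock converting
+ * down to NL writes the caller's LVB out to the resource.)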
+ * Usage: matrix[grmode+1][rqmode+1] + * 1 = LVB is returned to the caller + * 0 = LVB is written to the resource + * -1 = nothing happens to the LVB + */ + +const int __lvb_operations[8][8] = { + /* UN NL CR CW PR PW EX PD*/ + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ +}; + +static void grant_lock(struct dlm_lkb *lkb, int send_remote); +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb); +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb); +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb, + uint32_t flags, void *ast, void *astarg, void *bast, + struct dlm_range *range); +static int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, + uint32_t flags, char *name, int namelen); + + +inline int dlm_modes_compat(int mode1, int mode2) +{ + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; +} + +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) +{ + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue); + + if (lkb->lkb_id == first->lkb_id) + return 1; + + return 0; +} + +/* + * Return 1 if the locks' ranges overlap + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff + */ + +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2) +{ + if (!lkb1->lkb_range || !lkb2->lkb_range) + return 1; + + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] || + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END]) + return 0; + + return 1; +} + +/* + * "A conversion deadlock arises with a pair of lock requests in the converting + * queue for one resource. The granted mode of each lock blocks the requested + * mode of the other lock." + */ + +static struct dlm_lkb *conversion_deadlock_detect(struct dlm_rsb *rsb, + struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { + if (this == lkb) + continue; + + if (!ranges_overlap(lkb, this)) + continue; + + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) + return this; + } + + return NULL; +} + +/* + * Check if the given lkb conflicts with another lkb on the queue. + */ + +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, head, lkb_statequeue) { + if (this == lkb) + continue; + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb)) + return TRUE; + } + return FALSE; +} + +/* + * Return 1 if the lock can be granted, 0 otherwise. + * Also detect and resolve conversion deadlocks. + * + * lkb is the lock to be granted + * + * now is 1 if the function is being called in the context of the + * immediate request, it is 0 if called later, after the lock has been + * queued. + * + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis + */ + +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) +{ + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); + + /* + * 6-10: Version 5.4 introduced an option to address the phenomenon of + * a new request for a NL mode lock being blocked. + * + * 6-11: If the optional EXPEDITE flag is used with the new NL mode + * request, then it would be granted. 
In essence, the use of this flag + * tells the Lock Manager to expedite theis request by not considering + * what may be in the CONVERTING or WAITING queues... As of this + * writing, the EXPEDITE flag can be used only with new requests for NL + * mode locks. This flag is not valid for conversion requests. + * + * A shortcut. Earlier checks return an error if EXPEDITE is used in a + * conversion or used with a non-NL requested mode. We also know an + * EXPEDITE request is always granted immediately, so now must always + * be 1. The full condition to grant an expedite request: (now && + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can + * therefore be shortened to just checking the flag. + */ + + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE) + return TRUE; + + /* + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be + * added to the remaining conditions. + */ + + if (queue_conflict(&r->res_grantqueue, lkb)) + goto out; + + /* + * 6-3: By default, a conversion request is immediately granted if the + * requested mode is compatible with the modes of all other granted + * locks + */ + + if (queue_conflict(&r->res_convertqueue, lkb)) + goto out; + + /* + * 6-5: But the default algorithm for deciding whether to grant or + * queue conversion requests does not by itself guarantee that such + * requests are serviced on a "first come first serve" basis. This, in + * turn, can lead to a phenomenon known as "indefinate postponement". + * + * 6-7: This issue is dealt with by using the optional QUECVT flag with + * the system service employed to request a lock conversion. This flag + * forces certain conversion requests to be queued, even if they are + * compatible with the granted modes of other locks on the same + * resource. Thus, the use of this flag results in conversion requests + * being ordered on a "first come first servce" basis. + */ + + if (now && conv && !(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)) + return TRUE; + + /* + * When using range locks the NOORDER flag is set to avoid the standard + * vms rules on grant order. + */ + + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOORDER) + return TRUE; + + /* + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be + * granted until all other conversion requests ahead of it are granted + * and/or canceled. + */ + + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) + return TRUE; + + /* + * 6-4: By default, a new request is immediately granted only if all + * three of the following conditions are satisfied when the request is + * issued: + * - The queue of ungranted conversion requests for the resource is + * empty. + * - The queue of ungranted new requests for the resource is empty. + * - The mode of the new request is compatible with the most + * restrictive mode of all granted locks on the resource. + */ + + if (now && !conv && list_empty(&r->res_convertqueue) && + list_empty(&r->res_waitqueue)) + return TRUE; + + /* + * 6-4: Once a lock request is in the queue of ungranted new requests, + * it cannot be granted until the queue of ungranted conversion + * requests is empty, all ungranted new requests ahead of it are + * granted and/or canceled, and it is compatible with the granted mode + * of the most restrictive lock granted on the resource. + */ + + if (!now && !conv && list_empty(&r->res_convertqueue) && + first_in_list(lkb, &r->res_waitqueue)) + return TRUE; + + out: + /* + * The following, enabled by CONVDEADLK, departs from VMS. 
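+ *
+ * Illustrative example: two locks on one resource both hold PR and both
+ * request conversion to EX; each granted PR blocks the other's requested
+ * EX, so neither conversion can ever complete. With CONVDEADLK set the
+ * requesting lock is demoted to NL (and flagged DEMOTED) so that the
+ * other conversion can go ahead.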
+ */ + + if (now && conv && (lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK) && + conversion_deadlock_detect(r, lkb)) { + lkb->lkb_grmode = DLM_LOCK_NL; + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED; + } + + return FALSE; +} + +int dlm_lock(void *lockspace, + uint32_t mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent, + void (*ast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode), + struct dlm_range *range) +{ + struct dlm_ls *lspace; + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL; + int ret = -EINVAL; + + lspace = find_lockspace_by_local_id(lockspace); + if (!lspace) + return ret; + + if (mode < 0 || mode > DLM_LOCK_EX) + goto out; + + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) + goto out; + + if (flags & DLM_LKF_CANCEL) + goto out; + + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && (mode != DLM_LOCK_NL)) + goto out; + + if (!ast || !lksb) + goto out; + + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) + goto out; + + /* + * Take conversion path. + */ + + if (flags & DLM_LKF_CONVERT) { + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg, + bast, range); + goto out; + } + +#ifdef CONFIG_DLM_STATS + dlm_stats.lockops++; +#endif + /* + * Take new lock path. + */ + + if (parent) { + down_read(&lspace->ls_unlock_sem); + + parent_lkb = find_lock_by_id(lspace, parent); + + if (!parent_lkb || + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED || + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY || + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) { + up_read(&lspace->ls_unlock_sem); + goto out; + } + + atomic_inc(&parent_lkb->lkb_childcnt); + up_read(&lspace->ls_unlock_sem); + } + + down_read(&lspace->ls_in_recovery); + + ret = -ENOMEM; + + lkb = create_lkb(lspace); + if (!lkb) + goto fail_dec; + lkb->lkb_astaddr = ast; + lkb->lkb_astparam = (long) astarg; + lkb->lkb_bastaddr = bast; + lkb->lkb_rqmode = mode; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_nodeid = -1; + lkb->lkb_lksb = lksb; + lkb->lkb_parent = parent_lkb; + lkb->lkb_lockqueue_flags = flags; + lkb->lkb_lvbptr = lksb->sb_lvbptr; + + if (!in_interrupt() && current) + lkb->lkb_ownpid = (int) current->pid; + else + lkb->lkb_ownpid = 0; + + if (range) { + if (range->ra_start > range->ra_end) { + ret = -EINVAL; + goto fail_free; + } + + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end)) + goto fail_free; + } + + /* Convert relevant flags to internal numbers */ + if (flags & DLM_LKF_VALBLK) + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; + if (flags & DLM_LKF_PERSISTENT) + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT; + if (flags & DLM_LKF_NODLCKWT) + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT; + + lksb->sb_lkid = lkb->lkb_id; + + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen); + if (ret) + goto fail_free; + + up_read(&lspace->ls_in_recovery); + + wake_astd(); + + put_lockspace(lspace); + return 0; + + fail_free: + release_lkb(lspace, lkb); + goto fail_unlock; + + fail_dec: + if (parent_lkb) + atomic_dec(&parent_lkb->lkb_childcnt); + + fail_unlock: + up_read(&lspace->ls_in_recovery); + + out: + put_lockspace(lspace); + 
return ret; +} + +int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, uint32_t flags, + char *name, int namelen) +{ + struct dlm_rsb *rsb, *parent_rsb = NULL; + struct dlm_lkb *parent_lkb = lkb->lkb_parent; + uint32_t nodeid; + int error, dir_error = 0; + + if (parent_lkb) + parent_rsb = parent_lkb->lkb_resource; + + error = find_rsb(ls, parent_rsb, name, namelen, CREATE, &rsb); + if (error) + return error; + lkb->lkb_resource = rsb; + down_write(&rsb->res_lock); + + log_debug(ls, "(%d) rq %u %x \"%s\"", lkb->lkb_ownpid, lkb->lkb_rqmode, + lkb->lkb_id, rsb->res_name); + /* + * Next stage, do we need to find the master or can + * we get on with the real locking work ? + */ + + retry: + if (rsb->res_nodeid == -1) { + if (get_directory_nodeid(rsb) != our_nodeid()) { + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); + up_write(&rsb->res_lock); + return 0; + } + + error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name, + rsb->res_length, &nodeid); + if (error) { + DLM_ASSERT(error == -EEXIST,); + msleep(500); + dir_error = error; + goto retry; + } + + if (nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = 0; + } else { + clear_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = nodeid; + } + + if (dir_error) { + log_all(ls, "dir lookup retry %x %u", lkb->lkb_id, + nodeid); + } + } + + lkb->lkb_nodeid = rsb->res_nodeid; + up_write(&rsb->res_lock); + + error = dlm_lock_stage2(ls, lkb, rsb, flags); + + return error; +} + +/* + * Locking routine called after we have an RSB, either a copy of a remote one + * or a local one, or perhaps a shiny new one all of our very own + */ + +int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb, + uint32_t flags) +{ + int error = 0; + + DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb);); + + if (rsb->res_nodeid) { + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING); + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT); + } else { + dlm_lock_stage3(lkb); + } + + return error; +} + +/* + * Called on an RSB's master node to do stage2 locking for a remote lock + * request. Returns a proper lkb with rsb ready for lock processing. + * This is analagous to sections of dlm_lock() and dlm_lock_stage1(). + */ + +struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls, + struct dlm_request *freq) +{ + struct dlm_rsb *rsb = NULL, *parent_rsb = NULL; + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL; + int error, namelen; + + if (freq->rr_remparid) { + parent_lkb = find_lock_by_id(ls, freq->rr_remparid); + if (!parent_lkb) + goto fail; + + atomic_inc(&parent_lkb->lkb_childcnt); + parent_rsb = parent_lkb->lkb_resource; + } + + /* + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and + * node actually holding the (non-MSTCPY) lkb. AST address are just + * flags in the master copy. 
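+ * (That is, lkb_astaddr/lkb_bastaddr below hold only the AST_COMP and
+ * AST_BAST bits taken from rr_asts, not callable pointers; the real
+ * callbacks live on the requesting node.)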
+ */ + + lkb = create_lkb(ls); + if (!lkb) + goto fail_dec; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_rqmode = freq->rr_rqmode; + lkb->lkb_parent = parent_lkb; + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP); + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST); + lkb->lkb_nodeid = remote_nodeid; + lkb->lkb_remid = freq->rr_header.rh_lkid; + lkb->lkb_flags = GDLM_LKFLG_MSTCPY; + lkb->lkb_lockqueue_flags = freq->rr_flags; + + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) { + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb); + if (!lkb->lkb_lvbptr) + goto fail_free; + } + + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) { + error = lkb_set_range(ls, lkb, freq->rr_range_start, + freq->rr_range_end); + if (error) + goto fail_free; + } + + /* + * Get the RSB which this lock is for. Create a new RSB if this is a + * new lock on a new resource. We must be the master of any new rsb. + */ + + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; + + error = find_rsb(ls, parent_rsb, freq->rr_name, namelen, MASTER, &rsb); + if (error) + goto fail_free; + + if (!rsb) { + log_debug(ls, "send einval to %u", remote_nodeid); + /* print_name(freq->rr_name, namelen); */ + lkb->lkb_retstatus = -EINVAL; + goto out; + } + + lkb->lkb_resource = rsb; + + log_debug(ls, "(%d) rq %u from %u %x \"%s\"", + lkb->lkb_ownpid, lkb->lkb_rqmode, remote_nodeid, + lkb->lkb_id, rsb->res_name); + + out: + return lkb; + + fail_free: + /* release_lkb handles parent */ + release_lkb(ls, lkb); + parent_lkb = NULL; + + fail_dec: + if (parent_lkb) + atomic_dec(&parent_lkb->lkb_childcnt); + fail: + return NULL; +} + +/* + * The final bit of lock request processing on the master node. Here the lock + * is granted and the completion ast is queued, or the lock is put on the + * waitqueue and blocking asts are sent. + */ + +void dlm_lock_stage3(struct dlm_lkb *lkb) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + + /* + * This is a locally mastered lock on a resource that already exists, + * see if it can be granted or if it must wait. When this function is + * called for a remote lock request (process_cluster_request, + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the + * requesting node at the end of process_cluster_request, not at the + * end of grant_lock. + */ + + down_write(&rsb->res_lock); + + if (can_be_granted(rsb, lkb, TRUE)) { + grant_lock(lkb, 0); + goto out; + } + + /* + * This request is not a conversion, so the lkb didn't exist other than + * for this request and should be freed after EAGAIN is returned in the + * ast. + */ + + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) { + lkb->lkb_retstatus = -EAGAIN; + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST) + send_blocking_asts_all(rsb, lkb); + queue_ast(lkb, AST_COMP | AST_DEL, 0); + goto out; + } + + /* + * The requested lkb must wait. Because the rsb of the requested lkb + * is mastered here, send blocking asts for the lkb's blocking the + * request. 
+ */ + + log_debug2("w %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + lkb->lkb_retstatus = 0; + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING); + + send_blocking_asts(rsb, lkb); + + out: + up_write(&rsb->res_lock); +} + +int dlm_unlock(void *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg) +{ + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace); + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + int ret = -EINVAL; + + if (!ls) { + log_print("dlm_unlock: lkid %x lockspace not found", lkid); + return ret; + } + + lkb = find_lock_by_id(ls, lkid); + if (!lkb) { + log_debug(ls, "unlock %x no id", lkid); + goto out; + } + + /* Can't dequeue a master copy (a remote node's mastered lock) */ + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + log_debug(ls, "(%d) unlock %x lkb_flags %x", + lkb->lkb_ownpid, lkid, lkb->lkb_flags); + goto out; + } + + /* Already waiting for a remote lock operation */ + if (lkb->lkb_lockqueue_state) { + log_debug(ls, "(%d) unlock %x lq%d", + lkb->lkb_ownpid, lkid, lkb->lkb_lockqueue_state); + ret = -EBUSY; + goto out; + } + +#ifdef CONFIG_DLM_STATS + dlm_stats.unlockops++; +#endif + /* Can only cancel WAITING or CONVERTing locks. + * This is just a quick check - it is also checked in unlock_stage2() + * (which may be on the master) under the semaphore. + */ + if ((flags & DLM_LKF_CANCEL) && + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) { + log_debug(ls, "(%d) unlock %x %x %d", + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status); + goto out; + } + + /* "Normal" unlocks must operate on a granted lock */ + if (!(flags & DLM_LKF_CANCEL) && + (lkb->lkb_status != GDLM_LKSTS_GRANTED)) { + log_debug(ls, "(%d) unlock %x %x %d", + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status); + goto out; + } + + if (lkb->lkb_flags & GDLM_LKFLG_DELETED) { + log_debug(ls, "(%d) unlock deleted %x %x %d", + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status); + goto out; + } + + down_write(&ls->ls_unlock_sem); + /* Can't dequeue a lock with sublocks */ + if (atomic_read(&lkb->lkb_childcnt)) { + up_write(&ls->ls_unlock_sem); + ret = -ENOTEMPTY; + goto out; + } + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */ + if (!(flags & DLM_LKF_CANCEL)) + lkb->lkb_flags |= GDLM_LKFLG_DELETED; + up_write(&ls->ls_unlock_sem); + + down_read(&ls->ls_in_recovery); + rsb = find_rsb_to_unlock(ls, lkb); + + log_debug(ls, "(%d) un %x %x %d %d \"%s\"", + lkb->lkb_ownpid, + lkb->lkb_id, + lkb->lkb_flags, + lkb->lkb_nodeid, + rsb->res_nodeid, + rsb->res_name); + + /* Save any new params */ + if (lksb) + lkb->lkb_lksb = lksb; + lkb->lkb_astparam = (long) astarg; + lkb->lkb_lockqueue_flags = flags; + + if (lkb->lkb_nodeid) + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK); + else + ret = dlm_unlock_stage2(lkb, rsb, flags); + up_read(&ls->ls_in_recovery); + + wake_astd(); + + out: + put_lockspace(ls); + return ret; +} + +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags) +{ + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY; + int old_status; + + down_write(&rsb->res_lock); + + /* Can only cancel WAITING or CONVERTing locks */ + if ((flags & DLM_LKF_CANCEL) && + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) { + lkb->lkb_retstatus = -EINVAL; + queue_ast(lkb, AST_COMP, 0); + goto out; + } + + log_debug2("u %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + old_status = 
lkb_dequeue(lkb); + + /* + * Cancelling a conversion + */ + + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) { + /* VMS semantics say we should send blocking ASTs again here */ + send_blocking_asts(rsb, lkb); + + /* Remove from deadlock detection */ + if (lkb->lkb_duetime) + remove_from_deadlockqueue(lkb); + + /* Stick it back on the granted queue */ + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_rqmode = lkb->lkb_grmode; + + /* Was it blocking any other locks? */ + if (first_in_list(lkb, &rsb->res_convertqueue)) + grant_pending_locks(rsb); + + lkb->lkb_retstatus = -DLM_ECANCEL; + queue_ast(lkb, AST_COMP, 0); + goto out; + } + + /* + * If was granted grant any converting or waiting locks + * and save or clear lvb + */ + + if (old_status == GDLM_LKSTS_GRANTED) { + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) { + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr) + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, + DLM_LVB_LEN); + if (flags & DLM_LKF_IVVALBLK) + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN); + } + + grant_pending_locks(rsb); + } else + DLM_ASSERT(0, print_lkb(lkb); print_rsb(rsb);); + + lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK; + + if (!remote) { + queue_ast(lkb, AST_COMP | AST_DEL, 0); + } else { + up_write(&rsb->res_lock); + release_lkb(rsb->res_ls, lkb); + release_rsb(rsb); + goto out2; + } + + out: + up_write(&rsb->res_lock); + out2: + wake_astd(); + return 0; +} + +/* + * Lock conversion + */ + +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb, + uint32_t flags, void *ast, void *astarg, void *bast, + struct dlm_range *range) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + int ret = -EINVAL; + + lkb = find_lock_by_id(ls, lksb->sb_lkid); + if (!lkb) { + goto out; + } + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) { + ret = -EBUSY; + goto out; + } + + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + goto out; + } + + if ((flags & DLM_LKF_QUECVT) && + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) { + goto out; + } + + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) { + goto out; + } + +#ifdef CONFIG_DLM_STATS + dlm_stats.convertops++; +#endif + /* Set up the ranges as appropriate */ + if (range) { + if (range->ra_start > range->ra_end) + goto out; + + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) { + ret = -ENOMEM; + goto out; + } + } + + rsb = lkb->lkb_resource; + down_read(&ls->ls_in_recovery); + + log_debug(ls, "(%d) cv %u %x \"%s\"", lkb->lkb_ownpid, mode, + lkb->lkb_id, rsb->res_name); + + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK; + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED; + + if (flags & DLM_LKF_NODLCKWT) + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT; + lkb->lkb_astaddr = ast; + lkb->lkb_astparam = (long) astarg; + lkb->lkb_bastaddr = bast; + lkb->lkb_rqmode = mode; + lkb->lkb_lockqueue_flags = flags; + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0; + lkb->lkb_lvbptr = lksb->sb_lvbptr; + + if (rsb->res_nodeid) { + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT); + } else { + ret = dlm_convert_stage2(lkb, FALSE); + } + + up_read(&ls->ls_in_recovery); + + wake_astd(); + + out: + return ret; +} + +/* + * For local conversion requests on locally mastered locks this is called + * directly from dlm_lock/convert_lock. This function is also called for + * remote conversion requests of MSTCPY locks (from process_cluster_request). 
+ */ + +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + int ret = 0; + + down_write(&rsb->res_lock); + + if (can_be_granted(rsb, lkb, TRUE)) { + grant_lock(lkb, 0); + grant_pending_locks(rsb); + goto out; + } + + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) { + ret = lkb->lkb_retstatus = -EAGAIN; + if (do_ast) + queue_ast(lkb, AST_COMP, 0); + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST) + send_blocking_asts_all(rsb, lkb); + goto out; + } + + log_debug2("c %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + lkb->lkb_retstatus = 0; + lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); + + /* + * The granted mode may have been reduced to NL by conversion deadlock + * avoidance in can_be_granted(). If so, try to grant other locks. + */ + + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) + grant_pending_locks(rsb); + + send_blocking_asts(rsb, lkb); + + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT)) + add_to_deadlockqueue(lkb); + + out: + up_write(&rsb->res_lock); + return ret; +} + +/* + * Remove lkb from any queue it's on, add it to the granted queue, and queue a + * completion ast. rsb res_lock must be held in write when this is called. + */ + +static void grant_lock(struct dlm_lkb *lkb, int send_remote) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + + if (lkb->lkb_duetime) + remove_from_deadlockqueue(lkb); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + int b; + DLM_ASSERT(lkb->lkb_lvbptr,); + + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); + + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + if (b) + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN); + else + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + } + + if (lkb->lkb_range) { + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START]; + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END]; + } + + log_debug2("g %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode, + lkb->lkb_status, rsb->res_name); + + if (lkb->lkb_grmode != lkb->lkb_rqmode) { + lkb->lkb_grmode = lkb->lkb_rqmode; + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + } + lkb->lkb_rqmode = DLM_LOCK_IV; + lkb->lkb_highbast = 0; + lkb->lkb_retstatus = 0; + queue_ast(lkb, AST_COMP, 0); + + /* + * A remote conversion request has been granted, either immediately + * upon being requested or after waiting a bit. In the former case, + * reply_and_grant() is called. In the later case send_remote is 1 and + * remote_grant() is called. + * + * The "send_remote" flag is set only for locks which are granted "out + * of band" - ie by another lock being converted or unlocked. + * + * The second case occurs when this lkb is granted right away as part + * of processing the initial request. In that case, we send a single + * message in reply_and_grant which combines the request reply with the + * grant message. 
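+ *
+ * (In code terms, as below: when send_remote is set remote_grant() is
+ * used; otherwise a still-attached lkb_request means reply_and_grant()
+ * folds the grant into the pending request reply.)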
+ */ + + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) { + if (send_remote) + remote_grant(lkb); + else if (lkb->lkb_request) + reply_and_grant(lkb); + } + +} + +static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *gr; + + list_for_each_entry(gr, head, lkb_statequeue) { + if (gr->lkb_bastaddr && + gr->lkb_highbast < lkb->lkb_rqmode && + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) { + queue_ast(gr, AST_BAST, lkb->lkb_rqmode); + gr->lkb_highbast = lkb->lkb_rqmode; + } + } +} + +/* + * Notify granted locks if they are blocking a newly forced-to-wait lock. + */ + +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb) +{ + send_bast_queue(&rsb->res_grantqueue, lkb); + /* check if the following improves performance */ + /* send_bast_queue(&rsb->res_convertqueue, lkb); */ +} + +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb) +{ + send_bast_queue(&rsb->res_grantqueue, lkb); + send_bast_queue(&rsb->res_convertqueue, lkb); +} + +/* + * Called when a lock has been dequeued. Look for any locks to grant that are + * waiting for conversion or waiting to be granted. + * The rsb res_lock must be held in write when this function is called. + */ + +int grant_pending_locks(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *s; + int8_t high = DLM_LOCK_IV; + + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { + if (can_be_granted(r, lkb, FALSE)) + grant_lock(lkb, 1); + else + high = MAX(lkb->lkb_rqmode, high); + } + + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { + if (lkb->lkb_lockqueue_state) + continue; + + if (can_be_granted(r, lkb, FALSE)) + grant_lock(lkb, 1); + else + high = MAX(lkb->lkb_rqmode, high); + } + + /* + * If there are locks left on the wait/convert queue then send blocking + * ASTs to granted locks that are blocking + * + * FIXME: This might generate some spurious blocking ASTs for range + * locks. + */ + + if (high > DLM_LOCK_IV) { + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, + lkb_statequeue) { + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) && + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) { + queue_ast(lkb, AST_BAST, high); + lkb->lkb_highbast = high; + } + } + } + + return 0; +} + +/* + * Called to cancel a locking operation that failed due to some internal + * reason. + * + * Waiting locks will be removed, converting locks will be reverted to their + * granted status, unlocks will be left where they are. + * + * A completion AST will be delivered to the caller. + */ + +int cancel_lockop(struct dlm_lkb *lkb, int status) +{ + int state = lkb->lkb_lockqueue_state; + uint16_t astflags = AST_COMP; + + lkb->lkb_lockqueue_state = 0; + + switch (state) { + case GDLM_LQSTATE_WAIT_RSB: + astflags |= AST_DEL; + break; + + case GDLM_LQSTATE_WAIT_CONDGRANT: + res_lkb_dequeue(lkb); + astflags |= AST_DEL; + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED); + + /* Remove from deadlock detection */ + if (lkb->lkb_duetime) { + remove_from_deadlockqueue(lkb); + } + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + /* We can leave this. I think.... */ + break; + } + + lkb->lkb_retstatus = status; + queue_ast(lkb, astflags, 0); + + return 0; +} + +/* + * Check for conversion deadlock. 
If a deadlock was found + * return lkb to kill, else return NULL + */ + +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + struct list_head *entry; + + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,); + + /* Work our way up to the head of the queue looking for locks that + * conflict with us */ + + down_read(&rsb->res_lock); + + entry = lkb->lkb_statequeue.prev; + while (entry != &rsb->res_convertqueue) { + struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue); + + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) { + up_read(&rsb->res_lock); + return lkb; + } + entry = entry->prev; + } + up_read(&rsb->res_lock); + + return 0; +} + +/* + * Conversion operation was cancelled by us (not the user). + * ret contains the return code to pass onto the user + */ + +void cancel_conversion(struct dlm_lkb *lkb, int ret) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + + /* Stick it back on the granted queue */ + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_rqmode = lkb->lkb_grmode; + + remove_from_deadlockqueue(lkb); + + lkb->lkb_retstatus = ret; + queue_ast(lkb, AST_COMP, 0); + wake_astd(); +} + +/* + * As new master of the rsb for this lkb, we need to handle these requests + * removed from the lockqueue and originating from local processes: + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT, + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT. + */ + +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state) +{ + struct dlm_rsb *rsb; + + switch (state) { + case GDLM_LQSTATE_WAIT_RSB: + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb, + lkb->lkb_lockqueue_flags, + lkb->lkb_resource->res_name, + lkb->lkb_resource->res_length); + break; + + case GDLM_LQSTATE_WAIT_CONDGRANT: + res_lkb_dequeue(lkb); + dlm_lock_stage3(lkb); + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + rsb = find_rsb_to_unlock(ls, lkb); + dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags); + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + dlm_convert_stage2(lkb, TRUE); + break; + + default: + DLM_ASSERT(0,); + } +} + +static void dump_queue(struct list_head *head, char *qname) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + printk("%s %08x gr %d rq %d flg %x sts %u node %u remid %x " + "lq %d,%x\n", + qname, + lkb->lkb_id, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_lockqueue_state, + lkb->lkb_lockqueue_flags); + } +} + +static void dump_rsb(struct dlm_rsb *rsb) +{ + printk("name \"%s\" flags %lx nodeid %d ref %u\n", + rsb->res_name, rsb->res_flags, rsb->res_nodeid, + atomic_read(&rsb->res_ref)); + + if (!list_empty(&rsb->res_grantqueue)) + dump_queue(&rsb->res_grantqueue, "G"); + + if (!list_empty(&rsb->res_convertqueue)) + dump_queue(&rsb->res_convertqueue, "C"); + + if (!list_empty(&rsb->res_waitqueue)) + dump_queue(&rsb->res_waitqueue, "W"); +} + +void dlm_locks_dump(void) +{ + struct dlm_ls *ls; + struct dlm_rsb *rsb; + struct list_head *head; + int i; + + lowcomms_stop_accept(); + + list_for_each_entry(ls, &lslist, ls_list) { + down_write(&ls->ls_in_recovery); + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + head = &ls->ls_rsbtbl[i].list; + list_for_each_entry(rsb, head, res_hashchain) + dump_rsb(rsb); + } + } +} + diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730 +++ 
linux-patched/cluster/dlm/locking.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKING_DOT_H__ +#define __LOCKING_DOT_H__ + +int dlm_modes_compat(int mode1, int mode2); +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state); +void dlm_lock_stage3(struct dlm_lkb *lkb); +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast); +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags); +int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags); +struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen); +int free_rsb_if_unused(struct dlm_rsb *rsb); +struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace, + struct dlm_request *freq); +int cancel_lockop(struct dlm_lkb *lkb, int status); +int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags); +int grant_pending_locks(struct dlm_rsb *rsb); +void cancel_conversion(struct dlm_lkb *lkb, int ret); +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb); + +#endif /* __LOCKING_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockqueue.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1159 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lockqueue.c + * + * This controls the lock queue, which is where locks + * come when they need to wait for a remote operation + * to complete. + * + * This could also be thought of as the "high-level" comms + * layer. 
+ * + */ + +#include "dlm_internal.h" +#include "lockqueue.h" +#include "dir.h" +#include "locking.h" +#include "lkb.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "reccomms.h" +#include "nodes.h" +#include "lockspace.h" +#include "ast.h" +#include "memory.h" +#include "rsb.h" +#include "queries.h" +#include "util.h" + +static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply); +static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req); + +/* + * format of an entry on the request queue + */ +struct rq_entry { + struct list_head rqe_list; + uint32_t rqe_nodeid; + char rqe_request[1]; +}; + +/* + * Add a new request (if appropriate) to the request queue and send the remote + * request out. - runs in the context of the locking caller + * + * Recovery of a remote_stage request if the remote end fails while the lkb + * is still on the lockqueue: + * + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in + * lockqueue_lkb_mark() at the start of recovery. + * + * o Some lkb's will be rebuilt on new master rsb's during recovery. + * (depends on the type of request, see below). + * + * o At the end of recovery, resend_cluster_requests() looks at these + * LQRESEND lkb's and either: + * + * i) resends the request to the new master for the rsb where the + * request is processed as usual. The lkb remains on the lockqueue until + * the new master replies and we run process_lockqueue_reply(). + * + * ii) if we've become the rsb master, remove the lkb from the lockqueue + * and processes the request locally via process_remastered_lkb(). + * + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue + * and the request should be resent if dest node is failed. + * + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag + * makes send_lkb_queue() skip it). Resend this request to the new master. + * + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue). + * Resend this request to the new master. + * + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue. + * It will be rebuilt on the new master rsb's granted queue. Resend this + * request to the new master. + */ + +int remote_stage(struct dlm_lkb *lkb, int state) +{ + int error; + + lkb->lkb_lockqueue_state = state; + add_to_lockqueue(lkb); + + error = send_cluster_request(lkb, state); + if (error < 0) { + log_error(lkb->lkb_resource->res_ls, "remote_stage error %d %x", + error, lkb->lkb_id); + /* Leave on lockqueue, it will be resent to correct node during + * recovery. */ + } + return 0; +} + +/* + * Requests received while the lockspace is in recovery get added to the + * request queue and processed when recovery is complete. 
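add_to_requestqueue() below keeps a private copy of each deferred message by allocating the rq_entry and the message storage in one block (the old one-element-array idiom, rqe_request[1]). A small userspace sketch of the same pattern, written here with a C99 flexible array member and invented names:

        /* Userspace illustration only: a queue entry that carries its message
         * payload in storage allocated at the tail of the entry itself. */
        #include <stdlib.h>
        #include <string.h>

        struct saved_request {
                struct saved_request *next;
                unsigned int nodeid;
                size_t len;
                unsigned char data[];           /* C99 flexible array member */
        };

        static struct saved_request *save_request(unsigned int nodeid,
                                                  const void *msg, size_t len)
        {
                struct saved_request *e = malloc(sizeof(*e) + len);
                if (!e)
                        return NULL;
                e->next = NULL;
                e->nodeid = nodeid;
                e->len = len;
                memcpy(e->data, msg, len);      /* private copy of the message */
                return e;
        }

        int main(void)
        {
                const char msg[] = "lock request payload";
                struct saved_request *e = save_request(3, msg, sizeof(msg));
                free(e);
                return 0;
        }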
+ */ + +void add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +{ + struct rq_entry *entry; + int length = hd->rh_length; + + if (test_bit(LSFL_REQUEST_WARN, &ls->ls_flags)) + log_error(ls, "request during recovery from %u", nodeid); + + if (in_nodes_gone(ls, nodeid)) + return; + + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); + if (!entry) { + // TODO something better + printk("dlm: add_to_requestqueue: out of memory\n"); + return; + } + + log_debug(ls, "add_to_requestq cmd %d fr %d", hd->rh_cmd, nodeid); + entry->rqe_nodeid = nodeid; + memcpy(entry->rqe_request, hd, length); + + down(&ls->ls_requestqueue_lock); + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue); + up(&ls->ls_requestqueue_lock); +} + +int process_requestqueue(struct dlm_ls *ls) +{ + int error = 0, count = 0; + struct rq_entry *entry; + struct dlm_header *hd; + + log_all(ls, "process held requests"); + + down(&ls->ls_requestqueue_lock); + + for (;;) { + if (list_empty(&ls->ls_requestqueue)) { + up(&ls->ls_requestqueue_lock); + error = 0; + break; + } + + entry = list_entry(ls->ls_requestqueue.next, struct rq_entry, + rqe_list); + up(&ls->ls_requestqueue_lock); + hd = (struct dlm_header *) entry->rqe_request; + + log_debug(ls, "process_requestq cmd %d fr %u", hd->rh_cmd, + entry->rqe_nodeid); + + error = process_cluster_request(entry->rqe_nodeid, hd, TRUE); + if (error == -EINTR) { + /* entry is left on requestqueue */ + log_debug(ls, "process_requestqueue abort eintr"); + break; + } + + down(&ls->ls_requestqueue_lock); + list_del(&entry->rqe_list); + kfree(entry); + count++; + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + log_debug(ls, "process_requestqueue abort ls_run"); + up(&ls->ls_requestqueue_lock); + error = -EINTR; + break; + } + } + + log_all(ls, "processed %d requests", count); + return error; +} + +void wait_requestqueue(struct dlm_ls *ls) +{ + for (;;) { + down(&ls->ls_requestqueue_lock); + if (list_empty(&ls->ls_requestqueue)) + break; + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) + break; + up(&ls->ls_requestqueue_lock); + schedule(); + } + up(&ls->ls_requestqueue_lock); +} + +/* + * Resdir requests (lookup or remove) and replies from before recovery are + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now + * gone are also invalid. + */ + +void purge_requestqueue(struct dlm_ls *ls) +{ + int count = 0; + struct rq_entry *entry, *safe; + struct dlm_header *hd; + struct dlm_lkb *lkb; + + log_all(ls, "purge requests"); + + down(&ls->ls_requestqueue_lock); + + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) { + hd = (struct dlm_header *) entry->rqe_request; + + if (hd->rh_cmd == GDLM_REMCMD_REM_RESDATA || + hd->rh_cmd == GDLM_REMCMD_LOOKUP || + in_nodes_gone(ls, entry->rqe_nodeid)) { + + list_del(&entry->rqe_list); + kfree(entry); + count++; + + } else if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY) { + + /* + * Replies to resdir lookups are invalid and must be + * purged. The lookup requests are marked in + * lockqueue_lkb_mark and will be resent in + * resend_cluster_requests. The only way to check if + * this is a lookup reply is to look at the + * lockqueue_state of the lkb. + */ + + lkb = find_lock_by_id(ls, hd->rh_lkid); + DLM_ASSERT(lkb,); + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) { + list_del(&entry->rqe_list); + kfree(entry); + count++; + } + } + } + up(&ls->ls_requestqueue_lock); + + log_all(ls, "purged %d requests", count); +} + +/* + * Check if there's a reply for the given lkid in the requestqueue. 
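Note that the drain loop in process_requestqueue() above releases ls_requestqueue_lock around each call to process_cluster_request() and only unlinks the entry afterwards, so the handler never runs under the queue lock. A rough single-threaded userspace sketch of that peek/process/unlink shape, with a pthread mutex standing in for the semaphore and all names invented:

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct entry { struct entry *next; int value; };

        static struct entry *queue_head;
        static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

        static void handle(struct entry *e)     /* stand-in for the real handler */
        {
                printf("processing %d\n", e->value);
        }

        /* Drain the queue, dropping the lock around the (possibly slow) handler;
         * the entry is unlinked only after it has been processed. */
        static void drain(void)
        {
                for (;;) {
                        pthread_mutex_lock(&queue_lock);
                        struct entry *e = queue_head;
                        if (!e) {
                                pthread_mutex_unlock(&queue_lock);
                                break;
                        }
                        pthread_mutex_unlock(&queue_lock);

                        handle(e);              /* queue lock not held here */

                        pthread_mutex_lock(&queue_lock);
                        queue_head = e->next;   /* single consumer in this sketch */
                        pthread_mutex_unlock(&queue_lock);
                        free(e);
                }
        }

        int main(void)
        {
                for (int i = 3; i > 0; i--) {
                        struct entry *e = malloc(sizeof(*e));
                        if (!e)
                                return 1;
                        e->value = i;
                        e->next = queue_head;
                        queue_head = e;
                }
                drain();
                return 0;
        }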
+ */ + +int reply_in_requestqueue(struct dlm_ls *ls, int lkid) +{ + int rv = FALSE; + struct rq_entry *entry; + struct dlm_header *hd; + + down(&ls->ls_requestqueue_lock); + + list_for_each_entry(entry, &ls->ls_requestqueue, rqe_list) { + hd = (struct dlm_header *) entry->rqe_request; + if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY && hd->rh_lkid == lkid){ + log_debug(ls, "reply_in_requestq cmd %d fr %d id %x", + hd->rh_cmd, entry->rqe_nodeid, lkid); + rv = TRUE; + break; + } + } + up(&ls->ls_requestqueue_lock); + + return rv; +} + +void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src) +{ + if (!*lvbptr) + *lvbptr = allocate_lvb(ls); + if (*lvbptr) + memcpy(*lvbptr, src, DLM_LVB_LEN); +} + +/* + * Process a lockqueue LKB after it has had it's remote processing complete and + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread + * on the machine that requested the lock. + */ + +static void process_lockqueue_reply(struct dlm_lkb *lkb, + struct dlm_reply *reply, + uint32_t nodeid) +{ + struct dlm_rsb *rsb = lkb->lkb_resource; + struct dlm_ls *ls = rsb->res_ls; + int oldstate, state = lkb->lkb_lockqueue_state; + + if (state) + remove_from_lockqueue(lkb); + + switch (state) { + case GDLM_LQSTATE_WAIT_RSB: + + if (reply->rl_status) { + DLM_ASSERT(reply->rl_status == -EEXIST,); + if (rsb->res_nodeid == -1) { + msleep(500); + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); + break; + } + } else { + if (reply->rl_nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = 0; + } else { + clear_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = reply->rl_nodeid; + } + } + + log_debug(ls, "(%d) lu rep %x fr %u %u", lkb->lkb_ownpid, + lkb->lkb_id, nodeid, + rsb->res_nodeid); + + lkb->lkb_nodeid = rsb->res_nodeid; + dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags); + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + case GDLM_LQSTATE_WAIT_CONDGRANT: + + /* + * the destination wasn't the master + * this implies the request was a CONDGRANT + */ + + if (reply->rl_status == -EINVAL) { + int master_nodeid; + + DLM_ASSERT(state == GDLM_LQSTATE_WAIT_CONDGRANT, ); + + log_debug(ls, "(%d) req reply einval %x fr %d r %d %s", + lkb->lkb_ownpid, lkb->lkb_id, nodeid, + rsb->res_nodeid, rsb->res_name); + + lkb_dequeue(lkb); + + if (rsb->res_nodeid == lkb->lkb_nodeid || rsb->res_nodeid == -1){ + /* + * We need to re-lookup the master and resend our + * request to it. + */ + + lkb->lkb_nodeid = -1; + rsb->res_nodeid = -1; + + if (get_directory_nodeid(rsb) != our_nodeid()) + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB); + else { + int error = dlm_dir_lookup(ls, our_nodeid(), + rsb->res_name, + rsb->res_length, + &master_nodeid); + if (error == -EEXIST) { + /* don't expect this will happen */ + log_all(ls, "EEXIST %x", lkb->lkb_id); + print_lkb(lkb); + print_rsb(rsb); + } + + if (master_nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + master_nodeid = 0; + } else + clear_bit(RESFL_MASTER,&rsb->res_flags); + + rsb->res_nodeid = master_nodeid; + lkb->lkb_nodeid = master_nodeid; + + dlm_lock_stage2(ls, lkb, rsb, + lkb->lkb_lockqueue_flags); + } + } else { + /* + * Another request on this rsb has since found + * the master, we'll use that one although it too + * may be invalid requiring us to retry again. 
+ */ + + lkb->lkb_nodeid = rsb->res_nodeid; + dlm_lock_stage2(ls, lkb, rsb, + lkb->lkb_lockqueue_flags); + } + + break; + } + + + /* + * After a remote lock/conversion/grant request we put the lock + * on the right queue and send an AST if appropriate. Any lock + * shuffling (eg newly granted locks because this one was + * converted downwards) will be dealt with in seperate messages + * (which may be in the same network message) + */ + + if (!lkb->lkb_remid) + lkb->lkb_remid = reply->rl_lkid; + + /* + * The remote request failed (we assume because of NOQUEUE). + * If this is a new request (non-conv) the lkb was created just + * for it so the lkb should be freed. If this was a + * conversion, the lkb already existed so we should put it back + * on the grant queue. + */ + + if (reply->rl_status != 0) { + DLM_ASSERT(reply->rl_status == -EAGAIN,); + + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) { + res_lkb_dequeue(lkb); + lkb->lkb_retstatus = reply->rl_status; + queue_ast(lkb, AST_COMP | AST_DEL, 0); + } else { + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_retstatus = reply->rl_status; + queue_ast(lkb, AST_COMP, 0); + } + break; + } + + /* + * The remote request was successful in granting the request or + * queuing it to be granted later. Add the lkb to the + * appropriate rsb queue. + */ + + switch (reply->rl_lockstate) { + case GDLM_LKSTS_GRANTED: + + /* Compact version of grant_lock(). */ + + down_write(&rsb->res_lock); + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + memcpy(lkb->lkb_lvbptr, reply->rl_lvb, + DLM_LVB_LEN); + + lkb->lkb_grmode = lkb->lkb_rqmode; + lkb->lkb_rqmode = DLM_LOCK_IV; + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + + if (lkb->lkb_range) { + lkb->lkb_range[GR_RANGE_START] = + lkb->lkb_range[RQ_RANGE_START]; + lkb->lkb_range[GR_RANGE_END] = + lkb->lkb_range[RQ_RANGE_END]; + } + up_write(&rsb->res_lock); + + lkb->lkb_retstatus = 0; + queue_ast(lkb, AST_COMP, 0); + break; + + case GDLM_LKSTS_WAITING: + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING); + else + log_error(ls, "wait reply for granted %x %u", + lkb->lkb_id, lkb->lkb_nodeid); + break; + + case GDLM_LKSTS_CONVERT: + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT); + else + log_error(ls, "convert reply for granted %x %u", + lkb->lkb_id, lkb->lkb_nodeid); + break; + + default: + log_error(ls, "process_lockqueue_reply state %d", + reply->rl_lockstate); + } + + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + + /* + * Unlocks should never fail. Update local lock info. This + * always sends completion AST with status in lksb + */ + + DLM_ASSERT(reply->rl_status == 0,); + oldstate = res_lkb_dequeue(lkb); + + /* Differentiate between unlocks and conversion cancellations */ + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL) { + if (oldstate == GDLM_LKSTS_CONVERT) { + res_lkb_enqueue(lkb->lkb_resource, lkb, + GDLM_LKSTS_GRANTED); + lkb->lkb_retstatus = -DLM_ECANCEL; + queue_ast(lkb, AST_COMP, 0); + } else + log_error(ls, "cancel state %d", oldstate); + } else { + DLM_ASSERT(oldstate == GDLM_LKSTS_GRANTED, + print_lkb(lkb);); + + lkb->lkb_retstatus = -DLM_EUNLOCK; + queue_ast(lkb, AST_COMP | AST_DEL, 0); + } + break; + + default: + log_error(ls, "process_lockqueue_reply id %x state %d", + lkb->lkb_id, state); + } +} + +/* + * Tell a remote node to grant a lock. This happens when we are the master + * copy for a lock that is actually held on a remote node. The remote end is + * also responsible for sending the completion AST. 
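remote_grant() below, like the other senders in this file, builds its message directly in space handed out by lowcomms_get_buffer() and then passes the filled header to midcomms_send_buffer(). A loose userspace sketch of that reserve-then-flush idea against a flat buffer; the buffer size, struct fields and function names here are made up and are not the real lowcomms API:

        #include <stdio.h>
        #include <string.h>

        #define OUTBUF_SIZE 4096

        static unsigned char outbuf[OUTBUF_SIZE];
        static size_t outbuf_used;

        /* Reserve len bytes in the outgoing buffer; the caller fills them in place. */
        static void *reserve(size_t len)
        {
                if (outbuf_used + len > OUTBUF_SIZE)
                        return NULL;            /* real code would flush or block */
                void *p = outbuf + outbuf_used;
                outbuf_used += len;
                return p;
        }

        /* "Flush" here just means the reserved bytes are handed to the transport. */
        static void flush(void)
        {
                printf("flushing %zu bytes\n", outbuf_used);
                outbuf_used = 0;
        }

        struct msg_header { unsigned char cmd; unsigned int lkid; };  /* invented */

        int main(void)
        {
                struct msg_header *h = reserve(sizeof(*h));
                if (!h)
                        return 1;
                h->cmd = 1;                     /* message built in place */
                h->lkid = 0x42;
                flush();
                return 0;
        }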
+ */ + +void remote_grant(struct dlm_lkb *lkb) +{ + struct writequeue_entry *e; + struct dlm_request *req; + + // TODO Error handling + e = lowcomms_get_buffer(lkb->lkb_nodeid, + sizeof(struct dlm_request), + lkb->lkb_resource->res_ls->ls_allocation, + (char **) &req); + if (!e) + return; + + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT; + req->rr_header.rh_length = sizeof(struct dlm_request); + req->rr_header.rh_flags = 0; + req->rr_header.rh_lkid = lkb->lkb_id; + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id; + req->rr_remlkid = lkb->lkb_remid; + req->rr_flags = 0; + + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) { + /* This is a confusing non-standard use of rr_flags which is + * usually used to pass lockqueue_flags. */ + req->rr_flags |= GDLM_LKFLG_DEMOTED; + } + + add_request_lvb(lkb, req); + midcomms_send_buffer(&req->rr_header, e); +} + +void reply_and_grant(struct dlm_lkb *lkb) +{ + struct dlm_request *req = lkb->lkb_request; + struct dlm_reply *reply; + struct writequeue_entry *e; + + // TODO Error handling + e = lowcomms_get_buffer(lkb->lkb_nodeid, + sizeof(struct dlm_reply), + lkb->lkb_resource->res_ls->ls_allocation, + (char **) &reply); + if (!e) + return; + + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY; + reply->rl_header.rh_flags = 0; + reply->rl_header.rh_length = sizeof(struct dlm_reply); + reply->rl_header.rh_lkid = req->rr_header.rh_lkid; + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace; + + reply->rl_status = lkb->lkb_retstatus; + reply->rl_lockstate = lkb->lkb_status; + reply->rl_lkid = lkb->lkb_id; + + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),); + + lkb->lkb_request = NULL; + + add_reply_lvb(lkb, reply); + midcomms_send_buffer(&reply->rl_header, e); +} + +/* + * Request removal of a dead entry in the resource directory + */ + +void remote_remove_direntry(struct dlm_ls *ls, int nodeid, char *name, + int namelen) +{ + struct writequeue_entry *e; + struct dlm_request *req; + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + struct dlm_rcom *rc = allocate_rcom_buffer(ls); + + memcpy(rc->rc_buf, name, namelen); + rc->rc_datalen = namelen; + + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0); + + free_rcom_buffer(rc); + return; + } + // TODO Error handling + e = lowcomms_get_buffer(nodeid, + sizeof(struct dlm_request) + namelen - 1, + ls->ls_allocation, (char **) &req); + if (!e) + return; + + memset(req, 0, sizeof(struct dlm_request) + namelen - 1); + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA; + req->rr_header.rh_length = + sizeof(struct dlm_request) + namelen - 1; + req->rr_header.rh_flags = 0; + req->rr_header.rh_lkid = 0; + req->rr_header.rh_lockspace = ls->ls_global_id; + req->rr_remlkid = 0; + memcpy(req->rr_name, name, namelen); + + midcomms_send_buffer(&req->rr_header, e); +} + +/* + * Send remote cluster request to directory or master node before the request + * is put on the lock queue. Runs in the context of the locking caller. 
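send_cluster_request() below routes WAIT_RSB lookups to the resource directory node (get_directory_nodeid()) and everything else to the known master. The directory node is a function of the resource name; as a toy illustration only (the hash and the membership array below are invented, not the scheme this patch uses), such a mapping can look like:

        #include <stdio.h>
        #include <stdint.h>
        #include <string.h>

        /* Toy name hash (djb2); the real DLM uses its own hash, not shown here. */
        static uint32_t name_hash(const char *name, size_t len)
        {
                uint32_t h = 5381;
                for (size_t i = 0; i < len; i++)
                        h = h * 33 + (unsigned char)name[i];
                return h;
        }

        /* Map a resource name onto one of the current member nodes. */
        static uint32_t directory_nodeid(const char *name, size_t len,
                                         const uint32_t *nodes, int num_nodes)
        {
                return nodes[name_hash(name, len) % num_nodes];
        }

        int main(void)
        {
                uint32_t nodes[] = { 1, 2, 5 };         /* current membership */
                const char *res = "my-resource";
                printf("directory node for \"%s\": %u\n", res,
                       (unsigned)directory_nodeid(res, strlen(res), nodes, 3));
                return 0;
        }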
+ */ + +int send_cluster_request(struct dlm_lkb *lkb, int state) +{ + uint32_t target_nodeid; + struct dlm_rsb *rsb = lkb->lkb_resource; + struct dlm_ls *ls = rsb->res_ls; + struct dlm_request *req; + struct writequeue_entry *e; + + if (state == GDLM_LQSTATE_WAIT_RSB) + target_nodeid = get_directory_nodeid(rsb); + else + target_nodeid = lkb->lkb_nodeid; + + /* during recovery it's valid for target_nodeid to equal our own; + resend_cluster_requests does this to get requests back on track */ + + DLM_ASSERT(target_nodeid && target_nodeid != -1, + print_lkb(lkb); + print_rsb(rsb); + printk("target_nodeid %u\n", target_nodeid);); + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + /* this may happen when called by resend_cluster_request */ + log_error(ls, "send_cluster_request to %u state %d recovery", + target_nodeid, state); + } + + e = lowcomms_get_buffer(target_nodeid, + sizeof(struct dlm_request) + + rsb->res_length - 1, ls->ls_allocation, + (char **) &req); + if (!e) + return -ENOBUFS; + memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1); + + /* Common stuff, some are just defaults */ + + if (lkb->lkb_bastaddr) + req->rr_asts = AST_BAST; + if (lkb->lkb_astaddr) + req->rr_asts |= AST_COMP; + if (lkb->lkb_parent) + req->rr_remparid = lkb->lkb_parent->lkb_remid; + + req->rr_flags = lkb->lkb_lockqueue_flags; + req->rr_rqmode = lkb->lkb_rqmode; + req->rr_remlkid = lkb->lkb_remid; + req->rr_pid = lkb->lkb_ownpid; + req->rr_header.rh_length = + sizeof(struct dlm_request) + rsb->res_length - 1; + req->rr_header.rh_flags = 0; + req->rr_header.rh_lkid = lkb->lkb_id; + req->rr_header.rh_lockspace = ls->ls_global_id; + + switch (state) { + + case GDLM_LQSTATE_WAIT_RSB: + + DLM_ASSERT(!lkb->lkb_parent, + print_lkb(lkb); + print_rsb(rsb);); + + log_debug(ls, "(%d) send lu %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP; + memcpy(req->rr_name, rsb->res_name, rsb->res_length); + break; + + case GDLM_LQSTATE_WAIT_CONVERT: + + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb);); + + log_debug(ls, "(%d) send cv %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST; + if (lkb->lkb_range) { + req->rr_flags |= GDLM_LKFLG_RANGE; + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START]; + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END]; + } + break; + + case GDLM_LQSTATE_WAIT_CONDGRANT: + + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb);); + + log_debug(ls, "(%d) send rq %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST; + memcpy(req->rr_name, rsb->res_name, rsb->res_length); + if (lkb->lkb_range) { + req->rr_flags |= GDLM_LKFLG_RANGE; + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START]; + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END]; + } + break; + + case GDLM_LQSTATE_WAIT_UNLOCK: + + log_debug(ls, "(%d) send un %x to %u", + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid); + + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST; + break; + + default: + DLM_ASSERT(0, printk("Unknown cluster request\n");); + } + + add_request_lvb(lkb, req); + midcomms_send_buffer(&req->rr_header, e); + + return 0; +} + +/* + * We got a request from another cluster node, process it and return an info + * structure with the lock state/LVB etc as required. Executes in the DLM's + * recvd thread. 
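process_cluster_request() below reads the common header, resolves the lockspace from rh_lockspace and then switches on rh_cmd. A stripped-down userspace sketch of that decode-and-dispatch shape; the field names, command values and handlers are placeholders rather than the real wire format:

        #include <stdio.h>
        #include <stdint.h>

        /* Placeholder header; the real struct dlm_header has more fields. */
        struct hdr {
                uint8_t  cmd;
                uint16_t length;
                uint32_t lkid;
                uint32_t lockspace;
        };

        enum { CMD_LOOKUP = 1, CMD_LOCKREQUEST, CMD_UNLOCKREQUEST, CMD_LOCKREPLY };

        static int dispatch(int nodeid, const struct hdr *h)
        {
                switch (h->cmd) {
                case CMD_LOOKUP:
                        printf("lookup from node %d\n", nodeid);
                        break;
                case CMD_LOCKREQUEST:
                        printf("lock request %x from node %d\n", (unsigned)h->lkid, nodeid);
                        break;
                case CMD_UNLOCKREQUEST:
                        printf("unlock request %x from node %d\n", (unsigned)h->lkid, nodeid);
                        break;
                case CMD_LOCKREPLY:
                        printf("reply for our lkid %x\n", (unsigned)h->lkid);
                        break;
                default:
                        fprintf(stderr, "unknown command %d\n", h->cmd);
                        return -1;
                }
                return 0;
        }

        int main(void)
        {
                struct hdr h = { CMD_LOCKREQUEST, sizeof(h), 0x101, 7 };
                return dispatch(2, &h);
        }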
+ */ + +int process_cluster_request(int nodeid, struct dlm_header *req, int recovery) +{ + struct dlm_ls *lspace; + struct dlm_lkb *lkb = NULL; + struct dlm_rsb *rsb; + int send_reply = 0, status = 0, namelen; + struct dlm_request *freq = (struct dlm_request *) req; + struct dlm_reply *rp = (struct dlm_reply *) req; + struct dlm_reply reply; + + lspace = find_lockspace_by_global_id(req->rh_lockspace); + + if (!lspace) { + log_print("process_cluster_request invalid lockspace %x " + "from %d req %u", req->rh_lockspace, nodeid, + req->rh_cmd); + return -EINVAL; + } + + /* wait for recoverd to drain requestqueue */ + if (!recovery) + wait_requestqueue(lspace); + + /* + * If we're in recovery then queue the request for later. Otherwise, + * we still need to get the "in_recovery" lock to make sure the + * recovery itself doesn't start until we are done. + */ + retry: + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) { + if (!recovery) + add_to_requestqueue(lspace, nodeid, req); + status = -EINTR; + goto out; + } + if (!down_read_trylock(&lspace->ls_in_recovery)) { + schedule(); + goto retry; + } + + + /* + * Process the request. + */ + + switch (req->rh_cmd) { + + case GDLM_REMCMD_LOOKUP: + { + uint32_t dir_nodeid, r_nodeid; + int status; + + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; + + dir_nodeid = name_to_directory_nodeid(lspace, + freq->rr_name, + namelen); + if (dir_nodeid != our_nodeid()) + log_debug(lspace, "ignoring directory lookup"); + + status = dlm_dir_lookup(lspace, nodeid, freq->rr_name, + namelen, &r_nodeid); + reply.rl_status = status; + reply.rl_lockstate = 0; + reply.rl_nodeid = r_nodeid; + } + send_reply = 1; + break; + + case GDLM_REMCMD_REM_RESDATA: + + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1; + dlm_dir_remove(lspace, nodeid, freq->rr_name, namelen); + break; + + case GDLM_REMCMD_LOCKREQUEST: + + lkb = remote_stage2(nodeid, lspace, freq); + if (lkb) { + lkb->lkb_request = freq; + lkb->lkb_ownpid = freq->rr_pid; + if (lkb->lkb_retstatus != -EINVAL) + dlm_lock_stage3(lkb); + + /* + * If the request was granted in lock_stage3, then a + * reply message was already sent in combination with + * the grant message and lkb_request is NULL. 
+ */ + + if (lkb->lkb_request) { + lkb->lkb_request = NULL; + send_reply = 1; + reply.rl_status = lkb->lkb_retstatus; + reply.rl_lockstate = lkb->lkb_status; + reply.rl_lkid = lkb->lkb_id; + + /* + * If the request could not be granted and the + * user won't wait, then free up the LKB + */ + + if (lkb->lkb_retstatus == -EAGAIN) { + rsb = lkb->lkb_resource; + release_lkb(lspace, lkb); + release_rsb(rsb); + lkb = NULL; + } + else if (lkb->lkb_retstatus == -EINVAL) { + release_lkb(lspace, lkb); + lkb = NULL; + } + } + } else { + reply.rl_status = -ENOMEM; + send_reply = 1; + } + break; + + case GDLM_REMCMD_CONVREQUEST: + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + rsb = lkb->lkb_resource; + + DLM_ASSERT(rsb, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(!rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + /* Update orphan lock status */ + if (freq->rr_flags & DLM_LKF_ORPHAN) { + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN; + } + + lkb->lkb_rqmode = freq->rr_rqmode; + lkb->lkb_lockqueue_flags = freq->rr_flags; + lkb->lkb_request = freq; + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED; + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK || + freq->rr_flags & DLM_LKF_VALBLK) { + lkb->lkb_flags |= GDLM_LKFLG_VALBLK; + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, + freq->rr_lvb); + } + + if (freq->rr_flags & GDLM_LKFLG_RANGE) { + if (lkb_set_range(lspace, lkb, freq->rr_range_start, + freq->rr_range_end)) { + reply.rl_status = -ENOMEM; + send_reply = 1; + goto out; + } + } + + log_debug(lspace, "(%d) cv %u from %u %x \"%s\"", + lkb->lkb_ownpid, lkb->lkb_rqmode, nodeid, + lkb->lkb_id, rsb->res_name); + + dlm_convert_stage2(lkb, FALSE); + + /* + * If the conv request was granted in stage2, then a reply + * message was already sent in combination with the grant + * message. + */ + + if (lkb->lkb_request) { + lkb->lkb_request = NULL; + send_reply = 1; + reply.rl_status = lkb->lkb_retstatus; + reply.rl_lockstate = lkb->lkb_status; + reply.rl_lkid = lkb->lkb_id; + } + break; + + case GDLM_REMCMD_LOCKREPLY: + + lkb = find_lock_by_id(lspace, req->rh_lkid); + + DLM_ASSERT(lkb, + print_reply(rp); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY), + print_lkb(lkb); + print_reply(rp); + printk("nodeid %u\n", nodeid);); + + process_lockqueue_reply(lkb, rp, nodeid); + break; + + case GDLM_REMCMD_LOCKGRANT: + + /* + * Remote lock has been granted asynchronously. Do a compact + * version of what grant_lock() does. 
+ */ + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + rsb = lkb->lkb_resource; + + DLM_ASSERT(rsb, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(rsb->res_nodeid, + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY), + print_lkb(lkb); + print_rsb(rsb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + if (lkb->lkb_lockqueue_state) { + log_debug(rsb->res_ls, "grant lock on lockqueue %d", + lkb->lkb_lockqueue_state); + + /* Don't grant locks that are waiting for an unlock */ + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) + return 0; + + print_lkb(lkb); + print_request(freq); + remove_from_lockqueue(lkb); + if (!lkb->lkb_remid) + lkb->lkb_remid = req->rh_lkid; + } + + down_write(&rsb->res_lock); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, freq->rr_lvb); + + lkb->lkb_grmode = lkb->lkb_rqmode; + lkb->lkb_rqmode = DLM_LOCK_IV; + + if (lkb->lkb_range) { + lkb->lkb_range[GR_RANGE_START] = + lkb->lkb_range[RQ_RANGE_START]; + lkb->lkb_range[GR_RANGE_END] = + lkb->lkb_range[RQ_RANGE_END]; + } + + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED); + up_write(&rsb->res_lock); + + if (freq->rr_flags & GDLM_LKFLG_DEMOTED) + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED; + + lkb->lkb_retstatus = 0; + queue_ast(lkb, AST_COMP, 0); + break; + + case GDLM_REMCMD_SENDBAST: + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + if (lkb->lkb_status == GDLM_LKSTS_GRANTED) + queue_ast(lkb, AST_BAST, freq->rr_rqmode); + break; + + case GDLM_REMCMD_SENDCAST: + + /* This is only used for some error completion ASTs */ + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + /* Return the lock to granted status */ + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED); + lkb->lkb_retstatus = freq->rr_status; + queue_ast(lkb, AST_COMP, 0); + break; + + case GDLM_REMCMD_UNLOCKREQUEST: + + lkb = find_lock_by_id(lspace, freq->rr_remlkid); + + DLM_ASSERT(lkb, + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + DLM_ASSERT(lkb->lkb_nodeid == nodeid, + print_lkb(lkb); + print_request(freq); + printk("nodeid %u\n", nodeid);); + + rsb = find_rsb_to_unlock(lspace, lkb); + + log_debug(lspace, "(%d) un from %u %x \"%s\"", lkb->lkb_ownpid, + nodeid, lkb->lkb_id, rsb->res_name); + + reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags); + send_reply = 1; + break; + + case GDLM_REMCMD_QUERY: + remote_query(nodeid, lspace, req); + break; + + case GDLM_REMCMD_QUERYREPLY: + remote_query_reply(nodeid, lspace, req); + break; + + default: + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd); + } + + up_read(&lspace->ls_in_recovery); + + out: + if (send_reply) { + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY; + reply.rl_header.rh_flags = 0; + reply.rl_header.rh_length = sizeof(reply); + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid; + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace; + + status = midcomms_send_message(nodeid, &reply.rl_header, + GFP_KERNEL); + } + + wake_astd(); + put_lockspace(lspace); + return status; +} + 
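The add_reply_lvb()/add_request_lvb() helpers that follow piggyback the lock value block on messages whenever GDLM_LKFLG_VALBLK is set; elsewhere in the patch the LVB is copied to the holder when a lock is granted and written back to the resource when a PW or EX lock is released. A simplified userspace sketch of that discipline (the real grant path consults an lvb-operations table keyed by old and new mode; the 32-byte size and the names here are assumptions):

        #include <stdio.h>
        #include <string.h>

        #define LVB_LEN 32      /* assumed fixed lock value block size */

        enum { NL, CR, CW, PR, PW, EX };

        struct resource { char lvb[LVB_LEN]; };
        struct lock     { int grmode; char lvb[LVB_LEN]; };

        /* On grant, the holder receives a copy of the current value block. */
        static void lvb_on_grant(struct resource *r, struct lock *lk)
        {
                memcpy(lk->lvb, r->lvb, LVB_LEN);
        }

        /* On release of a PW/EX lock, the holder's value block is written back. */
        static void lvb_on_release(struct resource *r, const struct lock *lk)
        {
                if (lk->grmode >= PW)
                        memcpy(r->lvb, lk->lvb, LVB_LEN);
        }

        int main(void)
        {
                struct resource r = { .lvb = "initial" };
                struct lock lk = { .grmode = EX };

                lvb_on_grant(&r, &lk);
                snprintf(lk.lvb, LVB_LEN, "updated by EX holder");
                lvb_on_release(&r, &lk);
                printf("resource lvb: %s\n", r.lvb);
                return 0;
        }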
+static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply) +{ + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN); +} + +static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req) +{ + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN); +} diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockqueue.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,29 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKQUEUE_DOT_H__ +#define __LOCKQUEUE_DOT_H__ + +void remote_grant(struct dlm_lkb * lkb); +void reply_and_grant(struct dlm_lkb * lkb); +int remote_stage(struct dlm_lkb * lkb, int state); +int process_cluster_request(int csid, struct dlm_header *req, int recovery); +int send_cluster_request(struct dlm_lkb * lkb, int state); +void purge_requestqueue(struct dlm_ls * ls); +int process_requestqueue(struct dlm_ls * ls); +int reply_in_requestqueue(struct dlm_ls * ls, int lkid); +void remote_remove_direntry(struct dlm_ls * ls, int nodeid, char *name, + int namelen); +void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src); + +#endif /* __LOCKQUEUE_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockspace.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,715 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include + +#include "dlm_internal.h" +#include "recoverd.h" +#include "ast.h" +#include "lkb.h" +#include "nodes.h" +#include "dir.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "lockspace.h" +#include "device.h" + +#define GDST_NONE (0) +#define GDST_RUNNING (1) + +static int dlmstate; +static int dlmcount; +static struct semaphore dlmstate_lock; +struct list_head lslist; +spinlock_t lslist_lock; +struct kcl_service_ops ls_ops; + +static int new_lockspace(char *name, int namelen, void **lockspace, int flags); + + +void dlm_lockspace_init(void) +{ + dlmstate = GDST_NONE; + dlmcount = 0; + init_MUTEX(&dlmstate_lock); + INIT_LIST_HEAD(&lslist); + spin_lock_init(&lslist_lock); +} + +struct dlm_ls *find_lockspace_by_name(char *name, int namelen) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_namelen == namelen && + memcmp(ls->ls_name, name, namelen) == 0) + goto out; + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *find_lockspace_by_global_id(uint32_t id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_global_id == id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *find_lockspace_by_local_id(void *id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_local_id == (uint32_t)(long)id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +/* must be called with lslist_lock held */ +void hold_lockspace(struct dlm_ls *ls) +{ + ls->ls_count++; +} + +void put_lockspace(struct dlm_ls *ls) +{ + spin_lock(&lslist_lock); + ls->ls_count--; + spin_unlock(&lslist_lock); +} + +static void remove_lockspace(struct dlm_ls *ls) +{ + for (;;) { + spin_lock(&lslist_lock); + if (ls->ls_count == 0) { + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + return; + } + spin_unlock(&lslist_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } +} + +/* + * Called from dlm_init. These are the general threads which are not + * lockspace-specific and work for all dlm lockspaces. + */ + +static int threads_start(void) +{ + int error; + + /* Thread which process lock requests for all ls's */ + error = astd_start(); + if (error) { + log_print("cannot start ast thread %d", error); + goto fail; + } + + /* Thread for sending/receiving messages for all ls's */ + error = lowcomms_start(); + if (error) { + log_print("cannot start lowcomms %d", error); + goto astd_fail; + } + + return 0; + + astd_fail: + astd_stop(); + + fail: + return error; +} + +static void threads_stop(void) +{ + lowcomms_stop(); + astd_stop(); +} + +static int init_internal(void) +{ + int error = 0; + + if (dlmstate == GDST_RUNNING) + dlmcount++; + else { + error = threads_start(); + if (error) + goto out; + + dlmstate = GDST_RUNNING; + dlmcount = 1; + } + + out: + return error; +} + +/* + * Called after dlm module is loaded and before any lockspaces are created. + * Starts and initializes global threads and structures. These global entities + * are shared by and independent of all lockspaces. 
+ * + * There should be a dlm-specific user command which a person can run which + * calls this function. If a user hasn't run that command and something + * creates a new lockspace, this is called first. + * + * This also starts the default lockspace. + */ + +int dlm_init(void) +{ + int error; + + down(&dlmstate_lock); + error = init_internal(); + up(&dlmstate_lock); + + return error; +} + +int dlm_release(void) +{ + int error = 0; + + down(&dlmstate_lock); + + if (dlmstate == GDST_NONE) + goto out; + + if (dlmcount) + dlmcount--; + + if (dlmcount) + goto out; + + spin_lock(&lslist_lock); + if (!list_empty(&lslist)) { + spin_unlock(&lslist_lock); + log_print("cannot stop threads, lockspaces still exist"); + goto out; + } + spin_unlock(&lslist_lock); + + threads_stop(); + dlmstate = GDST_NONE; + + out: + up(&dlmstate_lock); + + return error; +} + +struct dlm_ls *allocate_ls(int namelen) +{ + struct dlm_ls *ls; + + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); + if (ls) + memset(ls, 0, sizeof(struct dlm_ls) + namelen); + + return ls; +} + +static int new_lockspace(char *name, int namelen, void **lockspace, int flags) +{ + struct dlm_ls *ls; + int i, size, error = -ENOMEM; + uint32_t local_id = 0; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + if (namelen > MAX_SERVICE_NAME_LEN) + return -EINVAL; + + ls = find_lockspace_by_name(name, namelen); + if (ls) { + *lockspace = (void *)(long) ls->ls_local_id; + return -EEXIST; + } + + /* + * Initialize ls fields + */ + + ls = allocate_ls(namelen); + if (!ls) + goto out; + + memcpy(ls->ls_name, name, namelen); + ls->ls_namelen = namelen; + + ls->ls_allocation = GFP_KERNEL; + ls->ls_count = 0; + ls->ls_flags = 0; + + size = dlm_config.rsbtbl_size; + ls->ls_rsbtbl_size = size; + + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); + rwlock_init(&ls->ls_rsbtbl[i].lock); + } + + size = dlm_config.lkbtbl_size; + ls->ls_lkbtbl_size = size; + + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); + if (!ls->ls_lkbtbl) + goto out_rsbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); + rwlock_init(&ls->ls_lkbtbl[i].lock); + ls->ls_lkbtbl[i].counter = 1; + } + + size = dlm_config.dirtbl_size; + ls->ls_dirtbl_size = size; + + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); + if (!ls->ls_dirtbl) + goto out_lkbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); + rwlock_init(&ls->ls_dirtbl[i].lock); + } + + INIT_LIST_HEAD(&ls->ls_nodes); + INIT_LIST_HEAD(&ls->ls_nodes_gone); + ls->ls_num_nodes = 0; + ls->ls_node_array = NULL; + ls->ls_recoverd_task = NULL; + init_MUTEX(&ls->ls_recoverd_lock); + INIT_LIST_HEAD(&ls->ls_recover); + spin_lock_init(&ls->ls_recover_lock); + INIT_LIST_HEAD(&ls->ls_recover_list); + ls->ls_recover_list_count = 0; + spin_lock_init(&ls->ls_recover_list_lock); + init_waitqueue_head(&ls->ls_wait_general); + INIT_LIST_HEAD(&ls->ls_rootres); + INIT_LIST_HEAD(&ls->ls_requestqueue); + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list); + ls->ls_last_stop = 0; + ls->ls_last_start = 0; + ls->ls_last_finish = 0; + ls->ls_rcom_msgid = 0; + init_MUTEX(&ls->ls_requestqueue_lock); + init_MUTEX(&ls->ls_rcom_lock); + init_rwsem(&ls->ls_unlock_sem); + init_rwsem(&ls->ls_root_lock); + init_rwsem(&ls->ls_in_recovery); + + down_write(&ls->ls_in_recovery); + + if (flags & DLM_LSF_NOTIMERS) + 
set_bit(LSFL_NOTIMERS, &ls->ls_flags); + + + /* + * Connect this lockspace with the cluster manager + */ + + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM, + &ls_ops, TRUE, (void *) ls, &local_id); + if (error) + goto out_recoverd; + + ls->ls_state = LSST_INIT; + ls->ls_local_id = local_id; + + spin_lock(&lslist_lock); + list_add(&ls->ls_list, &lslist); + spin_unlock(&lslist_lock); + + error = kcl_join_service(local_id); + if (error) { + log_error(ls, "service manager join error %d", error); + goto out_reg; + } + + /* The ls isn't actually running until it receives a start() from CMAN. + Neither does it have a global ls id until started. */ + + /* Return the local ID as the lockspace handle. I've left this + cast to a void* as it allows us to replace it with pretty much + anything at a future date without breaking clients. But returning + the address of the lockspace is a bad idea as it could get + forcibly removed, leaving client with a dangling pointer */ + + *lockspace = (void *)(long) local_id; + return 0; + + out_reg: + kcl_unregister_service(ls->ls_local_id); + out_recoverd: + dlm_recoverd_stop(ls); + kfree(ls->ls_dirtbl); + out_lkbfree: + kfree(ls->ls_lkbtbl); + out_rsbfree: + kfree(ls->ls_rsbtbl); + out_lsfree: + kfree(ls); + out: + return error; +} + +/* + * Called by a system like GFS which wants independent lock spaces. + */ + +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags) +{ + int error = -ENOSYS; + + down(&dlmstate_lock); + error = init_internal(); + if (error) + goto out; + + error = new_lockspace(name, namelen, lockspace, flags); + out: + up(&dlmstate_lock); + return error; +} + +/* Return 1 if the lockspace still has active remote locks, + * 2 if the lockspace still has active local locks. + */ +static int lockspace_busy(struct dlm_ls *ls) +{ + int i, lkb_found = 0; + struct dlm_lkb *lkb; + + /* NOTE: We check the lockidtbl here rather than the resource table. + This is because there may be LKBs queued as ASTs that have been + unlinked from their RSBs and are pending deletion once the AST has + been delivered */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + read_lock(&ls->ls_lkbtbl[i].lock); + if (!list_empty(&ls->ls_lkbtbl[i].list)) { + lkb_found = 1; + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, + lkb_idtbl_list) { + if (!lkb->lkb_nodeid) { + read_unlock(&ls->ls_lkbtbl[i].lock); + return 2; + } + } + } + read_unlock(&ls->ls_lkbtbl[i].lock); + } + return lkb_found; +} + +static int release_lockspace(struct dlm_ls *ls, int force) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + struct dlm_recover *rv; + struct list_head *head; + int i; + int busy = lockspace_busy(ls); + + /* Don't destroy a busy lockspace */ + if (busy > force) + return -EBUSY; + + if (force < 3) { + kcl_leave_service(ls->ls_local_id); + kcl_unregister_service(ls->ls_local_id); + } + + dlm_recoverd_stop(ls); + + remove_lockspace(ls); + + /* + * Free direntry structs. + */ + + dlm_dir_clear(ls); + kfree(ls->ls_dirtbl); + + /* + * Free all lkb's on lkbtbl[] lists. 
+ */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + head = &ls->ls_lkbtbl[i].list; + while (!list_empty(head)) { + lkb = list_entry(head->next, struct dlm_lkb, + lkb_idtbl_list); + list_del(&lkb->lkb_idtbl_list); + + if (lkb->lkb_lockqueue_state) + remove_from_lockqueue(lkb); + + if (lkb->lkb_astflags & (AST_COMP | AST_BAST)) + list_del(&lkb->lkb_astqueue); + + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY) + free_lvb(lkb->lkb_lvbptr); + + free_lkb(lkb); + } + } + + kfree(ls->ls_lkbtbl); + + /* + * Free all rsb's on rsbtbl[] lists + */ + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + head = &ls->ls_rsbtbl[i].list; + while (!list_empty(head)) { + rsb = list_entry(head->next, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + + if (rsb->res_lvbptr) + free_lvb(rsb->res_lvbptr); + + free_rsb(rsb); + } + } + + kfree(ls->ls_rsbtbl); + + /* + * Free structures on any other lists + */ + + head = &ls->ls_recover; + while (!list_empty(head)) { + rv = list_entry(head->next, struct dlm_recover, list); + list_del(&rv->list); + kfree(rv); + } + + clear_free_de(ls); + + ls_nodes_clear(ls); + ls_nodes_gone_clear(ls); + if (ls->ls_node_array) + kfree(ls->ls_node_array); + + kfree(ls); + dlm_release(); + module_put(THIS_MODULE); + return 0; +} + + +/* + * Called when a system has released all its locks and is not going to use the + * lockspace any longer. We blindly free everything we're managing for this + * lockspace. Remaining nodes will go through the recovery process as if we'd + * died. The lockspace must continue to function as usual, participating in + * recoveries, until kcl_leave_service returns. + * + * Force has 4 possible values: + * 0 - don't destroy locksapce if it has any LKBs + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs + * 2 - destroy lockspace regardless of LKBs + * 3 - destroy lockspace as part of a forced shutdown + */ + +int dlm_release_lockspace(void *lockspace, int force) +{ + struct dlm_ls *ls; + + ls = find_lockspace_by_local_id(lockspace); + if (!ls) + return -EINVAL; + put_lockspace(ls); + return release_lockspace(ls, force); +} + + +/* Called when the cluster is being shut down dirtily */ +void dlm_emergency_shutdown() +{ + struct dlm_ls *ls; + struct dlm_ls *tmp; + + /* Shut lowcomms down to prevent any socket activity */ + lowcomms_stop_accept(); + + /* Delete the devices that belong the the userland + lockspaces to be deleted. */ + dlm_device_free_devices(); + + /* Now try to clean the lockspaces */ + spin_lock(&lslist_lock); + + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) { + spin_unlock(&lslist_lock); + release_lockspace(ls, 3); + spin_lock(&lslist_lock); + } + + spin_unlock(&lslist_lock); +} + +struct dlm_recover *allocate_dlm_recover(void) +{ + struct dlm_recover *rv; + + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL); + if (rv) + memset(rv, 0, sizeof(struct dlm_recover)); + return rv; +} + +/* + * Called by CMAN on a specific ls. "stop" means set flag which while set + * causes all new requests to ls to be queued and not submitted until flag is + * cleared. stop on a ls also needs to cancel any prior starts on the ls. + * The recoverd thread carries out any work called for by this event. 
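dlm_ls_stop() below freezes a lockspace by clearing LSFL_LS_RUN and taking ls_in_recovery for write, while the normal request paths hold it for read. A hedged, single-threaded userspace sketch of that gate using a pthread rwlock; the function names are invented, and the real code retries rather than immediately queueing when its try-lock fails:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t in_recovery = PTHREAD_RWLOCK_INITIALIZER;
        static int ls_running = 1;

        /* Request path: runs under the read side so recovery cannot start mid-way;
         * if the lockspace is stopped, the request is set aside for later. */
        static void process_request(int id)
        {
                if (!ls_running || pthread_rwlock_tryrdlock(&in_recovery) != 0) {
                        printf("request %d queued for after recovery\n", id);
                        return;
                }
                printf("request %d processed\n", id);
                pthread_rwlock_unlock(&in_recovery);
        }

        /* "stop" callback: blocks until readers drain, then holds the write side. */
        static void lockspace_stop(void)
        {
                ls_running = 0;
                pthread_rwlock_wrlock(&in_recovery);
        }

        /* End of recovery: drop the write side so queued/new requests may proceed. */
        static void lockspace_restart(void)
        {
                ls_running = 1;
                pthread_rwlock_unlock(&in_recovery);
        }

        int main(void)
        {
                process_request(1);     /* processed */
                lockspace_stop();
                process_request(2);     /* queued */
                lockspace_restart();
                process_request(3);     /* processed */
                return 0;
        }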
+ */ + +static int dlm_ls_stop(void *servicedata) +{ + struct dlm_ls *ls = (struct dlm_ls *) servicedata; + int new; + + spin_lock(&ls->ls_recover_lock); + ls->ls_last_stop = ls->ls_last_start; + set_bit(LSFL_LS_STOP, &ls->ls_flags); + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + /* + * This in_recovery lock does two things: + * + * 1) Keeps this function from returning until all threads are out + * of locking routines and locking is truely stopped. + * 2) Keeps any new requests from being processed until it's unlocked + * when recovery is complete. + */ + + if (new) + down_write(&ls->ls_in_recovery); + + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags); + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags); + clear_bit(LSFL_NODES_VALID, &ls->ls_flags); + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags); + + dlm_recoverd_kick(ls); + + return 0; +} + +/* + * Called by CMAN on a specific ls. "start" means enable the lockspace to do + * request processing which first requires that the recovery procedure be + * stepped through with all nodes sharing the lockspace (nodeids). The first + * start on the ls after it's created is a special case and requires some extra + * work like figuring out our own local nodeid. We can't do all this in the + * calling CMAN context, so we must pass this work off to the recoverd thread + * which was created in dlm_init(). The recoverd thread carries out any work + * called for by this event. + */ + +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count, + int event_id, int type) +{ + struct dlm_ls *ls = (struct dlm_ls *) servicedata; + struct dlm_recover *rv; + int error = -ENOMEM; + + rv = allocate_dlm_recover(); + if (!rv) + goto out; + + rv->nodeids = nodeids; + rv->node_count = count; + rv->event_id = event_id; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_last_start == event_id) + log_all(ls, "repeated start %d stop %d finish %d", + event_id, ls->ls_last_stop, ls->ls_last_finish); + ls->ls_last_start = event_id; + list_add_tail(&rv->list, &ls->ls_recover); + set_bit(LSFL_LS_START, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + dlm_recoverd_kick(ls); + error = 0; + + out: + return error; +} + +/* + * Called by CMAN on a specific ls. "finish" means that all nodes which + * received a "start" have completed the start and called kcl_start_done. + * The recoverd thread carries out any work called for by this event. + */ + +static void dlm_ls_finish(void *servicedata, int event_id) +{ + struct dlm_ls *ls = (struct dlm_ls *) servicedata; + + spin_lock(&ls->ls_recover_lock); + ls->ls_last_finish = event_id; + set_bit(LSFL_LS_FINISH, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + dlm_recoverd_kick(ls); +} + +struct kcl_service_ops ls_ops = { + .stop = dlm_ls_stop, + .start = dlm_ls_start, + .finish = dlm_ls_finish +}; diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lockspace.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,29 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKSPACE_DOT_H__ +#define __LOCKSPACE_DOT_H__ + +void dlm_lockspace_init(void); +int dlm_init(void); +int dlm_release(void); +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags); +int dlm_release_lockspace(void *ls, int force); +void dlm_emergency_shutdown(void); +struct dlm_ls *find_lockspace_by_global_id(uint32_t id); +struct dlm_ls *find_lockspace_by_local_id(void *id); +struct dlm_ls *find_lockspace_by_name(char *name, int namelen); +void hold_lockspace(struct dlm_ls *ls); +void put_lockspace(struct dlm_ls *ls); + +#endif /* __LOCKSPACE_DOT_H__ */ diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lowcomms.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1415 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is it's + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * I don't see any problem with the recv thread executing the locking + * code on behalf of remote processes as the locking code is + * short, efficient and never waits. 
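+ *
+ * Concretely, the two threads are dlm_recvd and dlm_sendd below:
+ * dlm_recvd drains the read_sockets list, while dlm_sendd drains the
+ * state_sockets (pending connects) and write_sockets lists; both sleep
+ * on wait queues and are woken whenever work is queued for them.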
+ * + */ + + +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +struct cbuf { + unsigned base; + unsigned len; + unsigned mask; +}; + +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0) +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) +#define CBUF_EMPTY(cb) ((cb)->len == 0) +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \ + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0) +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct rw_semaphore sock_sem; /* Stop connect races */ + struct list_head read_list; /* On this list when ready for reading */ + struct list_head write_list; /* On this list when ready for writing */ + struct list_head state_list; /* On this list when ready to connect */ + unsigned long flags; /* bit 1,2 = We are on the read/write lists */ +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_IS_OTHERCON 4 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + struct list_head listenlist; /* List of allocated listening sockets */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + struct page *rx_page; + struct cbuf cb; + int retries; + atomic_t waiting_requests; +#define MAX_CONNECT_RETRIES 3 + struct connection *othercon; +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +/* "Template" structure for IPv4 and IPv6 used to fill + * in the missing bits when converting between cman (which knows + * nothing about sockaddr structs) and real life where we actually + * have to connect to these addresses. Also one of these structs + * will hold the cached "us" address. + * + * It's an in6 sockaddr just so there's enough space for anything + * we're likely to see here. 
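+ *
+ * local_addr itself is filled in by lowcomms_our_nodeid() from the
+ * first address cman returns for this node, and its sin6_family is
+ * what decides whether IPv4 or IPv6 sockaddrs are built elsewhere.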
+ */ +static struct sockaddr_in6 local_addr; + +/* Manage daemons */ +static struct task_struct *recv_task; +static struct task_struct *send_task; + +static wait_queue_t lowcomms_send_waitq_head; +static wait_queue_head_t lowcomms_send_waitq; +static wait_queue_t lowcomms_recv_waitq_head; +static wait_queue_head_t lowcomms_recv_waitq; + +/* An array of pointers to connections, indexed by NODEID */ +static struct connection **connections; +static struct rw_semaphore connections_lock; +static kmem_cache_t *con_cache; +static int conn_array_size; +static atomic_t accepting; + +/* List of sockets that have reads pending */ +static struct list_head read_sockets; +static spinlock_t read_sockets_lock; + +/* List of sockets which have writes pending */ +static struct list_head write_sockets; +static spinlock_t write_sockets_lock; + +/* List of sockets which have connects pending */ +static struct list_head state_sockets; +static spinlock_t state_sockets_lock; + +/* List of allocated listen sockets */ +static struct list_head listen_sockets; + +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr); +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len); + + +static struct connection *nodeid2con(int nodeid, int allocation) +{ + struct connection *con = NULL; + + down_read(&connections_lock); + if (nodeid >= conn_array_size) { + int new_size = nodeid + dlm_config.conn_increment; + struct connection **new_conns; + + new_conns = kmalloc(sizeof(struct connection *) * + new_size, allocation); + if (!new_conns) + goto finish; + + up_read(&connections_lock); + /* The worst that can happen here (I think), is that + we get two consecutive reallocations */ + down_write(&connections_lock); + + memset(new_conns, 0, sizeof(struct connection *) * new_size); + memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); + conn_array_size = new_size; + kfree(connections); + connections = new_conns; + + up_write(&connections_lock); + down_read(&connections_lock); + } + + con = connections[nodeid]; + if (con == NULL && allocation) { + con = kmem_cache_alloc(con_cache, allocation); + if (!con) + goto finish; + + memset(con, 0, sizeof(*con)); + con->nodeid = nodeid; + init_rwsem(&con->sock_sem); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + + connections[nodeid] = con; + } + + finish: + up_read(&connections_lock); + return con; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + struct connection *con = sock2con(sk); + + atomic_inc(&con->waiting_requests); + if (test_and_set_bit(CF_READ_PENDING, &con->flags)) + return; + + spin_lock_bh(&read_sockets_lock); + list_add_tail(&con->read_list, &read_sockets); + spin_unlock_bh(&read_sockets_lock); + + wake_up_interruptible(&lowcomms_recv_waitq); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + return; + + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + return; + if (!atomic_read(&accepting)) + return; + + spin_lock_bh(&state_sockets_lock); + list_add_tail(&con->state_list, &state_sockets); + 
spin_unlock_bh(&state_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static void lowcomms_state_change(struct sock *sk) +{ +/* struct connection *con = sock2con(sk); */ + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + lowcomms_write_space(sk); + break; + + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_TIME_WAIT: + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_CLOSING: + /* FIXME: I think this causes more trouble than it solves. + lowcomms wil reconnect anyway when there is something to + send. This just attempts reconnection if a node goes down! + */ + /* lowcomms_connect_sock(con); */ + break; + + default: + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state); + break; + } +} + +/* Make a socket active */ +static int add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + + return 0; +} + +/* Add the port number to an IP6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port, + int *addr_len) +{ + saddr->sin6_family = local_addr.sin6_family; + if (local_addr.sin6_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + } + else { + saddr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, int and_other) +{ + down_write(&con->sock_sem); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + if (con->othercon && and_other) { + /* Argh! recursion in kernel code! + Actually, this isn't a list so it + will only re-enter once. + */ + close_connection(con->othercon, TRUE); + } + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + up_write(&con->sock_sem); +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg; + struct iovec iov[2]; + mm_segment_t fs; + unsigned len; + int r; + int call_again_soon = 0; + + down_read(&con->sock_sem); + + if (con->sock == NULL) + goto out; + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE); + } + + /* + * To avoid doing too many short reads, we will reschedule for + * another time if there are less than 20 bytes left in the buffer. + */ + if (!CBUF_MAY_ADD(&con->cb, 20)) + goto out_resched; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = 0; + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. 
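+	 *
+	 * For example, on a 4096 byte page with cb.base = 4000 and
+	 * cb.len = 50, CBUF_DATA() is (4000 + 50) & 4095 = 4050, so
+	 * iov[0] covers bytes 4050..4095 and iov[1] (set up below)
+	 * covers the wrapped region 0..3999.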
+ */ + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb); + iov[1].iov_len = 0; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (CBUF_DATA(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + msg.msg_iovlen = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + fs = get_fs(); + set_fs(get_ds()); + r = ret = sock_recvmsg(con->sock, &msg, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + set_fs(fs); + + if (ret <= 0) + goto out_close; + if (ret == len) + call_again_soon = 1; + CBUF_ADD(&con->cb, ret); + ret = midcomms_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d\n", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + CBUF_EAT(&con->cb, ret); + + if (CBUF_EMPTY(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + out: + if (call_again_soon) + goto out_resched; + up_read(&con->sock_sem); + ret = 0; + goto out_ret; + + out_resched: + lowcomms_data_ready(con->sock->sk, 0); + up_read(&con->sock_sem); + ret = 0; + goto out_ret; + + out_close: + up_read(&con->sock_sem); + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, FALSE); + lowcomms_connect_sock(con); + } + + out_ret: + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_in6 peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + + memset(&peeraddr, 0, sizeof(peeraddr)); + newsock = sock_alloc(); + if (!newsock) + return -ENOMEM; + + down_read(&con->sock_sem); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len); + if (nodeid == 0) { + printk("dlm: connect from non cluster node\n"); + sock_release(newsock); + up_read(&con->sock_sem); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. 
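+	 * (each node's dlm_sendd is connecting out at the same moment as
+	 * its listening socket accepts the peer's incoming connection).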
+ * TEMPORARY FIX: + * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_KERNEL); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + down_write(&newcon->sock_sem); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_alloc(con_cache, GFP_KERNEL); + if (!othercon) { + printk("dlm: failed to allocate incoming socket\n"); + up_write(&newcon->sock_sem); + result = -ENOMEM; + goto accept_err; + } + memset(othercon, 0, sizeof(*othercon)); + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + init_rwsem(&othercon->sock_sem); + set_bit(CF_IS_OTHERCON, &othercon->flags); + newcon->othercon = othercon; + } + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + + } + + up_write(&newcon->sock_sem); + + /* + * Add it to the active queue in case we got data + * beween processing the accept adding the socket + * to the read_sockets list + */ + lowcomms_data_ready(newsock->sk, 0); + up_read(&con->sock_sem); + + return 0; + + accept_err: + up_read(&con->sock_sem); + sock_release(newsock); + + if (result != -EAGAIN) + printk("dlm: error accepting connection from node: %d\n", result); + return result; +} + +/* Connect a new socket to its peer */ +static int connect_to_sock(struct connection *con) +{ + int result = -EHOSTUNREACH; + struct sockaddr_in6 saddr; + int addr_len; + struct socket *sock; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return 0; + } + + down_write(&con->sock_sem); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + // FIXME not sure this should happen, let alone like this. + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0) + goto out_err; + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); + + add_sock(sock, con); + + log_print("connecting to %d", con->nodeid); + result = + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result != 0) + goto out_err; + + out: + up_write(&con->sock_sem); + /* + * Returning an error here means we've given up trying to connect to + * a remote node, otherwise we return 0 and reschedule the connetion + * attempt + */ + return result; + + out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. 
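+	 *
+	 * At the moment EHOSTUNREACH, ENETUNREACH, ENETDOWN, EINVAL and
+	 * EPROTONOSUPPORT are treated as fatal; any other error simply
+	 * requeues the connect via lowcomms_connect_sock() until
+	 * MAX_CONNECT_RETRIES is exceeded.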
+ */ + if (result != -EHOSTUNREACH && result != -ENETUNREACH && + result != -ENETDOWN && result != EINVAL + && result != -EPROTONOSUPPORT) { + lowcomms_connect_sock(con); + result = 0; + } + goto out; +} + +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len) +{ + struct socket *sock = NULL; + mm_segment_t fs; + int result = 0; + int one = 1; + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr; + + /* Create a socket to communicate with */ + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) { + printk("dlm: Can't create listening comms socket\n"); + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result); + } + sock->sk->sk_user_data = con; + con->rx_action = accept_from_sock; + con->sock = sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Set keepalive failed: %d\n", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + + create_out: + return sock; +} + + +/* Listen on all interfaces */ +static int listen_for_all(void) +{ + int result = 0; + int nodeid; + struct socket *sock = NULL; + struct list_head *addr_list; + struct connection *con = nodeid2con(0, GFP_KERNEL); + struct connection *temp; + struct cluster_node_addr *node_addr; + char local_addr[sizeof(struct sockaddr_in6)]; + + /* This will also fill in local_addr */ + nodeid = lowcomms_our_nodeid(); + + addr_list = kcl_get_node_addresses(nodeid); + if (!addr_list) { + printk("dlm: cannot initialise comms layer\n"); + result = -ENOTCONN; + goto create_out; + } + + list_for_each_entry(node_addr, addr_list, list) { + + if (!con) { + con = kmem_cache_alloc(con_cache, GFP_KERNEL); + if (!con) { + printk("dlm: failed to allocate listen socket\n"); + result = -ENOMEM; + goto create_free; + } + memset(con, 0, sizeof(*con)); + init_rwsem(&con->sock_sem); + spin_lock_init(&con->writequeue_lock); + INIT_LIST_HEAD(&con->writequeue); + set_bit(CF_IS_OTHERCON, &con->flags); + } + + memcpy(local_addr, node_addr->addr, node_addr->addr_len); + sock = create_listen_sock(con, local_addr, + node_addr->addr_len); + if (sock) { + add_sock(sock, con); + + /* Keep a list of dynamically allocated listening sockets + so we can free them at shutdown */ + if (test_bit(CF_IS_OTHERCON, &con->flags)) { + list_add_tail(&con->listenlist, &listen_sockets); + } + } + else { + result = -EADDRINUSE; + kmem_cache_free(con_cache, con); + goto create_free; + } + + con = NULL; + } + + create_out: + return result; + + create_free: + /* Free up any dynamically allocated listening sockets */ + list_for_each_entry_safe(con, temp, &listen_sockets, listenlist) { + sock_release(con->sock); + kmem_cache_free(con_cache, con); + } + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection 
*con, + int allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len, + int allocation, char **ppc) +{ + struct connection *con = nodeid2con(nodeid, allocation); + struct writequeue_entry *e; + int offset = 0; + int users = 0; + + if (!con) + return NULL; + + if (!atomic_read(&accepting)) + return NULL; + + spin_lock(&con->writequeue_lock); + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if (((struct list_head *) e == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + if (users == 0) + kmap(e->page); + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + users = e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void lowcomms_commit_buffer(struct writequeue_entry *e) +{ + struct connection *con = e->con; + int users; + + if (!atomic_read(&accepting)) + return; + + spin_lock(&con->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + kunmap(e->page); + spin_unlock(&con->writequeue_lock); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); + } + return; + + out: + spin_unlock(&con->writequeue_lock); + return; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Send a message */ +static int send_to_sock(struct connection *con) +{ + int ret = 0; + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + + down_read(&con->sock_sem); + if (con->sock == NULL) + goto out_connect; + + sendpage = con->sock->ops->sendpage; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if (len) { + ret = sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) + goto out; + if (ret <= 0) + goto send_error; + } + + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + continue; + } + } + spin_unlock(&con->writequeue_lock); + out: + up_read(&con->sock_sem); + return ret; + + send_error: + up_read(&con->sock_sem); + close_connection(con, FALSE); + lowcomms_connect_sock(con); + return ret; + + out_connect: + up_read(&con->sock_sem); + lowcomms_connect_sock(con); + return 0; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct list_head *list; + struct 
list_head *temp; + + spin_lock(&con->writequeue_lock); + list_for_each_safe(list, temp, &con->writequeue) { + struct writequeue_entry *e = + list_entry(list, struct writequeue_entry, list); + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int lowcomms_close(int nodeid) +{ + struct connection *con; + + if (!connections) + goto out; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + close_connection(con, TRUE); + clean_one_writequeue(con); + atomic_set(&con->waiting_requests, 0); + } + return 0; + + out: + return -1; +} + +/* API send message call, may queue the request */ +/* N.B. This is the old interface - use the new one for new calls */ +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation) +{ + struct writequeue_entry *e; + char *b; + + e = lowcomms_get_buffer(nodeid, len, allocation, &b); + if (e) { + memcpy(b, buf, len); + lowcomms_commit_buffer(e); + return 0; + } + return -ENOBUFS; +} + +/* Look for activity on active sockets */ +static void process_sockets(void) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock_bh(&read_sockets_lock); + list_for_each_safe(list, temp, &read_sockets) { + struct connection *con = + list_entry(list, struct connection, read_list); + list_del(&con->read_list); + clear_bit(CF_READ_PENDING, &con->flags); + + spin_unlock_bh(&read_sockets_lock); + + /* This can reach zero if we are processing requests + * as they come in. + */ + if (atomic_read(&con->waiting_requests) == 0) { + spin_lock_bh(&read_sockets_lock); + continue; + } + + do { + con->rx_action(con); + } while (!atomic_dec_and_test(&con->waiting_requests) && + !kthread_should_stop()); + + /* Don't starve out everyone else */ + schedule(); + spin_lock_bh(&read_sockets_lock); + } + spin_unlock_bh(&read_sockets_lock); +} + +/* Try to send any messages that are pending + */ +static void process_output_queue(void) +{ + struct list_head *list; + struct list_head *temp; + int ret; + + spin_lock_bh(&write_sockets_lock); + list_for_each_safe(list, temp, &write_sockets) { + struct connection *con = + list_entry(list, struct connection, write_list); + list_del(&con->write_list); + clear_bit(CF_WRITE_PENDING, &con->flags); + + spin_unlock_bh(&write_sockets_lock); + + ret = send_to_sock(con); + if (ret < 0) { + } + spin_lock_bh(&write_sockets_lock); + } + spin_unlock_bh(&write_sockets_lock); +} + +static void process_state_queue(void) +{ + struct list_head *list; + struct list_head *temp; + int ret; + + spin_lock_bh(&state_sockets_lock); + list_for_each_safe(list, temp, &state_sockets) { + struct connection *con = + list_entry(list, struct connection, state_list); + list_del(&con->state_list); + clear_bit(CF_CONNECT_PENDING, &con->flags); + spin_unlock_bh(&state_sockets_lock); + + ret = connect_to_sock(con); + if (ret < 0) { + } + spin_lock_bh(&state_sockets_lock); + } + spin_unlock_bh(&state_sockets_lock); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + int nodeid; + + for (nodeid = 1; nodeid < conn_array_size; nodeid++) { + struct connection *con = nodeid2con(nodeid, 0); + + if (con) + clean_one_writequeue(con); + } +} + +static int read_list_empty(void) +{ + int status; + + spin_lock_bh(&read_sockets_lock); + status = list_empty(&read_sockets); + spin_unlock_bh(&read_sockets_lock); + + return status; +} + +/* DLM Transport comms receive daemon */ +static int 
dlm_recvd(void *data) +{ + init_waitqueue_head(&lowcomms_recv_waitq); + init_waitqueue_entry(&lowcomms_recv_waitq_head, current); + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (read_list_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + process_sockets(); + } + + return 0; +} + +static int write_and_state_lists_empty(void) +{ + int status; + + spin_lock_bh(&write_sockets_lock); + status = list_empty(&write_sockets); + spin_unlock_bh(&write_sockets_lock); + + spin_lock_bh(&state_sockets_lock); + if (list_empty(&state_sockets) == 0) + status = 0; + spin_unlock_bh(&state_sockets_lock); + + return status; +} + +/* DLM Transport send daemon */ +static int dlm_sendd(void *data) +{ + init_waitqueue_head(&lowcomms_send_waitq); + init_waitqueue_entry(&lowcomms_send_waitq_head, current); + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (write_and_state_lists_empty()) + schedule(); + set_current_state(TASK_RUNNING); + + process_state_queue(); + process_output_queue(); + } + + return 0; +} + +static void daemons_stop(void) +{ + kthread_stop(recv_task); + kthread_stop(send_task); +} + +static int daemons_start(void) +{ + struct task_struct *p; + int error; + + p = kthread_run(dlm_recvd, NULL, 0, "dlm_recvd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_recvd %d", error); + return error; + } + recv_task = p; + + p = kthread_run(dlm_sendd, NULL, 0, "dlm_sendd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_sendd %d", error); + kthread_stop(recv_task); + return error; + } + send_task = p; + + return 0; +} + +/* + * Return the largest buffer size we can cope with. + */ +int lowcomms_max_buffer_size(void) +{ + return PAGE_CACHE_SIZE; +} + +void lowcomms_stop(void) +{ + int i; + struct connection *temp; + struct connection *lcon; + + atomic_set(&accepting, 0); + + /* Set all the activity flags to prevent any + socket activity. + */ + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) + connections[i]->flags = 0x7; + } + daemons_stop(); + clean_writequeues(); + + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) { + close_connection(connections[i], TRUE); + if (connections[i]->othercon) + kmem_cache_free(con_cache, connections[i]->othercon); + kmem_cache_free(con_cache, connections[i]); + } + } + + kfree(connections); + connections = NULL; + + /* Free up any dynamically allocated listening sockets */ + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) { + sock_release(lcon->sock); + kmem_cache_free(con_cache, lcon); + } + + kmem_cache_destroy(con_cache); + kcl_releaseref_cluster(); +} + +/* This is quite likely to sleep... 
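+ * (it allocates the connection array and connection cache, binds and
+ * listens on the cluster address(es), and starts the dlm_recvd and
+ * dlm_sendd kernel threads)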
*/ +int lowcomms_start(void) +{ + int error = 0; + struct connection *temp; + struct connection *lcon; + + INIT_LIST_HEAD(&read_sockets); + INIT_LIST_HEAD(&write_sockets); + INIT_LIST_HEAD(&state_sockets); + INIT_LIST_HEAD(&listen_sockets); + + spin_lock_init(&read_sockets_lock); + spin_lock_init(&write_sockets_lock); + spin_lock_init(&state_sockets_lock); + init_rwsem(&connections_lock); + + error = -ENOTCONN; + if (kcl_addref_cluster()) + goto out; + + /* + * Temporarily initialise the waitq head so that lowcomms_send_message + * doesn't crash if it gets called before the thread is fully + * initialised + */ + init_waitqueue_head(&lowcomms_send_waitq); + + error = -ENOMEM; + connections = kmalloc(sizeof(struct connection *) * + dlm_config.conn_increment, GFP_KERNEL); + if (!connections) + goto out; + + memset(connections, 0, + sizeof(struct connection *) * dlm_config.conn_increment); + + conn_array_size = dlm_config.conn_increment; + + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, NULL, NULL); + if (!con_cache) + goto fail_free_conn; + + + /* Start listening */ + error = listen_for_all(); + if (error) + goto fail_unlisten; + + error = daemons_start(); + if (error) + goto fail_unlisten; + + atomic_set(&accepting, 1); + + return 0; + + fail_unlisten: + close_connection(connections[0], 0); + kmem_cache_free(con_cache, connections[0]); + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) { + sock_release(lcon->sock); + kmem_cache_free(con_cache, lcon); + } + + kmem_cache_destroy(con_cache); + + fail_free_conn: + kcl_releaseref_cluster(); + kfree(connections); + + out: + return error; +} + +/* Don't accept any more outgoing work */ +void lowcomms_stop_accept() +{ + atomic_set(&accepting, 0); +} + +/* Cluster Manager interface functions for looking up + nodeids and IP addresses by each other +*/ + +/* Return the IP address of a node given its NODEID */ +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr) +{ + struct list_head *addrs; + struct cluster_node_addr *node_addr; + struct cluster_node_addr *current_addr = NULL; + struct sockaddr_in6 *saddr; + int interface; + int i; + + addrs = kcl_get_node_addresses(nodeid); + if (!addrs) + return -1; + + interface = kcl_get_current_interface(); + + /* Look for address number */ + i=0; /* i/f numbers start at 1 */ + list_for_each_entry(node_addr, addrs, list) { + if (interface == ++i) { + current_addr = node_addr; + break; + } + } + + /* If that failed then just use the first one */ + if (!current_addr) + current_addr = (struct cluster_node_addr *)addrs->next; + + saddr = (struct sockaddr_in6 *)current_addr->addr; + + /* Extract the IP address */ + if (local_addr.sin6_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr; + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } + else { + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr; + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr)); + } + + return 0; +} + +/* Return the NODEID for a node given its sockaddr */ +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len) +{ + struct kcl_cluster_node node; + struct sockaddr_in6 ipv6_addr; + struct sockaddr_in ipv4_addr; + + if (local_addr.sin6_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *)addr; + memcpy(&ipv4_addr, &local_addr, addr_len); + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, 
sizeof(ipv4_addr.sin_addr)); + + addr = (struct sockaddr *)&ipv4_addr; + } + else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; + memcpy(&ipv6_addr, &local_addr, addr_len); + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr)); + + addr = (struct sockaddr *)&ipv6_addr; + } + + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0) + return node.node_id; + else + return 0; +} + +int lowcomms_our_nodeid(void) +{ + struct kcl_cluster_node node; + struct list_head *addrs; + struct cluster_node_addr *first_addr; + static int our_nodeid = 0; + + if (our_nodeid) + return our_nodeid; + + if (kcl_get_node_by_nodeid(0, &node) == -1) + return 0; + + our_nodeid = node.node_id; + + /* Fill in the "template" structure */ + addrs = kcl_get_node_addresses(our_nodeid); + if (!addrs) + return 0; + + first_addr = (struct cluster_node_addr *) addrs->next; + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len); + + return node.node_id; +} +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/lowcomms.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOWCOMMS_DOT_H__ +#define __LOWCOMMS_DOT_H__ + +/* The old interface */ +int lowcomms_send_message(int csid, char *buf, int len, int allocation); + +/* The new interface */ +struct writequeue_entry; +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len, + int allocation, char **ppc); +extern void lowcomms_commit_buffer(struct writequeue_entry *e); + +int lowcomms_start(void); +void lowcomms_stop(void); +void lowcomms_stop_accept(void); +int lowcomms_close(int nodeid); +int lowcomms_max_buffer_size(void); + +int lowcomms_our_nodeid(void); + +#endif /* __LOWCOMMS_DOT_H__ */ diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/main.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,93 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include + +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "ast.h" +#include "lkb.h" +#include "nodes.h" +#include "locking.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "lowcomms.h" + +int dlm_device_init(void); +void dlm_device_exit(void); +void dlm_proc_init(void); +void dlm_proc_exit(void); + + +/* Cluster manager callbacks, we want to know if a node dies + N.B. this is independent of lockspace-specific event callbacks from SM */ + +static void cman_callback(kcl_callback_reason reason, long arg) +{ + /* This is unconditional. so do what we can to tidy up */ + if (reason == LEAVING) { + dlm_emergency_shutdown(); + } +} + +int __init init_dlm(void) +{ + dlm_proc_init(); + dlm_lockspace_init(); + dlm_nodes_init(); + dlm_device_init(); + dlm_memory_init(); + dlm_config_init(); + + kcl_add_callback(cman_callback); + + printk("DLM %s (built %s %s) installed\n", + DLM_RELEASE_NAME, __DATE__, __TIME__); + + return 0; +} + +void __exit exit_dlm(void) +{ + kcl_remove_callback(cman_callback); + + dlm_device_exit(); + dlm_memory_exit(); + dlm_config_exit(); + dlm_proc_exit(); +} + +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_dlm); +module_exit(exit_dlm); + +EXPORT_SYMBOL(dlm_init); +EXPORT_SYMBOL(dlm_release); +EXPORT_SYMBOL(dlm_new_lockspace); +EXPORT_SYMBOL(dlm_release_lockspace); +EXPORT_SYMBOL(dlm_lock); +EXPORT_SYMBOL(dlm_unlock); +EXPORT_SYMBOL(dlm_debug_dump); +EXPORT_SYMBOL(dlm_locks_dump); diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/memory.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,238 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* memory.c + * + * memory allocation routines + * + */ + +#include "dlm_internal.h" +#include "memory.h" +#include "config.h" + +/* as the man says...Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) + +static kmem_cache_t *rsb_cache_small; +static kmem_cache_t *rsb_cache_large; +static kmem_cache_t *lkb_cache; +static kmem_cache_t *lvb_cache; +static kmem_cache_t *resdir_cache_large; +static kmem_cache_t *resdir_cache_small; + +/* The thresholds above which we allocate large RSBs/direntry rather than small + * ones. 
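+ * (Names shorter than LARGE_RSB_NAME / LARGE_RES_NAME come from the
+ * small caches; anything longer, up to DLM_RESNAME_MAXLEN, comes from
+ * the large ones.)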
This must make the resultant structure end on a word boundary */ +#define LARGE_RSB_NAME 28 +#define LARGE_RES_NAME 28 + +int dlm_memory_init() +{ + int ret = -ENOMEM; + + + rsb_cache_small = + kmem_cache_create("dlm_rsb(small)", + (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), + __alignof__(struct dlm_rsb), 0, NULL, NULL); + if (!rsb_cache_small) + goto out; + + rsb_cache_large = + kmem_cache_create("dlm_rsb(large)", + sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN, + __alignof__(struct dlm_rsb), 0, NULL, NULL); + if (!rsb_cache_large) + goto out_free_rsbs; + + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), + __alignof__(struct dlm_lkb), 0, NULL, NULL); + if (!lkb_cache) + goto out_free_rsbl; + + resdir_cache_large = + kmem_cache_create("dlm_resdir(l)", + sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN, + __alignof__(struct dlm_direntry), 0, NULL, NULL); + if (!resdir_cache_large) + goto out_free_lkb; + + resdir_cache_small = + kmem_cache_create("dlm_resdir(s)", + (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), + __alignof__(struct dlm_direntry), 0, NULL, NULL); + if (!resdir_cache_small) + goto out_free_resl; + + /* LVB cache also holds ranges, so should be 64bit aligned */ + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN, + __alignof__(uint64_t), 0, NULL, NULL); + if (!lkb_cache) + goto out_free_ress; + + ret = 0; + goto out; + + out_free_ress: + kmem_cache_destroy(resdir_cache_small); + + out_free_resl: + kmem_cache_destroy(resdir_cache_large); + + out_free_lkb: + kmem_cache_destroy(lkb_cache); + + out_free_rsbl: + kmem_cache_destroy(rsb_cache_large); + + out_free_rsbs: + kmem_cache_destroy(rsb_cache_small); + + out: + return ret; +} + +void dlm_memory_exit() +{ + kmem_cache_destroy(rsb_cache_large); + kmem_cache_destroy(rsb_cache_small); + kmem_cache_destroy(lkb_cache); + kmem_cache_destroy(resdir_cache_small); + kmem_cache_destroy(resdir_cache_large); + kmem_cache_destroy(lvb_cache); +} + +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) +{ + struct dlm_rsb *r; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + + if (namelen >= LARGE_RSB_NAME) + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation); + else + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation); + + if (r) + memset(r, 0, sizeof(struct dlm_rsb) + namelen); + + return r; +} + +void free_rsb(struct dlm_rsb *r) +{ + int length = r->res_length; + +#ifdef POISON + memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length); +#endif + + if (length >= LARGE_RSB_NAME) + kmem_cache_free(rsb_cache_large, r); + else + kmem_cache_free(rsb_cache_small, r); +} + +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *l; + + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation); + if (l) + memset(l, 0, sizeof(struct dlm_lkb)); + + return l; +} + +void free_lkb(struct dlm_lkb *l) +{ +#ifdef POISON + memset(l, 0xAA, sizeof(struct dlm_lkb)); +#endif + kmem_cache_free(lkb_cache, l); +} + +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) +{ + struct dlm_direntry *rd; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + + if (namelen >= LARGE_RES_NAME) + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation); + else + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation); + + if (rd) + memset(rd, 0, sizeof(struct dlm_direntry)); + + return rd; +} + +void free_direntry(struct dlm_direntry *de) +{ + if (de->length >= LARGE_RES_NAME) + kmem_cache_free(resdir_cache_large, de); + else 
+ kmem_cache_free(resdir_cache_small, de); +} + +char *allocate_lvb(struct dlm_ls *ls) +{ + char *l; + + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation); + if (l) + memset(l, 0, DLM_LVB_LEN); + + return l; +} + +void free_lvb(char *l) +{ + kmem_cache_free(lvb_cache, l); +} + +/* Ranges are allocated from the LVB cache as they are the same size (4x64 + * bits) */ +uint64_t *allocate_range(struct dlm_ls * ls) +{ + uint64_t *l; + + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation); + if (l) + memset(l, 0, DLM_LVB_LEN); + + return l; +} + +void free_range(uint64_t *l) +{ + kmem_cache_free(lvb_cache, l); +} + +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls) +{ + struct dlm_rcom *rc; + + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation); + if (rc) + memset(rc, 0, dlm_config.buffer_size); + + return rc; +} + +void free_rcom_buffer(struct dlm_rcom *rc) +{ + kfree(rc); +} diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/memory.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,32 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMORY_DOT_H__ +#define __MEMORY_DOT_H__ + +int dlm_memory_init(void); +void dlm_memory_exit(void); +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); +void free_rsb(struct dlm_rsb *r); +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); +void free_lkb(struct dlm_lkb *l); +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); +void free_direntry(struct dlm_direntry *de); +char *allocate_lvb(struct dlm_ls *ls); +void free_lvb(char *l); +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls); +void free_rcom_buffer(struct dlm_rcom *rc); +uint64_t *allocate_range(struct dlm_ls *ls); +void free_range(uint64_t *l); + +#endif /* __MEMORY_DOT_H__ */ diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/midcomms.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,355 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * midcomms.c + * + * This is the appallingly named "mid-level" comms layer. 
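+ * (it sits between lowcomms.c, which only moves bytes, and the locking
+ * core, which only understands messages).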
+ * + * Its purpose is to take packets from the "real" comms layer, + * split them up into packets and pass them to the interested + * part of the locking mechanism. + * + * It also takes messages from the locking layer, formats them + * into packets and sends them to the comms layer. + * + * It knows the format of the mid-level messages used and nodeidss + * but it does not know how to resolve a nodeid into an IP address + * or any of the comms channel details + * + */ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "lockqueue.h" +#include "nodes.h" +#include "reccomms.h" +#include "config.h" + +/* Byteorder routines */ + +static void host_to_network(void *msg) +{ + struct dlm_header *head = msg; + struct dlm_request *req = msg; + struct dlm_reply *rep = msg; + struct dlm_query_request *qreq = msg; + struct dlm_query_reply *qrep= msg; + struct dlm_rcom *rc = msg; + + /* Force into network byte order */ + + /* + * Do the common header first + */ + + head->rh_length = cpu_to_le16(head->rh_length); + head->rh_lockspace = cpu_to_le32(head->rh_lockspace); + /* Leave the lkid alone as it is transparent at the remote end */ + + /* + * Do the fields in the remlockrequest or remlockreply structs + */ + + switch (req->rr_header.rh_cmd) { + + case GDLM_REMCMD_LOCKREQUEST: + case GDLM_REMCMD_CONVREQUEST: + req->rr_range_start = cpu_to_le64(req->rr_range_start); + req->rr_range_end = cpu_to_le64(req->rr_range_end); + /* Deliberate fall through */ + case GDLM_REMCMD_UNLOCKREQUEST: + case GDLM_REMCMD_LOOKUP: + case GDLM_REMCMD_LOCKGRANT: + case GDLM_REMCMD_SENDBAST: + case GDLM_REMCMD_SENDCAST: + case GDLM_REMCMD_REM_RESDATA: + req->rr_flags = cpu_to_le32(req->rr_flags); + req->rr_status = cpu_to_le32(req->rr_status); + break; + + case GDLM_REMCMD_LOCKREPLY: + rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate); + rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid); + rep->rl_status = cpu_to_le32(rep->rl_status); + break; + + case GDLM_REMCMD_RECOVERMESSAGE: + case GDLM_REMCMD_RECOVERREPLY: + rc->rc_msgid = cpu_to_le32(rc->rc_msgid); + rc->rc_datalen = cpu_to_le16(rc->rc_datalen); + break; + + case GDLM_REMCMD_QUERY: + qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid); + qreq->rq_query = cpu_to_le32(qreq->rq_query); + qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks); + break; + + case GDLM_REMCMD_QUERYREPLY: + qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks); + qrep->rq_status = cpu_to_le32(qrep->rq_status); + qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount); + qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount); + qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount); + break; + + default: + printk("dlm: warning, unknown REMCMD type %u\n", + req->rr_header.rh_cmd); + } +} + +static void network_to_host(void *msg) +{ + struct dlm_header *head = msg; + struct dlm_request *req = msg; + struct dlm_reply *rep = msg; + struct dlm_query_request *qreq = msg; + struct dlm_query_reply *qrep = msg; + struct dlm_rcom *rc = msg; + + /* Force into host byte order */ + + /* + * Do the common header first + */ + + head->rh_length = le16_to_cpu(head->rh_length); + head->rh_lockspace = le32_to_cpu(head->rh_lockspace); + /* Leave the lkid alone as it is transparent at the remote end */ + + /* + * Do the fields in the remlockrequest or remlockreply structs + */ + + switch (req->rr_header.rh_cmd) { + + case GDLM_REMCMD_LOCKREQUEST: + case GDLM_REMCMD_CONVREQUEST: + req->rr_range_start = le64_to_cpu(req->rr_range_start); + req->rr_range_end = le64_to_cpu(req->rr_range_end); + case 
GDLM_REMCMD_LOOKUP: + case GDLM_REMCMD_UNLOCKREQUEST: + case GDLM_REMCMD_LOCKGRANT: + case GDLM_REMCMD_SENDBAST: + case GDLM_REMCMD_SENDCAST: + case GDLM_REMCMD_REM_RESDATA: + /* Actually, not much to do here as the remote lock IDs are + * transparent too */ + req->rr_flags = le32_to_cpu(req->rr_flags); + req->rr_status = le32_to_cpu(req->rr_status); + break; + + case GDLM_REMCMD_LOCKREPLY: + rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate); + rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid); + rep->rl_status = le32_to_cpu(rep->rl_status); + break; + + case GDLM_REMCMD_RECOVERMESSAGE: + case GDLM_REMCMD_RECOVERREPLY: + rc->rc_msgid = le32_to_cpu(rc->rc_msgid); + rc->rc_datalen = le16_to_cpu(rc->rc_datalen); + break; + + + case GDLM_REMCMD_QUERY: + qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid); + qreq->rq_query = le32_to_cpu(qreq->rq_query); + qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks); + break; + + case GDLM_REMCMD_QUERYREPLY: + qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks); + qrep->rq_status = le32_to_cpu(qrep->rq_status); + qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount); + qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount); + qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount); + break; + + default: + printk("dlm: warning, unknown REMCMD type %u\n", + req->rr_header.rh_cmd); + } +} + +static void copy_from_cb(void *dst, const void *base, unsigned offset, + unsigned len, unsigned limit) +{ + unsigned copy = len; + + if ((copy + offset) > limit) + copy = limit - offset; + memcpy(dst, base + offset, copy); + len -= copy; + if (len) + memcpy(dst + copy, base, len); +} + +static void khexdump(const unsigned char *c, int len) +{ + while (len > 16) { + printk(KERN_INFO + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], + c[9], c[10], c[11], c[12], c[13], c[14], c[15]); + len -= 16; + c += 16; + } + while (len > 4) { + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2], + c[3]); + len -= 4; + c += 4; + } + while (len > 0) { + printk(KERN_INFO "%02x\n", c[0]); + len--; + c++; + } +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + * + * Only complete messages are processed here, any "spare" bytes from + * the end of a buffer are saved and tacked onto the front of the next + * message that comes in. I doubt this will happen very often but we + * need to be able to cope with it and I don't want the task to be waiting + * for packets to come in when there is useful work to be done. + * + */ +int midcomms_process_incoming_buffer(int nodeid, const void *base, + unsigned offset, unsigned len, + unsigned limit) +{ + unsigned char __tmp[sizeof(struct dlm_header) + 64]; + struct dlm_header *msg = (struct dlm_header *) __tmp; + int ret = 0; + int err = 0; + unsigned msglen; + __u32 id, space; + + while (len > sizeof(struct dlm_header)) { + /* Get message header and check it over */ + copy_from_cb(msg, base, offset, sizeof(struct dlm_header), + limit); + msglen = le16_to_cpu(msg->rh_length); + id = msg->rh_lkid; + space = msg->rh_lockspace; + + /* Check message size */ + err = -EINVAL; + if (msglen < sizeof(struct dlm_header)) + break; + err = -E2BIG; + if (msglen > dlm_config.buffer_size) { + printk("dlm: message size from %d too big %d(pkt len=%d)\n", nodeid, msglen, len); + khexdump((const unsigned char *) msg, len); + break; + } + err = 0; + + /* Not enough in buffer yet? 
wait for some more */ + if (msglen > len) + break; + + /* Make sure our temp buffer is large enough */ + if (msglen > sizeof(__tmp) && + msg == (struct dlm_header *) __tmp) { + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); + if (msg == NULL) + return ret; + } + + copy_from_cb(msg, base, offset, msglen, limit); + BUG_ON(id != msg->rh_lkid); + BUG_ON(space != msg->rh_lockspace); + ret += msglen; + offset += msglen; + offset &= (limit - 1); + len -= msglen; + network_to_host(msg); + + if ((msg->rh_cmd > 32) || + (msg->rh_cmd == 0) || + (msg->rh_length < sizeof(struct dlm_header)) || + (msg->rh_length > dlm_config.buffer_size)) { + + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, " + "lkid=%u, lockspace=%u\n", + msg->rh_cmd, msg->rh_flags, msg->rh_length, + msg->rh_lkid, msg->rh_lockspace); + + printk("dlm: midcomms: base=%p, offset=%u, len=%u, " + "ret=%u, limit=%08x newbuf=%d\n", + base, offset, len, ret, limit, + ((struct dlm_header *) __tmp == msg)); + + khexdump((const unsigned char *) msg, msg->rh_length); + + return -EBADMSG; + } + + switch (msg->rh_cmd) { + case GDLM_REMCMD_RECOVERMESSAGE: + case GDLM_REMCMD_RECOVERREPLY: + process_recovery_comm(nodeid, msg); + break; + default: + process_cluster_request(nodeid, msg, FALSE); + } + } + + if (msg != (struct dlm_header *) __tmp) + kfree(msg); + + return err ? err : ret; +} + +/* + * Send a lowcomms buffer + */ + +void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e) +{ + host_to_network(msg); + lowcomms_commit_buffer(e); +} + +/* + * Make the message into network byte order and send it + */ + +int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg, + int allocation) +{ + int len = msg->rh_length; + + host_to_network(msg); + + /* + * Loopback. In fact, the locking code pretty much prevents this from + * being needed but it can happen when the directory node is also the + * local node. + */ + + if (nodeid == our_nodeid()) + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0, + len, len); + + return lowcomms_send_message(nodeid, (char *) msg, len, allocation); +} diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/midcomms.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,24 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __MIDCOMMS_DOT_H__ +#define __MIDCOMMS_DOT_H__ + +int midcomms_send_message(uint32_t csid, struct dlm_header *msg, + int allocation); +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset, + unsigned len, unsigned limit); +void midcomms_send_buffer(struct dlm_header *msg, + struct writequeue_entry *e); + +#endif /* __MIDCOMMS_DOT_H__ */ diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/nodes.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,347 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "nodes.h" +#include "recover.h" +#include "reccomms.h" +#include "util.h" + +static struct list_head cluster_nodes; +static spinlock_t node_lock; + + +void dlm_nodes_init(void) +{ + INIT_LIST_HEAD(&cluster_nodes); + spin_lock_init(&node_lock); +} + +static struct dlm_node *search_node(uint32_t nodeid) +{ + struct dlm_node *node; + + list_for_each_entry(node, &cluster_nodes, list) { + if (node->nodeid == nodeid) + goto out; + } + node = NULL; + out: + return node; +} + +static void put_node(struct dlm_node *node) +{ + spin_lock(&node_lock); + if (atomic_dec_and_test(&node->refcount)) { + lowcomms_close(node->nodeid); + list_del(&node->list); + spin_unlock(&node_lock); + kfree(node); + return; + } + spin_unlock(&node_lock); +} + +static int get_node(uint32_t nodeid, struct dlm_node **ndp) +{ + struct dlm_node *node, *node2; + int error = -ENOMEM; + + spin_lock(&node_lock); + node = search_node(nodeid); + if (node) + atomic_inc(&node->refcount); + spin_unlock(&node_lock); + + if (node) + goto out; + + node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL); + if (!node) + goto fail; + + memset(node, 0, sizeof(struct dlm_node)); + node->nodeid = nodeid; + + spin_lock(&node_lock); + node2 = search_node(nodeid); + if (node2) { + atomic_inc(&node2->refcount); + spin_unlock(&node_lock); + kfree(node); + node = node2; + goto out; + } + + atomic_set(&node->refcount, 1); + list_add_tail(&node->list, &cluster_nodes); + spin_unlock(&node_lock); + + out: + *ndp = node; + return 0; + fail: + return error; +} + +int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb) +{ + struct dlm_csb *csb; + struct dlm_node *node; + int error = -ENOMEM; + + csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL); + if (!csb) + goto fail; + + memset(csb, 0, sizeof(struct dlm_csb)); + + error = get_node(nodeid, &node); + if (error) + goto fail_free; + + csb->node = node; + *ret_csb = csb; + return 0; + + fail_free: + kfree(csb); + fail: + return error; +} + +void release_csb(struct dlm_csb *csb) 
+{ + put_node(csb->node); + kfree(csb); +} + +uint32_t our_nodeid(void) +{ + return lowcomms_our_nodeid(); +} + +static void make_node_array(struct dlm_ls *ls) +{ + struct dlm_csb *csb; + uint32_t *array; + int i = 0; + + if (ls->ls_node_array) { + kfree(ls->ls_node_array); + ls->ls_node_array = NULL; + } + + array = kmalloc(sizeof(uint32_t) * ls->ls_num_nodes, GFP_KERNEL); + if (!array) + return; + + list_for_each_entry(csb, &ls->ls_nodes, list) + array[i++] = csb->node->nodeid; + + ls->ls_node_array = array; +} + +int nodes_reconfig_wait(struct dlm_ls *ls) +{ + int error; + + if (ls->ls_low_nodeid == our_nodeid()) { + error = dlm_wait_status_all(ls, NODES_VALID); + if (!error) + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags); + + /* Experimental: this delay should allow any final messages + * from the previous node to be received before beginning + * recovery. */ + + if (ls->ls_num_nodes == 1) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout((2) * HZ); + } + + } else + error = dlm_wait_status_low(ls, NODES_ALL_VALID); + + return error; +} + +static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new) +{ + struct dlm_csb *csb = NULL; + struct list_head *tmp; + struct list_head *newlist = &new->list; + struct list_head *head = &ls->ls_nodes; + + list_for_each(tmp, head) { + csb = list_entry(tmp, struct dlm_csb, list); + + if (new->node->nodeid < csb->node->nodeid) + break; + } + + if (!csb) + list_add_tail(newlist, head); + else { + /* FIXME: can use list macro here */ + newlist->prev = tmp->prev; + newlist->next = tmp; + tmp->prev->next = newlist; + tmp->prev = newlist; + } +} + +int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) +{ + struct dlm_csb *csb, *safe; + int error, i, found, pos = 0, neg = 0; + uint32_t low = (uint32_t) (-1); + + /* + * Remove (and save) departed nodes from lockspace's nodes list + */ + + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) { + found = FALSE; + for (i = 0; i < rv->node_count; i++) { + if (csb->node->nodeid == rv->nodeids[i]) { + found = TRUE; + break; + } + } + + if (!found) { + neg++; + csb->gone_event = rv->event_id; + list_del(&csb->list); + list_add_tail(&csb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; + log_all(ls, "remove node %u", csb->node->nodeid); + } + } + + /* + * Add new nodes to lockspace's nodes list + */ + + for (i = 0; i < rv->node_count; i++) { + found = FALSE; + list_for_each_entry(csb, &ls->ls_nodes, list) { + if (csb->node->nodeid == rv->nodeids[i]) { + found = TRUE; + break; + } + } + + if (!found) { + pos++; + + error = init_new_csb(rv->nodeids[i], &csb); + DLM_ASSERT(!error,); + + add_ordered_node(ls, csb); + ls->ls_num_nodes++; + log_all(ls, "add node %u", csb->node->nodeid); + } + } + + list_for_each_entry(csb, &ls->ls_nodes, list) { + if (csb->node->nodeid < low) + low = csb->node->nodeid; + } + + ls->ls_low_nodeid = low; + set_bit(LSFL_NODES_VALID, &ls->ls_flags); + *neg_out = neg; + make_node_array(ls); + + error = nodes_reconfig_wait(ls); + + log_all(ls, "total nodes %d", ls->ls_num_nodes); + + return error; +} + +static void nodes_clear(struct list_head *head) +{ + struct dlm_csb *csb; + + while (!list_empty(head)) { + csb = list_entry(head->next, struct dlm_csb, list); + list_del(&csb->list); + release_csb(csb); + } +} + +void ls_nodes_clear(struct dlm_ls *ls) +{ + nodes_clear(&ls->ls_nodes); + ls->ls_num_nodes = 0; +} + +void ls_nodes_gone_clear(struct dlm_ls *ls) +{ + nodes_clear(&ls->ls_nodes_gone); +} + +int ls_nodes_init(struct dlm_ls *ls, struct 
dlm_recover *rv) +{ + struct dlm_csb *csb; + int i, error; + uint32_t low = (uint32_t) (-1); + + /* nodes may be left from a previous failed start */ + ls_nodes_clear(ls); + + log_all(ls, "add nodes"); + + for (i = 0; i < rv->node_count; i++) { + error = init_new_csb(rv->nodeids[i], &csb); + if (error) + goto fail; + + add_ordered_node(ls, csb); + ls->ls_num_nodes++; + + if (csb->node->nodeid < low) + low = csb->node->nodeid; + } + + ls->ls_low_nodeid = low; + set_bit(LSFL_NODES_VALID, &ls->ls_flags); + make_node_array(ls); + + error = nodes_reconfig_wait(ls); + + log_all(ls, "total nodes %d", ls->ls_num_nodes); + return error; + fail: + ls_nodes_clear(ls); + return error; +} + +int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid) +{ + struct dlm_csb *csb; + + list_for_each_entry(csb, &ls->ls_nodes_gone, list) { + if (csb->node->nodeid == nodeid) + return TRUE; + } + return FALSE; +} diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/nodes.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,27 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __NODES_DOT_H__ +#define __NODES_DOT_H__ + +void dlm_nodes_init(void); +int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb); +void release_csb(struct dlm_csb * csb); +uint32_t our_nodeid(void); +int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg); +int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr); +int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid); +void ls_nodes_clear(struct dlm_ls *ls); +void ls_nodes_gone_clear(struct dlm_ls *ls); + +#endif /* __NODES_DOT_H__ */ diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/proc.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,652 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lockspace.h" + +#if defined(DLM_DEBUG) +#define DLM_DEBUG_SIZE (1024) +#define MAX_DEBUG_MSG_LEN (64) +#else +#define DLM_DEBUG_SIZE (0) +#define MAX_DEBUG_MSG_LEN (0) +#endif + +static char * debug_buf; +static unsigned int debug_size; +static unsigned int debug_point; +static int debug_wrap; +static spinlock_t debug_lock; +static struct proc_dir_entry * debug_proc_entry = NULL; +static char proc_ls_name[255] = ""; + +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS +static struct proc_dir_entry * locks_proc_entry = NULL; +static struct seq_operations locks_info_op; +static struct proc_dir_entry * dir_proc_entry = NULL; +static struct seq_operations dir_info_op; + + +/* + * /proc/cluster/dlm_locks - dump resources and locks + */ + +static int locks_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &locks_info_op); +} + +/* Write simply sets the lockspace to use */ +static ssize_t locks_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + if (count < sizeof(proc_ls_name)) { + copy_from_user(proc_ls_name, buf, count); + proc_ls_name[count] = '\0'; + + /* Remove any trailing LF so that lazy users + can just echo "lsname" > /proc/cluster/dlm_locks */ + if (proc_ls_name[count - 1] == '\n') + proc_ls_name[count - 1] = '\0'; + + return count; + } + return 0; +} + +static struct file_operations locks_fops = { + open:locks_open, + write:locks_write, + read:seq_read, + llseek:seq_lseek, + release:seq_release, +}; + +struct ls_dumpinfo { + int entry; + struct list_head *next; + struct dlm_ls *ls; + struct dlm_rsb *rsb; + struct dlm_direntry *de; +}; + +static int print_resource(struct dlm_rsb * res, struct seq_file *s); + +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di) +{ + int i; + + if (!di->next) { + /* Find the next non-empty hash bucket */ + for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) { + read_lock(&di->ls->ls_rsbtbl[i].lock); + if (!list_empty(&di->ls->ls_rsbtbl[i].list)) { + di->next = di->ls->ls_rsbtbl[i].list.next; + read_unlock(&di->ls->ls_rsbtbl[i].lock); + break; + } + read_unlock(&di->ls->ls_rsbtbl[i].lock); + } + di->entry = i; + + if (di->entry >= di->ls->ls_rsbtbl_size) + return NULL; /* End of hash list */ + } else { /* Find the next entry in the list */ + i = di->entry; + read_lock(&di->ls->ls_rsbtbl[i].lock); + di->next = di->next->next; + if (di->next->next == di->ls->ls_rsbtbl[i].list.next) { + /* End of list - move to next bucket */ + di->next = NULL; + di->entry++; + read_unlock(&di->ls->ls_rsbtbl[i].lock); + return next_rsb(di); /* do the top half of this conditional */ + } + read_unlock(&di->ls->ls_rsbtbl[i].lock); + } + di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain); + + return di; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct ls_dumpinfo *di; + struct dlm_ls *ls; + int i; + + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name)); + if (!ls) + return NULL; + + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL); + if (!di) + return NULL; + + if (*pos == 0) + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name); + + di->entry = 0; + di->next = NULL; + di->ls = ls; + di->de = NULL; + + for (i = 0; i < *pos; i++) + if (next_rsb(di) == NULL) + return NULL; + + return next_rsb(di); +} + +static void *s_next(struct 
seq_file *m, void *p, loff_t *pos) +{ + struct ls_dumpinfo *di = p; + + *pos += 1; + + return next_rsb(di); +} + +static int s_show(struct seq_file *m, void *p) +{ + struct ls_dumpinfo *di = p; + return print_resource(di->rsb, m); +} + +static void s_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static struct seq_operations locks_info_op = { + start:s_start, + next:s_next, + stop:s_stop, + show:s_show +}; + +static char *print_lockmode(int mode) +{ + switch (mode) { + case DLM_LOCK_IV: + return "--"; + case DLM_LOCK_NL: + return "NL"; + case DLM_LOCK_CR: + return "CR"; + case DLM_LOCK_CW: + return "CW"; + case DLM_LOCK_PR: + return "PR"; + case DLM_LOCK_PW: + return "PW"; + case DLM_LOCK_EX: + return "EX"; + default: + return "??"; + } +} + +static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) +{ + + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); + + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + || lkb->lkb_status == GDLM_LKSTS_WAITING) + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); + + if (lkb->lkb_range) { + /* This warns on Alpha. Tough. Only I see it */ + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + || lkb->lkb_status == GDLM_LKSTS_GRANTED) + seq_printf(s, " %" PRIx64 "-%" PRIx64, + lkb->lkb_range[GR_RANGE_START], + lkb->lkb_range[GR_RANGE_END]); + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + || lkb->lkb_status == GDLM_LKSTS_WAITING) + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")", + lkb->lkb_range[RQ_RANGE_START], + lkb->lkb_range[RQ_RANGE_END]); + } + + if (lkb->lkb_nodeid) { + if (lkb->lkb_nodeid != res->res_nodeid) + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, + lkb->lkb_remid); + else + seq_printf(s, " Master: %08x", lkb->lkb_remid); + } + + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) + seq_printf(s, " LQ: %d,0x%x", lkb->lkb_lockqueue_state, + lkb->lkb_lockqueue_flags); + + seq_printf(s, "\n"); +} + +static int print_resource(struct dlm_rsb *res, struct seq_file *s) +{ + int i; + struct list_head *locklist; + + seq_printf(s, "\nResource %p (parent %p). 
Name (len=%d) \"", res, + res->res_parent, res->res_length); + for (i = 0; i < res->res_length; i++) { + if (isprint(res->res_name[i])) + seq_printf(s, "%c", res->res_name[i]); + else + seq_printf(s, "%c", '.'); + } + if (res->res_nodeid) + seq_printf(s, "\" \nLocal Copy, Master is node %d\n", + res->res_nodeid); + else + seq_printf(s, "\" \nMaster Copy\n"); + + /* Print the LVB: */ + if (res->res_lvbptr) { + seq_printf(s, "LVB: "); + for (i = 0; i < DLM_LVB_LEN; i++) { + if (i == DLM_LVB_LEN / 2) + seq_printf(s, "\n "); + seq_printf(s, "%02x ", + (unsigned char) res->res_lvbptr[i]); + } + seq_printf(s, "\n"); + } + + /* Print the locks attached to this resource */ + seq_printf(s, "Granted Queue\n"); + list_for_each(locklist, &res->res_grantqueue) { + struct dlm_lkb *this_lkb = + list_entry(locklist, struct dlm_lkb, lkb_statequeue); + print_lock(s, this_lkb, res); + } + + seq_printf(s, "Conversion Queue\n"); + list_for_each(locklist, &res->res_convertqueue) { + struct dlm_lkb *this_lkb = + list_entry(locklist, struct dlm_lkb, lkb_statequeue); + print_lock(s, this_lkb, res); + } + + seq_printf(s, "Waiting Queue\n"); + list_for_each(locklist, &res->res_waitqueue) { + struct dlm_lkb *this_lkb = + list_entry(locklist, struct dlm_lkb, lkb_statequeue); + print_lock(s, this_lkb, res); + } + + return 0; +} + + +/* + * /proc/cluster/dlm_dir - dump resource directory + */ + +static int print_de(struct dlm_direntry *de, struct seq_file *s) +{ + char strname[DLM_RESNAME_MAXLEN+1]; + + memset(strname, 0, DLM_RESNAME_MAXLEN+1); + memcpy(strname, de->name, de->length); + + seq_printf(s, "%s %u\n", strname, de->master_nodeid); + return 0; +} + +static int dir_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dir_info_op); +} + +static ssize_t dir_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) +{ + return locks_write(file, buf, count, ppos); +} + +static struct file_operations dir_fops = { + .open = dir_open, + .write = dir_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static struct ls_dumpinfo *next_de(struct ls_dumpinfo *di) +{ + int i; + + if (!di->next) { + /* Find the next non-empty hash bucket */ + for (i = di->entry; i < di->ls->ls_dirtbl_size; i++) { + read_lock(&di->ls->ls_dirtbl[i].lock); + if (!list_empty(&di->ls->ls_dirtbl[i].list)) { + di->next = di->ls->ls_dirtbl[i].list.next; + read_unlock(&di->ls->ls_dirtbl[i].lock); + break; + } + read_unlock(&di->ls->ls_dirtbl[i].lock); + } + di->entry = i; + + if (di->entry >= di->ls->ls_dirtbl_size) + return NULL; /* End of hash list */ + } else { /* Find the next entry in the list */ + i = di->entry; + read_lock(&di->ls->ls_dirtbl[i].lock); + di->next = di->next->next; + if (di->next->next == di->ls->ls_dirtbl[i].list.next) { + /* End of list - move to next bucket */ + di->next = NULL; + di->entry++; + read_unlock(&di->ls->ls_dirtbl[i].lock); + return next_de(di); /* do the top half of this conditional */ + } + read_unlock(&di->ls->ls_dirtbl[i].lock); + } + di->de = list_entry(di->next, struct dlm_direntry, list); + + return di; +} + +static void *dir_start(struct seq_file *m, loff_t *pos) +{ + struct ls_dumpinfo *di; + struct dlm_ls *ls; + int i; + + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name)); + if (!ls) + return NULL; + + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL); + if (!di) + return NULL; + + if (*pos == 0) + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name); + + di->entry = 0; + di->next = NULL; + di->ls = 
ls; + + for (i = 0; i < *pos; i++) + if (next_de(di) == NULL) + return NULL; + + return next_de(di); +} + +static void *dir_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct ls_dumpinfo *di = p; + + *pos += 1; + + return next_de(di); +} + +static int dir_show(struct seq_file *m, void *p) +{ + struct ls_dumpinfo *di = p; + return print_de(di->de, m); +} + +static void dir_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static struct seq_operations dir_info_op = { + .start = dir_start, + .next = dir_next, + .stop = dir_stop, + .show = dir_show, +}; +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */ + +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...) +{ + va_list va; + int i, n, size, len; + char buf[MAX_DEBUG_MSG_LEN+1]; + + spin_lock(&debug_lock); + + if (!debug_buf) + goto out; + + size = MAX_DEBUG_MSG_LEN; + memset(buf, 0, size+1); + + n = snprintf(buf, size, "%s ", ls->ls_name); + size -= n; + + va_start(va, fmt); + vsnprintf(buf+n, size, fmt, va); + va_end(va); + + len = strlen(buf); + if (len > MAX_DEBUG_MSG_LEN-1) + len = MAX_DEBUG_MSG_LEN-1; + buf[len] = '\n'; + buf[len+1] = '\0'; + + for (i = 0; i < strlen(buf); i++) { + debug_buf[debug_point++] = buf[i]; + + if (debug_point == debug_size) { + debug_point = 0; + debug_wrap = 1; + } + } + out: + spin_unlock(&debug_lock); +} + +void dlm_debug_dump(void) +{ + int i; + + spin_lock(&debug_lock); + if (debug_wrap) { + for (i = debug_point; i < debug_size; i++) + printk("%c", debug_buf[i]); + } + for (i = 0; i < debug_point; i++) + printk("%c", debug_buf[i]); + spin_unlock(&debug_lock); +} + +void dlm_debug_setup(int size) +{ + char *b = NULL; + + if (size > PAGE_SIZE) + size = PAGE_SIZE; + if (size) + b = kmalloc(size, GFP_KERNEL); + + spin_lock(&debug_lock); + if (debug_buf) + kfree(debug_buf); + if (!size || !b) + goto out; + debug_size = size; + debug_point = 0; + debug_wrap = 0; + debug_buf = b; + memset(debug_buf, 0, debug_size); + out: + spin_unlock(&debug_lock); +} + +static void dlm_debug_init(void) +{ + debug_buf = NULL; + debug_size = 0; + debug_point = 0; + debug_wrap = 0; + spin_lock_init(&debug_lock); + + dlm_debug_setup(DLM_DEBUG_SIZE); +} + +#ifdef CONFIG_PROC_FS +int dlm_debug_info(char *b, char **start, off_t offset, int length) +{ + int i, n = 0; + + spin_lock(&debug_lock); + + if (debug_wrap) { + for (i = debug_point; i < debug_size; i++) + n += sprintf(b + n, "%c", debug_buf[i]); + } + for (i = 0; i < debug_point; i++) + n += sprintf(b + n, "%c", debug_buf[i]); + + spin_unlock(&debug_lock); + + return n; +} +#endif + +#ifdef CONFIG_DLM_STATS +struct dlm_statinfo dlm_stats; +static struct proc_dir_entry *stats_proc_entry = NULL; +static int dlm_stats_info(char *b, char **start, off_t offset, int length) +{ + int n=0; + int i; + long lq_locks = 0; + unsigned long lq_time = 0; + + n += sprintf(b+n, "DLM stats (HZ=%d)\n\n", HZ); + n += sprintf(b+n, "Lock operations: %7d\n", dlm_stats.lockops); + n += sprintf(b+n, "Unlock operations: %7d\n", dlm_stats.unlockops); + n += sprintf(b+n, "Convert operations: %7d\n", dlm_stats.convertops); + n += sprintf(b+n, "Completion ASTs: %7d\n", dlm_stats.cast); + n += sprintf(b+n, "Blocking ASTs: %7d\n", dlm_stats.bast); + n += sprintf(b+n, "\n"); + n += sprintf(b+n, "Lockqueue num waittime ave\n"); + for (i=1; i<=4 ; i++) { + char *lq_reason="???"; + switch (i){ + case 1: lq_reason = "WAIT_RSB "; + break; + case 2: lq_reason = "WAIT_CONV "; + break; + case 3: lq_reason = "WAIT_GRANT "; + break; + case 4: lq_reason = "WAIT_UNLOCK"; + break; + } + if 
(dlm_stats.lockqueue_locks[i]) + n += sprintf(b+n, "%s %6lu %7lu %3lu\n", + lq_reason, + dlm_stats.lockqueue_locks[i], + dlm_stats.lockqueue_time[i], + dlm_stats.lockqueue_time[i]/ + dlm_stats.lockqueue_locks[i]); + + lq_locks += dlm_stats.lockqueue_locks[i]; + lq_time += dlm_stats.lockqueue_time[i]; + } + if (lq_locks) + n += sprintf(b+n, "Total %6lu %7lu %3lu\n", + lq_locks, lq_time, lq_time/lq_locks); + return n; +} + +static int dlm_stats_clear(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + memset(&dlm_stats, 0, sizeof(dlm_stats)); + return count; +} +#endif /* CONFIG_DLM_STATS */ + +void dlm_proc_init(void) +{ +#ifdef CONFIG_PROC_FS + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO, + NULL); + if (!debug_proc_entry) + return; + + debug_proc_entry->get_info = &dlm_debug_info; +#endif + +#ifdef CONFIG_DLM_STATS + stats_proc_entry = create_proc_entry("cluster/dlm_stats", + S_IRUSR | S_IWUSR, NULL); + if (!stats_proc_entry) + return; + + stats_proc_entry->get_info = &dlm_stats_info; + stats_proc_entry->write_proc = &dlm_stats_clear; +#endif + + dlm_debug_init(); + +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks", + S_IFREG | 0400, + NULL, NULL, NULL); + if (!locks_proc_entry) + return; + locks_proc_entry->proc_fops = &locks_fops; + + dir_proc_entry = create_proc_read_entry("cluster/dlm_dir", + S_IFREG | 0400, + NULL, NULL, NULL); + if (!dir_proc_entry) + return; + dir_proc_entry->proc_fops = &dir_fops; +#endif +} + +void dlm_proc_exit(void) +{ +#ifdef CONFIG_PROC_FS + if (debug_proc_entry) { + remove_proc_entry("cluster/dlm_debug", NULL); + dlm_debug_setup(0); + } +#endif + +#ifdef CONFIG_DLM_STATS + if (stats_proc_entry) + remove_proc_entry("cluster/dlm_stats", NULL); +#endif + +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS + if (locks_proc_entry) + remove_proc_entry("cluster/dlm_locks", NULL); + if (dir_proc_entry) + remove_proc_entry("cluster/dlm_dir", NULL); +#endif +} diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/queries.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,713 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * queries.c + * + * This file provides the kernel query interface to the DLM. + * + */ + +#define EXPORT_SYMTAB +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lockqueue.h" +#include "locking.h" +#include "lkb.h" +#include "nodes.h" +#include "dir.h" +#include "ast.h" +#include "memory.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "rsb.h" + +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo); +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo); + +/* + * API entry point. 
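+ * dlm_query() returns resource and/or lock information for the resource
+ * attached to an existing lock (lksb->sb_lkid). Results are delivered
+ * asynchronously through ast_routine, so a zero return only means the
+ * query was accepted.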
+ */ +int dlm_query(void *lockspace, + struct dlm_lksb *lksb, + int query, + struct dlm_queryinfo *qinfo, + void (ast_routine(void *)), + void *astarg) +{ + int status = -EINVAL; + struct dlm_lkb *target_lkb; + struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */ + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace); + + if (!ls) + return -EINVAL; + if (!qinfo) + goto out; + if (!ast_routine) + goto out; + if (!lksb) + goto out; + + if (!qinfo->gqi_lockinfo) + qinfo->gqi_locksize = 0; + + /* Find the lkid */ + target_lkb = find_lock_by_id(ls, lksb->sb_lkid); + if (!target_lkb) + goto out; + + /* If the user wants a list of locks that are blocking or + not blocking this lock, then it must be waiting + for something + */ + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING || + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) && + target_lkb->lkb_status == GDLM_LKSTS_GRANTED) + goto out; + + /* We now allocate an LKB for our own use (so we can hang + * things like the AST routine and the lksb from it) */ + lksb->sb_status = -EBUSY; + query_lkb = create_lkb(ls); + if (!query_lkb) { + status = -ENOMEM; + goto out; + } + query_lkb->lkb_astaddr = ast_routine; + query_lkb->lkb_astparam = (long)astarg; + query_lkb->lkb_resource = target_lkb->lkb_resource; + query_lkb->lkb_lksb = lksb; + + /* Don't free the resource while we are querying it. This ref + * will be dropped when the LKB is freed */ + hold_rsb(query_lkb->lkb_resource); + + /* Fill in the stuff that's always local */ + if (qinfo->gqi_resinfo) { + if (target_lkb->lkb_resource->res_nodeid) + qinfo->gqi_resinfo->rsi_masternode = + target_lkb->lkb_resource->res_nodeid; + else + qinfo->gqi_resinfo->rsi_masternode = our_nodeid(); + qinfo->gqi_resinfo->rsi_length = + target_lkb->lkb_resource->res_length; + memcpy(qinfo->gqi_resinfo->rsi_name, + target_lkb->lkb_resource->res_name, + qinfo->gqi_resinfo->rsi_length); + } + + /* If the master is local (or the user doesn't want the overhead of a + * remote call) - fill in the details here */ + if (target_lkb->lkb_resource->res_nodeid == 0 || + (query & DLM_QUERY_LOCAL)) { + + status = 0; + /* Resource info */ + if (qinfo->gqi_resinfo) { + query_resource(target_lkb->lkb_resource, + qinfo->gqi_resinfo); + } + + /* Lock lists */ + if (qinfo->gqi_lockinfo) { + status = query_locks(query, target_lkb, qinfo); + } + + query_lkb->lkb_retstatus = status; + queue_ast(query_lkb, AST_COMP | AST_DEL, 0); + wake_astd(); + + /* An AST will be delivered so we must return success here */ + status = 0; + goto out; + } + + /* Remote master */ + if (target_lkb->lkb_resource->res_nodeid != 0) + { + struct dlm_query_request *remquery; + struct writequeue_entry *e; + + /* Clear this cos the receiving end adds to it with + each incoming packet */ + qinfo->gqi_lockcount = 0; + + /* Squirrel a pointer to the query info struct + somewhere illegal */ + query_lkb->lkb_request = (struct dlm_request *) qinfo; + + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid, + sizeof(struct dlm_query_request), + ls->ls_allocation, + (char **) &remquery); + if (!e) { + status = -ENOBUFS; + goto out; + } + + /* Build remote packet */ + memset(remquery, 0, sizeof(struct dlm_query_request)); + + remquery->rq_maxlocks = qinfo->gqi_locksize; + remquery->rq_query = query; + remquery->rq_mstlkid = target_lkb->lkb_remid; + if (qinfo->gqi_lockinfo) + remquery->rq_maxlocks = qinfo->gqi_locksize; + + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY; + remquery->rq_header.rh_flags = 0; + remquery->rq_header.rh_length = 
sizeof(struct dlm_query_request); + remquery->rq_header.rh_lkid = query_lkb->lkb_id; + remquery->rq_header.rh_lockspace = ls->ls_global_id; + + midcomms_send_buffer(&remquery->rq_header, e); + status = 0; + } + + out: + put_lockspace(ls); + return status; +} + +static inline int valid_range(struct dlm_range *r) +{ + if (r->ra_start != 0ULL || + r->ra_end != 0xFFFFFFFFFFFFFFFFULL) + return 1; + else + return 0; +} + +static void put_int(int x, char *buf, int *offp) +{ + x = cpu_to_le32(x); + memcpy(buf + *offp, &x, sizeof(int)); + *offp += sizeof(int); +} + +static void put_int64(uint64_t x, char *buf, int *offp) +{ + x = cpu_to_le64(x); + memcpy(buf + *offp, &x, sizeof(uint64_t)); + *offp += sizeof(uint64_t); +} + +static int get_int(char *buf, int *offp) +{ + int value; + memcpy(&value, buf + *offp, sizeof(int)); + *offp += sizeof(int); + return le32_to_cpu(value); +} + +static uint64_t get_int64(char *buf, int *offp) +{ + uint64_t value; + + memcpy(&value, buf + *offp, sizeof(uint64_t)); + *offp += sizeof(uint64_t); + return le64_to_cpu(value); +} + +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4) + +/* Called from recvd to get lock info for a remote node */ +int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg) +{ + struct dlm_query_request *query = (struct dlm_query_request *) msg; + struct dlm_query_reply *reply; + struct dlm_resinfo resinfo; + struct dlm_queryinfo qinfo; + struct writequeue_entry *e; + char *buf; + struct dlm_lkb *lkb; + int status = 0; + int bufidx; + int finished = 0; + int cur_lock = 0; + int start_lock = 0; + + lkb = find_lock_by_id(ls, query->rq_mstlkid); + if (!lkb) { + status = -EINVAL; + goto send_error; + } + + qinfo.gqi_resinfo = &resinfo; + qinfo.gqi_locksize = query->rq_maxlocks; + + /* Get the resource bits */ + query_resource(lkb->lkb_resource, &resinfo); + + /* Now get the locks if wanted */ + if (query->rq_maxlocks) { + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks, + GFP_KERNEL); + if (!qinfo.gqi_lockinfo) { + status = -ENOMEM; + goto send_error; + } + + status = query_locks(query->rq_query, lkb, &qinfo); + if (status && status != -E2BIG) { + kfree(qinfo.gqi_lockinfo); + goto send_error; + } + } + else { + qinfo.gqi_lockinfo = NULL; + qinfo.gqi_lockcount = 0; + } + + /* Send as many blocks as needed for all the locks */ + do { + int i; + int msg_len = sizeof(struct dlm_query_reply); + int last_msg_len = msg_len; /* keeps compiler quiet */ + int last_lock; + + /* First work out how many locks we can fit into a block */ + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) { + + last_msg_len = msg_len; + + msg_len += LOCK_LEN; + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) || + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) { + + msg_len += sizeof(uint64_t) * 4; + } + } + + /* There must be a neater way of doing this... 
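+ * (the loop above counts how many locks fit in a PAGE_SIZE message; when
+ * the last lock pushes msg_len over the limit we fall back to the
+ * previous count and length)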
*/ + if (msg_len > PAGE_SIZE) { + last_lock = i-1; + msg_len = last_msg_len; + } + else { + last_lock = i; + } + + e = lowcomms_get_buffer(nodeid, + msg_len, + ls->ls_allocation, + (char **) &reply); + if (!e) { + kfree(qinfo.gqi_lockinfo); + status = -ENOBUFS; + goto out; + } + + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY; + reply->rq_header.rh_length = msg_len; + reply->rq_header.rh_lkid = msg->rh_lkid; + reply->rq_header.rh_lockspace = msg->rh_lockspace; + + reply->rq_status = status; + reply->rq_startlock = cur_lock; + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount; + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount; + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount; + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN); + + buf = (char *)reply; + bufidx = sizeof(struct dlm_query_reply); + + for (; cur_lock < last_lock; cur_lock++) { + + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state; + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode; + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode; + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx); + put_int(qinfo.gqi_lockinfo[cur_lock].lki_ownpid, buf, &bufidx); + + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) || + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) { + + buf[bufidx++] = 1; + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx); + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx); + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx); + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx); + } + else { + buf[bufidx++] = 0; + } + } + + if (cur_lock == qinfo.gqi_lockcount) { + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; + finished = 1; + } + else { + reply->rq_header.rh_flags = 0; + } + + reply->rq_numlocks = cur_lock - start_lock; + start_lock = cur_lock; + + midcomms_send_buffer(&reply->rq_header, e); + } while (!finished); + + kfree(qinfo.gqi_lockinfo); + out: + return status; + + send_error: + e = lowcomms_get_buffer(nodeid, + sizeof(struct dlm_query_reply), + ls->ls_allocation, + (char **) &reply); + if (!e) { + status = -ENOBUFS; + goto out; + } + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY; + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; + reply->rq_header.rh_length = sizeof(struct dlm_query_reply); + reply->rq_header.rh_lkid = msg->rh_lkid; + reply->rq_header.rh_lockspace = msg->rh_lockspace; + reply->rq_status = status; + reply->rq_numlocks = 0; + reply->rq_startlock = 0; + reply->rq_grantcount = 0; + reply->rq_convcount = 0; + reply->rq_waitcount = 0; + + midcomms_send_buffer(&reply->rq_header, e); + + return status; +} + +/* Reply to a remote query */ +int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg) +{ + struct dlm_lkb *query_lkb; + struct dlm_queryinfo *qinfo; + struct dlm_query_reply *reply; + char *buf; + int i; + int bufidx; + + query_lkb = find_lock_by_id(ls, msg->rh_lkid); + if (!query_lkb) + return -EINVAL; + + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request; + reply = (struct dlm_query_reply *) msg; + + /* Copy the easy bits first */ + qinfo->gqi_lockcount += reply->rq_numlocks; + if (qinfo->gqi_resinfo) { + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount; + 
qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount; + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount; + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk, + DLM_LVB_LEN); + } + + /* Now unpack the locks */ + bufidx = sizeof(struct dlm_query_reply); + buf = (char *) msg; + + DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize, + printk("start = %d, num + %d. Max= %d\n", + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize);); + + for (i = reply->rq_startlock; + i < reply->rq_startlock + reply->rq_numlocks; i++) { + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++]; + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++]; + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++]; + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_ownpid = get_int(buf, &bufidx); + if (buf[bufidx++]) { + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx); + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx); + } + else { + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL; + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL; + } + } + + /* If this was the last block then now tell the user */ + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) { + query_lkb->lkb_retstatus = reply->rq_status; + queue_ast(query_lkb, AST_COMP | AST_DEL, 0); + wake_astd(); + } + + return 0; +} + +/* Aggregate resource information */ +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo) +{ + struct list_head *tmp; + + if (rsb->res_lvbptr) + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN); + + down_read(&rsb->res_lock); + resinfo->rsi_grantcount = 0; + list_for_each(tmp, &rsb->res_grantqueue) { + resinfo->rsi_grantcount++; + } + + resinfo->rsi_waitcount = 0; + list_for_each(tmp, &rsb->res_waitqueue) { + resinfo->rsi_waitcount++; + } + + resinfo->rsi_convcount = 0; + list_for_each(tmp, &rsb->res_convertqueue) { + resinfo->rsi_convcount++; + } + up_read(&rsb->res_lock); + + return 0; +} + +static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo) +{ + int entry; + + /* Don't fill it in if the buffer is full */ + if (qinfo->gqi_lockcount == qinfo->gqi_locksize) + return -E2BIG; + + /* gqi_lockcount contains the number of locks we have returned */ + entry = qinfo->gqi_lockcount++; + + /* Fun with master copies */ + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) { + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid; + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id; + } + else { + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id; + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid; + } + + /* Also make sure we always have a valid nodeid in there, the + calling end may not know which node "0" is */ + if (lkb->lkb_nodeid) + qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid; + else + qinfo->gqi_lockinfo[entry].lki_node = our_nodeid(); + + if (lkb->lkb_parent) + qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id; + else + qinfo->gqi_lockinfo[entry].lki_parent = 0; + + qinfo->gqi_lockinfo[entry].lki_state = 
lkb->lkb_status; + qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode; + qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode; + qinfo->gqi_lockinfo[entry].lki_ownpid = lkb->lkb_ownpid; + + if (lkb->lkb_range) { + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = + lkb->lkb_range[GR_RANGE_START]; + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = + lkb->lkb_range[GR_RANGE_END]; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = + lkb->lkb_range[RQ_RANGE_START]; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = + lkb->lkb_range[RQ_RANGE_END]; + } else { + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL; + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL; + } + return 0; +} + +static int query_lkb_queue(struct dlm_rsb *rsb, + struct list_head *queue, int query, + struct dlm_queryinfo *qinfo) +{ + struct list_head *tmp; + int status = 0; + int mode = query & DLM_QUERY_MODE_MASK; + + down_read(&rsb->res_lock); + list_for_each(tmp, queue) { + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + int lkmode; + + if (query & DLM_QUERY_RQMODE) + lkmode = lkb->lkb_rqmode; + else + lkmode = lkb->lkb_grmode; + + /* Add the LKB info to the list if it matches the criteria in + * the query bitmap */ + switch (query & DLM_QUERY_MASK) { + case DLM_QUERY_LOCKS_ALL: + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_HIGHER: + if (lkmode > mode) + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_EQUAL: + if (lkmode == mode) + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_LOWER: + if (lkmode < mode) + status = add_lock(lkb, qinfo); + break; + + case DLM_QUERY_LOCKS_ORPHAN: + if (lkb->lkb_flags & GDLM_LKFLG_ORPHAN) + status = add_lock(lkb, qinfo); + break; + } + } + up_read(&rsb->res_lock); + return status; +} + +/* + * Return 1 if the locks' ranges overlap + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff + */ +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2) +{ + if (!lkb1->lkb_range || !lkb2->lkb_range) + return 1; + + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] || + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END]) + return 0; + + return 1; +} +extern const int __dlm_compat_matrix[8][8]; + + +static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo) +{ + struct list_head *tmp; + int status = 0; + + down_read(&qlkb->lkb_resource->res_lock); + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) { + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + + if (ranges_overlap(lkb, qlkb) && + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]) + status = add_lock(lkb, qinfo); + } + up_read(&qlkb->lkb_resource->res_lock); + + return status; +} + +static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo) +{ + struct list_head *tmp; + int status = 0; + + down_read(&qlkb->lkb_resource->res_lock); + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) { + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + + if (!(ranges_overlap(lkb, qlkb) && + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])) + status = add_lock(lkb, qinfo); + } + up_read(&qlkb->lkb_resource->res_lock); + + return status; +} + +/* Gather a list of appropriate locks */ +static int query_locks(int query,
struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo) +{ + int status = 0; + + + /* Mask in the actual granted/requsted mode of the lock if LOCK_THIS + * was requested as the mode + */ + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) { + query &= ~DLM_QUERY_MODE_MASK; + if (query & DLM_QUERY_RQMODE) + query |= lkb->lkb_rqmode; + else + query |= lkb->lkb_grmode; + } + + qinfo->gqi_lockcount = 0; + + /* BLOCKING/NOTBLOCK only look at the granted queue */ + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING) + return get_blocking_locks(lkb, qinfo); + + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) + return get_nonblocking_locks(lkb, qinfo); + + /* Do the lock queues that were requested */ + if (query & DLM_QUERY_QUEUE_GRANT) { + status = query_lkb_queue(lkb->lkb_resource, + &lkb->lkb_resource->res_grantqueue, + query, qinfo); + } + + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) { + status = query_lkb_queue(lkb->lkb_resource, + &lkb->lkb_resource->res_convertqueue, + query, qinfo); + } + + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) { + status = query_lkb_queue(lkb->lkb_resource, + &lkb->lkb_resource->res_waitqueue, + query, qinfo); + } + + + return status; +} + +EXPORT_SYMBOL(dlm_query); +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/queries.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,20 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __QUERIES_DOT_H__ +#define __QUERIES_DOT_H__ + +extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg); +extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg); + +#endif /* __QUERIES_DOT_H__ */ diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rebuild.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,1280 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +/* + * Rebuild RSB's on new masters. Functions for transferring locks and + * subresources to new RSB masters during recovery. + */ + +#include "dlm_internal.h" +#include "reccomms.h" +#include "lkb.h" +#include "rsb.h" +#include "nodes.h" +#include "config.h" +#include "memory.h" +#include "recover.h" + + +/* Types of entity serialised in remastering messages */ +#define REMASTER_ROOTRSB 1 +#define REMASTER_RSB 2 +#define REMASTER_LKB 3 + +struct rcom_fill { + char * outbuf; /* Beginning of data */ + int offset; /* Current offset into outbuf */ + int maxlen; /* Max value of offset */ + int remasterid; + int count; + struct dlm_rsb * rsb; + struct dlm_rsb * subrsb; + struct dlm_lkb * lkb; + struct list_head * lkbqueue; + char more; +}; +typedef struct rcom_fill rcom_fill_t; + + +struct rebuild_node { + struct list_head list; + int nodeid; + struct dlm_rsb * rootrsb; +}; +typedef struct rebuild_node rebuild_node_t; + + +/* + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new + * master. The rsb will be "done" with recovery when the new master has + * replied with all the new remote lockid's for this rsb's lkb's. + */ + +void expect_new_lkids(struct dlm_rsb *rsb) +{ + rsb->res_newlkid_expect = 0; + recover_list_add(rsb); +} + +/* + * This function is called on root rsb or subrsb when another lkb is being sent + * to the new master for which we expect to receive a corresponding remote lkid + */ + +void need_new_lkid(struct dlm_rsb *rsb) +{ + struct dlm_rsb *root = rsb; + + if (rsb->res_parent) + root = rsb->res_root; + + if (!root->res_newlkid_expect) + recover_list_add(root); + else + DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),); + + root->res_newlkid_expect++; +} + +/* + * This function is called for each lkb for which a new remote lkid is + * received. Decrement the expected number of remote lkids expected for the + * root rsb. + */ + +void have_new_lkid(struct dlm_lkb *lkb) +{ + struct dlm_rsb *root = lkb->lkb_resource; + + if (root->res_parent) + root = root->res_root; + + down_write(&root->res_lock); + + DLM_ASSERT(root->res_newlkid_expect, + printk("newlkid_expect=%d\n", root->res_newlkid_expect);); + + root->res_newlkid_expect--; + + if (!root->res_newlkid_expect) { + clear_bit(RESFL_NEW_MASTER, &root->res_flags); + recover_list_del(root); + } + up_write(&root->res_lock); +} + +/* + * Return the rebuild struct for a node - will create an entry on the rootrsb + * list if necessary. + * + * Currently no locking is needed here as it all happens in the dlm_recvd + * thread + */ + +static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid) +{ + rebuild_node_t *node = NULL; + + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) { + if (node->nodeid == nodeid) + return node; + } + + /* Not found, add one */ + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL); + if (!node) + return NULL; + + node->nodeid = nodeid; + node->rootrsb = NULL; + list_add(&node->list, &ls->ls_rebuild_rootrsb_list); + + return node; +} + +/* + * Tidy up after a rebuild run. 
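+ * All rebuild_node entries left on ls_rebuild_rootrsb_list are freed.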
Called when all recovery has finished + */ + +void rebuild_freemem(struct dlm_ls *ls) +{ + rebuild_node_t *node = NULL, *s; + + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) { + list_del(&node->list); + kfree(node); + } +} + +static void put_int(int x, char *buf, int *offp) +{ + x = cpu_to_le32(x); + memcpy(buf + *offp, &x, sizeof(int)); + *offp += sizeof(int); +} + +static void put_int64(uint64_t x, char *buf, int *offp) +{ + x = cpu_to_le64(x); + memcpy(buf + *offp, &x, sizeof(uint64_t)); + *offp += sizeof(uint64_t); +} + +static void put_bytes(char *x, int len, char *buf, int *offp) +{ + put_int(len, buf, offp); + memcpy(buf + *offp, x, len); + *offp += len; +} + +static void put_char(char x, char *buf, int *offp) +{ + buf[*offp] = x; + *offp += 1; +} + +static int get_int(char *buf, int *offp) +{ + int value; + memcpy(&value, buf + *offp, sizeof(int)); + *offp += sizeof(int); + return le32_to_cpu(value); +} + +static uint64_t get_int64(char *buf, int *offp) +{ + uint64_t value; + + memcpy(&value, buf + *offp, sizeof(uint64_t)); + *offp += sizeof(uint64_t); + return le64_to_cpu(value); +} + +static char get_char(char *buf, int *offp) +{ + char x = buf[*offp]; + + *offp += 1; + return x; +} + +static void get_bytes(char *bytes, int *len, char *buf, int *offp) +{ + *len = get_int(buf, offp); + memcpy(bytes, buf + *offp, *len); + *offp += *len; +} + +static int lkb_length(struct dlm_lkb *lkb) +{ + int len = 0; + + len += sizeof(int); /* lkb_id */ + len += sizeof(int); /* lkb_resource->res_reamasterid */ + len += sizeof(int); /* lkb_flags */ + len += sizeof(int); /* lkb_status */ + len += sizeof(char); /* lkb_rqmode */ + len += sizeof(char); /* lkb_grmode */ + len += sizeof(int); /* lkb_childcnt */ + len += sizeof(int); /* lkb_parent->lkb_id */ + len += sizeof(int); /* lkb_bastaddr */ + len += sizeof(int); /* lkb_ownpid */ + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + len += sizeof(int); /* number of lvb bytes */ + len += DLM_LVB_LEN; + } + + if (lkb->lkb_range) { + len += sizeof(uint64_t); + len += sizeof(uint64_t); + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) { + len += sizeof(uint64_t); + len += sizeof(uint64_t); + } + } + + return len; +} + +/* + * It's up to the caller to be sure there's enough space in the buffer. 
+ */ + +static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp) +{ + int flags; + + /* Need to tell the remote end if we have a range */ + flags = lkb->lkb_flags; + if (lkb->lkb_range) + flags |= GDLM_LKFLG_RANGE; + + /* + * See lkb_length() + * Total: 30 (no lvb) or 66 (with lvb) bytes + */ + + put_int(lkb->lkb_id, buf, offp); + put_int(lkb->lkb_resource->res_remasterid, buf, offp); + put_int(flags, buf, offp); + put_int(lkb->lkb_status, buf, offp); + put_char(lkb->lkb_rqmode, buf, offp); + put_char(lkb->lkb_grmode, buf, offp); + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp); + + if (lkb->lkb_parent) + put_int(lkb->lkb_parent->lkb_id, buf, offp); + else + put_int(0, buf, offp); + + if (lkb->lkb_bastaddr) + put_int(1, buf, offp); + else + put_int(0, buf, offp); + put_int(lkb->lkb_ownpid, buf, offp); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + DLM_ASSERT(lkb->lkb_lvbptr,); + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp); + } + + /* Only send the range we actually need */ + if (lkb->lkb_range) { + switch (lkb->lkb_status) { + case GDLM_LKSTS_CONVERT: + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp); + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp); + break; + case GDLM_LKSTS_WAITING: + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp); + break; + case GDLM_LKSTS_GRANTED: + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp); + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp); + break; + default: + DLM_ASSERT(0,); + } + } +} + +static int rsb_length(struct dlm_rsb *rsb) +{ + int len = 0; + + len += sizeof(int); /* number of res_name bytes */ + len += rsb->res_length; /* res_name */ + len += sizeof(int); /* res_remasterid */ + len += sizeof(int); /* res_parent->res_remasterid */ + + return len; +} + +static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb) +{ + struct list_head *tmp; + struct dlm_rsb *r; + + tmp = subrsb->res_subreslist.next; + r = list_entry(tmp, struct dlm_rsb, res_subreslist); + + return r; +} + +static inline int last_in_list(struct dlm_rsb *r, struct list_head *head) +{ + struct dlm_rsb *last; + last = list_entry(head->prev, struct dlm_rsb, res_subreslist); + if (last == r) + return 1; + return 0; +} + +static int lkbs_to_remaster_list(struct list_head *head) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) + continue; + return TRUE; + } + return FALSE; +} + +/* + * Used to decide if an rsb should be rebuilt on a new master. An rsb only + * needs to be rebuild if we have lkb's queued on it. NOREBUILD lkb's are not + * rebuilt. 
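+ * Subresources are checked as well since their lkb's are sent along with
+ * the root rsb.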
+ */ + +static int lkbs_to_remaster(struct dlm_rsb *r) +{ + struct dlm_rsb *sub; + + if (lkbs_to_remaster_list(&r->res_grantqueue)) + return TRUE; + if (lkbs_to_remaster_list(&r->res_convertqueue)) + return TRUE; + if (lkbs_to_remaster_list(&r->res_waitqueue)) + return TRUE; + + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) { + if (lkbs_to_remaster_list(&sub->res_grantqueue)) + return TRUE; + if (lkbs_to_remaster_list(&sub->res_convertqueue)) + return TRUE; + if (lkbs_to_remaster_list(&sub->res_waitqueue)) + return TRUE; + } + + return FALSE; +} + +static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp) +{ + /* + * See rsb_length() + * Total: 36 bytes (4 + 24 + 4 + 4) + */ + + put_bytes(rsb->res_name, rsb->res_length, buf, offp); + put_int(rsb->res_remasterid, buf, offp); + + if (rsb->res_parent) + put_int(rsb->res_parent->res_remasterid, buf, offp); + else + put_int(0, buf, offp); + + DLM_ASSERT(!rsb->res_lvbptr,); +} + +/* + * Flatten an LKB into a buffer for sending to the new RSB master. As a + * side-effect the nodeid of the lock is set to the nodeid of the new RSB + * master. + */ + +static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, + rcom_fill_t *fill) +{ + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen) + goto nospace; + + lkb->lkb_nodeid = r->res_nodeid; + + put_char(REMASTER_LKB, fill->outbuf, &fill->offset); + serialise_lkb(lkb, fill->outbuf, &fill->offset); + + fill->count++; + need_new_lkid(r); + return 0; + + nospace: + return -ENOSPC; +} + +/* + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag. + */ + +static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue, + rcom_fill_t *fill) +{ + struct dlm_lkb *lkb; + int error; + + list_for_each_entry(lkb, queue, lkb_statequeue) { + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD) + continue; + + error = pack_one_lkb(r, lkb, fill); + if (error) + goto nospace; + } + + return 0; + + nospace: + fill->lkb = lkb; + fill->lkbqueue = queue; + + return error; +} + +static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill) +{ + int error; + + error = pack_lkb_queue(r, &r->res_grantqueue, fill); + if (error) + goto nospace; + + error = pack_lkb_queue(r, &r->res_convertqueue, fill); + if (error) + goto nospace; + + error = pack_lkb_queue(r, &r->res_waitqueue, fill); + + nospace: + return error; +} + +/* + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb + * queue and full lkb queues. + */ + +static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill) +{ + struct list_head *tmp, *start, *end; + struct dlm_lkb *lkb; + int error; + + /* + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue. + */ + + error = pack_one_lkb(r, fill->lkb, fill); + if (error) + goto out; + + start = fill->lkb->lkb_statequeue.next; + end = fill->lkbqueue; + + for (tmp = start; tmp != end; tmp = tmp->next) { + lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue); + + error = pack_one_lkb(r, lkb, fill); + if (error) { + fill->lkb = lkb; + goto out; + } + } + + /* + * Pack all lkb's on r's queues following fill->lkbqueue. 
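+ * Queues are always packed in grant, convert, wait order, so only the
+ * queues following the one we stopped in still need to be packed.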
+ */ + + if (fill->lkbqueue == &r->res_waitqueue) + goto out; + if (fill->lkbqueue == &r->res_convertqueue) + goto skip; + + DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,); + + error = pack_lkb_queue(r, &r->res_convertqueue, fill); + if (error) + goto out; + skip: + error = pack_lkb_queue(r, &r->res_waitqueue, fill); + + out: + return error; +} + +static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb, + rcom_fill_t *fill) +{ + int error; + + down_write(&subrsb->res_lock); + + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen) + goto nospace; + + subrsb->res_nodeid = rsb->res_nodeid; + subrsb->res_remasterid = ++fill->remasterid; + + put_char(REMASTER_RSB, fill->outbuf, &fill->offset); + serialise_rsb(subrsb, fill->outbuf, &fill->offset); + + error = pack_lkb_queues(subrsb, fill); + if (error) + goto nospace; + + up_write(&subrsb->res_lock); + + return 0; + + nospace: + up_write(&subrsb->res_lock); + fill->subrsb = subrsb; + + return -ENOSPC; +} + +static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb, + rcom_fill_t *fill) +{ + struct dlm_rsb *subrsb; + int error = 0; + + /* + * When an initial subrsb is given, we know it needs to be packed. + * When no initial subrsb is given, begin with the first (if any exist). + */ + + if (!in_subrsb) { + if (list_empty(&rsb->res_subreslist)) + goto out; + + subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb, + res_subreslist); + } else + subrsb = in_subrsb; + + for (;;) { + error = pack_one_subrsb(rsb, subrsb, fill); + if (error) + goto out; + + if (last_in_list(subrsb, &rsb->res_subreslist)) + break; + + subrsb = next_subrsb(subrsb); + } + + out: + return error; +} + +/* + * Finish packing whatever is left in an rsb tree. If space runs out while + * finishing, save subrsb/lkb and this will be called again for the same rsb. + * + * !subrsb && lkb, we left off part way through root rsb's lkbs. + * subrsb && !lkb, we left off just before starting a new subrsb. + * subrsb && lkb, we left off part way through a subrsb's lkbs. + * !subrsb && !lkb, we shouldn't be in this function, but starting + * a new rsb in pack_rsb_tree(). + */ + +static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb, + rcom_fill_t *fill) +{ + struct dlm_rsb *subrsb = NULL; + int error = 0; + + if (!fill->subrsb && fill->lkb) { + error = pack_lkb_remaining(rsb, fill); + if (error) + goto out; + + error = pack_subrsbs(rsb, NULL, fill); + if (error) + goto out; + } + + else if (fill->subrsb && !fill->lkb) { + error = pack_subrsbs(rsb, fill->subrsb, fill); + if (error) + goto out; + } + + else if (fill->subrsb && fill->lkb) { + error = pack_lkb_remaining(fill->subrsb, fill); + if (error) + goto out; + + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist)) + goto out; + + subrsb = next_subrsb(fill->subrsb); + + error = pack_subrsbs(rsb, subrsb, fill); + if (error) + goto out; + } + + fill->subrsb = NULL; + fill->lkb = NULL; + + out: + return error; +} + +/* + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a + * buffer. When the buffer runs out of space, save the place to restart (the + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit). + */ + +static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb, + rcom_fill_t *fill) +{ + int error = -ENOSPC; + + fill->remasterid = 0; + + /* + * Pack the root rsb itself. A 1 byte type precedes the serialised + * rsb. Then pack the lkb's for the root rsb. 
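+	 * If even the root rsb doesn't fit in the buffer, -ENOSPC is returned
+	 * and the caller retries the same rsb with the next buffer.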
+ */ + + down_write(&rsb->res_lock); + + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen) + goto out; + + rsb->res_remasterid = ++fill->remasterid; + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset); + serialise_rsb(rsb, fill->outbuf, &fill->offset); + + error = pack_lkb_queues(rsb, fill); + if (error) + goto out; + + up_write(&rsb->res_lock); + + /* + * Pack subrsb/lkb's under the root rsb. + */ + + error = pack_subrsbs(rsb, NULL, fill); + + return error; + + out: + up_write(&rsb->res_lock); + return error; +} + +/* + * Given an RSB, return the next RSB that should be sent to a new master. + */ + +static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls, + struct dlm_rsb *rsb) +{ + struct list_head *tmp, *start, *end; + struct dlm_rsb *r; + + if (!rsb) + start = ls->ls_rootres.next; + else + start = rsb->res_rootlist.next; + + end = &ls->ls_rootres; + + for (tmp = start; tmp != end; tmp = tmp->next) { + r = list_entry(tmp, struct dlm_rsb, res_rootlist); + + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) { + if (r->res_nodeid && lkbs_to_remaster(r)) { + expect_new_lkids(r); + return r; + } else + clear_bit(RESFL_NEW_MASTER, &r->res_flags); + } + } + + return NULL; +} + +/* + * Given an rcom buffer, fill it with RSB's that need to be sent to a single + * new master node. In the case where all the data to send to one node + * requires multiple messages, this function needs to resume filling each + * successive buffer from the point where it left off when the previous buffer + * filled up. + */ + +static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill, + uint32_t *nodeid) +{ + struct dlm_rsb *rsb, *prev_rsb = fill->rsb; + int error; + + fill->offset = 0; + + if (!prev_rsb) { + + /* + * The first time this function is called. + */ + + rsb = next_remastered_rsb(ls, NULL); + if (!rsb) + goto no_more; + + } else if (fill->subrsb || fill->lkb) { + + /* + * Continue packing an rsb tree that was partially packed last + * time (fill->subrsb/lkb indicates where packing of last block + * left off) + */ + + rsb = prev_rsb; + *nodeid = rsb->res_nodeid; + + error = pack_rsb_tree_remaining(ls, rsb, fill); + if (error == -ENOSPC) + goto more; + + rsb = next_remastered_rsb(ls, prev_rsb); + if (!rsb) + goto no_more; + + if (rsb->res_nodeid != prev_rsb->res_nodeid) + goto more; + } else { + rsb = prev_rsb; + } + + /* + * Pack rsb trees into the buffer until we run out of space, run out of + * new rsb's or hit a new nodeid. + */ + + *nodeid = rsb->res_nodeid; + + for (;;) { + error = pack_rsb_tree(ls, rsb, fill); + if (error == -ENOSPC) + goto more; + + prev_rsb = rsb; + + rsb = next_remastered_rsb(ls, prev_rsb); + if (!rsb) + goto no_more; + + if (rsb->res_nodeid != prev_rsb->res_nodeid) + goto more; + } + + more: + fill->more = 1; + fill->rsb = rsb; + return; + + no_more: + fill->more = 0; +} + +/* + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters. 
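+ * Buffers are filled and sent as RECCOMM_NEWLOCKS messages, one target node
+ * at a time, until fill_rcom_buffer() reports nothing more to send; we then
+ * wait for the new lock ids to come back (recover_list_empty) before
+ * recovery continues.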
+ */ + +int rebuild_rsbs_send(struct dlm_ls *ls) +{ + struct dlm_rcom *rc; + rcom_fill_t fill; + uint32_t nodeid; + int error; + + DLM_ASSERT(recover_list_empty(ls),); + + log_all(ls, "rebuild locks"); + + error = -ENOMEM; + rc = allocate_rcom_buffer(ls); + if (!rc) + goto ret; + + down_read(&ls->ls_root_lock); + + error = 0; + memset(&fill, 0, sizeof(rcom_fill_t)); + fill.outbuf = rc->rc_buf; + fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); + + do { + fill_rcom_buffer(ls, &fill, &nodeid); + if (!fill.offset) + break; + + rc->rc_datalen = fill.offset; + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0); + if (error) { + up_read(&ls->ls_root_lock); + goto out; + } + + schedule(); + error = dlm_recovery_stopped(ls); + if (error) { + up_read(&ls->ls_root_lock); + goto out; + } + } + while (fill.more); + + up_read(&ls->ls_root_lock); + + error = dlm_wait_function(ls, &recover_list_empty); + + log_all(ls, "rebuilt %d locks", fill.count); + + out: + free_rcom_buffer(rc); + + ret: + return error; +} + +static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid, + struct dlm_rsb *rootrsb) +{ + struct dlm_rsb *rsb; + + DLM_ASSERT(rootrsb,); + + if (rootrsb->res_remasterid == remasterid) { + rsb = rootrsb; + goto out; + } + + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) { + if (rsb->res_remasterid == remasterid) + goto out; + } + rsb = NULL; + + out: + return rsb; +} + +/* + * Search a queue for the given remote lock id (remlkid). + */ + +static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid, + int remid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, statequeue, lkb_statequeue) { + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) { + return lkb; + } + } + + return NULL; +} + +/* + * Given a remote lock ID (and a parent resource), return the local LKB for it + * Hopefully we dont need to do this too often on deep lock trees. This is + * VERY suboptimal for anything but the smallest lock trees. It searches the + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and + * returns the LKB address. 
OPTIMISATION: we should keep a list of these while + * we are building up the remastered LKBs + */ + +static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid, + int remid) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid); + if (lkb) + goto out; + + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) { + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid); + if (lkb) + goto out; + + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid); + if (lkb) + goto out; + } + lkb = NULL; + + out: + return lkb; +} + +/* + * Unpack an LKB from a remaster operation + */ + +static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid, + struct dlm_rsb *rootrsb, char *buf, int *ptr, + char *outbuf, int *outoffp) +{ + struct dlm_lkb *lkb, *exist_lkb = NULL; + struct dlm_rsb *rsb; + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp; + + remote_lkid = get_int(buf, ptr); + + rsb_rmid = get_int(buf, ptr); + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb); + DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid);); + + /* + * We could have received this lkb already from a previous recovery + * that was interrupted. We still need to advance ptr so read in + * lkb and then release it. FIXME: verify this is valid. + */ + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid); + if (lkb) { + log_all(ls, "lkb %x exists %s", remote_lkid, rsb->res_name); + exist_lkb = lkb; + } + + lkb = create_lkb(ls); + if (!lkb) + goto out; + + lkb->lkb_remid = remote_lkid; + lkb->lkb_flags = get_int(buf, ptr); + status = get_int(buf, ptr); + lkb->lkb_rqmode = get_char(buf, ptr); + lkb->lkb_grmode = get_char(buf, ptr); + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr)); + + parentid = get_int(buf, ptr); + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr); + lkb->lkb_ownpid = get_int(buf, ptr); + + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) { + lkb->lkb_lvbptr = allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + goto out; + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr); + } + + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) { + uint64_t start, end; + + /* Don't need to keep the range flag, for comms use only */ + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE; + start = get_int64(buf, ptr); + end = get_int64(buf, ptr); + + lkb->lkb_range = allocate_range(ls); + if (!lkb->lkb_range) + goto out; + + switch (status) { + case GDLM_LKSTS_CONVERT: + lkb->lkb_range[RQ_RANGE_START] = start; + lkb->lkb_range[RQ_RANGE_END] = end; + start = get_int64(buf, ptr); + end = get_int64(buf, ptr); + lkb->lkb_range[GR_RANGE_START] = start; + lkb->lkb_range[GR_RANGE_END] = end; + + case GDLM_LKSTS_WAITING: + lkb->lkb_range[RQ_RANGE_START] = start; + lkb->lkb_range[RQ_RANGE_END] = end; + break; + + case GDLM_LKSTS_GRANTED: + lkb->lkb_range[GR_RANGE_START] = start; + lkb->lkb_range[GR_RANGE_END] = end; + break; + default: + DLM_ASSERT(0,); + } + } + + if (exist_lkb) { + /* verify lkb and exist_lkb values match? 
*/ + release_lkb(ls, lkb); + lkb = exist_lkb; + goto put_lkid; + } + + /* Resolve local lock LKB address from parent ID */ + if (parentid) + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid, + parentid); + + atomic_inc(&rsb->res_ref); + lkb->lkb_resource = rsb; + + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY; + lkb->lkb_nodeid = rem_nodeid; + + /* + * Put the lkb on an RSB queue. An lkb that's in the midst of a + * conversion request (on the requesting node's lockqueue and has + * LQCONVERT set) should be put on the granted queue. The convert + * request will be resent by the requesting node. + */ + + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) { + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT; + DLM_ASSERT(status == GDLM_LKSTS_CONVERT, + printk("status=%d\n", status);); + lkb->lkb_rqmode = DLM_LOCK_IV; + status = GDLM_LKSTS_GRANTED; + } + + lkb_enqueue(rsb, lkb, status); + + /* + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL). + */ + + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK) + && lkb->lkb_grmode > DLM_LOCK_NL) { + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(ls); + if (!rsb->res_lvbptr) + goto out; + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + } + + /* + * Clear flags that may have been sent over that are only relevant in + * the context of the sender. + */ + + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND | + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED); + + put_lkid: + /* Return the new LKID to the caller's buffer */ + put_int(lkb->lkb_id, outbuf, outoffp); + put_int(lkb->lkb_remid, outbuf, outoffp); + error = 0; + + out: + return error; +} + +static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid, + struct dlm_rsb *rootrsb, char *buf, + int *ptr) +{ + int length; + int remasterid; + int parent_remasterid; + char name[DLM_RESNAME_MAXLEN]; + int error; + struct dlm_rsb *parent = NULL; + struct dlm_rsb *rsb; + + get_bytes(name, &length, buf, ptr); + remasterid = get_int(buf, ptr); + parent_remasterid = get_int(buf, ptr); + + if (parent_remasterid) + parent = find_by_remasterid(ls, parent_remasterid, rootrsb); + + /* + * The rsb reference from this find_or_create_rsb() will keep the rsb + * around while we add new lkb's to it from deserialise_lkb. Each of + * the lkb's will add an rsb reference. The reference added here is + * removed by release_rsb() after all lkb's are added. + */ + + error = find_rsb(ls, parent, name, length, CREATE, &rsb); + DLM_ASSERT(!error,); + + set_bit(RESFL_MASTER, &rsb->res_flags); + + /* There is a case where the above needs to create the RSB. */ + if (rsb->res_nodeid == -1) + rsb->res_nodeid = our_nodeid(); + + rsb->res_remasterid = remasterid; + + return rsb; +} + +/* + * Processing at the receiving end of a NEWLOCKS message from a node in + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote + * node whose locks we are now mastering. For a reply we need to send back the + * new lockids of the remastered locks so that remote ops can find them. + */ + +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len) +{ + struct dlm_rcom *rc; + struct dlm_rsb *rsb = NULL; + rebuild_node_t *rnode; + char *outbuf; + int outptr, ptr = 0, error = -ENOMEM; + + rnode = find_rebuild_root(ls, nodeid); + if (!rnode) + goto out; + + /* + * Allocate a buffer for the reply message which is a list of remote + * lock IDs and their (new) local lock ids. It will always be big + * enough to fit ID pairs if it already fit LKBs. 
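+	 * (Each serialised lkb is at least 30 bytes but contributes only two
+	 * 4-byte ids to the reply.)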
+ */ + + rc = allocate_rcom_buffer(ls); + if (!rc) + goto out; + outbuf = rc->rc_buf; + outptr = 0; + + /* + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're + * created. Each deserialise_rsb adds an rsb reference that must be + * removed with release_rsb once all new lkb's for an rsb have been + * added. + */ + + while (ptr < len) { + int type; + + type = get_char(buf, &ptr); + + switch (type) { + case REMASTER_ROOTRSB: + if (rsb) + release_rsb(rsb); + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf, + &ptr); + rnode->rootrsb = rsb; + break; + + case REMASTER_RSB: + if (rsb) + release_rsb(rsb); + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf, + &ptr); + break; + + case REMASTER_LKB: + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr, + outbuf, &outptr); + break; + + default: + DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d " + "len=%d\n", type, nodeid, ptr, + len);); + } + } + + if (rsb) + release_rsb(rsb); + + /* + * Reply with the new lock IDs. + */ + + rc->rc_datalen = outptr; + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0); + + free_rcom_buffer(rc); + + out: + return error; +} + +/* + * Processing for a NEWLOCKIDS message. Called when we get the reply from the + * new master telling us what the new remote lock IDs are for the remastered + * locks + */ + +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len) +{ + int offset = 0; + + if (len == 1) + len = 0; + + while (offset < len) { + int remote_id; + int local_id; + struct dlm_lkb *lkb; + + if (offset + 8 > len) { + log_error(ls, "rebuild_rsbs_lkids_recv: bad data " + "length nodeid=%d offset=%d len=%d", + nodeid, offset, len); + break; + } + + remote_id = get_int(buf, &offset); + local_id = get_int(buf, &offset); + + lkb = find_lock_by_id(ls, local_id); + if (lkb) { + lkb->lkb_remid = remote_id; + have_new_lkid(lkb); + } else { + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid " + "nodeid=%d id=%x remid=%x offset=%d len=%d", + nodeid, local_id, remote_id, offset, len); + } + } + + if (recover_list_empty(ls)) + wake_up(&ls->ls_wait_general); + + return 0; +} diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rebuild.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __REBUILD_DOT_H__ +#define __REBUILD_DOT_H__ + +int rebuild_rsbs_send(struct dlm_ls *ls); +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len); +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len); +int rebuild_freemem(struct dlm_ls *ls); + +#endif /* __REBUILD_DOT_H__ */ diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/reccomms.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,447 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "reccomms.h" +#include "nodes.h" +#include "lockspace.h" +#include "recover.h" +#include "dir.h" +#include "config.h" +#include "rebuild.h" +#include "memory.h" + +/* Running on the basis that only a single recovery communication will be done + * at a time per lockspace */ + +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc); + +static int rcom_response(struct dlm_ls *ls) +{ + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags); +} + +/** + * rcom_send_message - send or request recovery data + * @ls: the lockspace + * @nodeid: node to which the message is sent + * @type: type of recovery message + * @rc: the rc buffer to send + * @need_reply: wait for reply if this is set + * + * Using this interface + * i) Allocate an rc buffer: + * rc = allocate_rcom_buffer(ls); + * ii) Copy data to send beginning at rc->rc_buf: + * memcpy(rc->rc_buf, mybuf, mylen); + * iii) Set rc->rc_datalen to the number of bytes copied in (ii): + * rc->rc_datalen = mylen + * iv) Submit the rc to this function: + * rcom_send_message(rc); + * + * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct + * dlm_rcom). If more data must be passed in one send, use + * rcom_expand_buffer() which incrementally increases the size of the rc buffer + * by dlm_config.buffer_size bytes. + * + * Any data returned for the message (when need_reply is set) will saved in + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the + * number of bytes copied into rc->rc_buf. + * + * Returns: 0 on success, -EXXX on failure + */ + +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type, + struct dlm_rcom *rc, int need_reply) +{ + int error = 0; + + if (!rc->rc_datalen) + rc->rc_datalen = 1; + + /* + * Fill in the header. 
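+	 * rh_length counts the struct plus the payload minus one, because the
+	 * first payload byte is the rc_buf byte inside struct dlm_rcom itself
+	 * (presumably also why rc_datalen is forced to at least 1 above).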
+ */ + + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE; + rc->rc_header.rh_lockspace = ls->ls_global_id; + rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1; + rc->rc_subcmd = type; + rc->rc_msgid = ++ls->ls_rcom_msgid; + + /* + * When a reply is received, the reply data goes back into this buffer. + * Synchronous rcom requests (need_reply=1) are serialised because of + * the single ls_rcom. + */ + + if (need_reply) { + down(&ls->ls_rcom_lock); + ls->ls_rcom = rc; + } + + /* + * After sending the message we'll wait at the end of this function to + * get a reply. The READY flag will be set when the reply has been + * received and requested data has been copied into + * ls->ls_rcom->rc_buf; + */ + + DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),); + + /* + * The WAIT bit indicates that we're waiting for and willing to accept a + * reply. Any replies are ignored unless this bit is set. + */ + + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags); + + /* + * Process the message locally. + */ + + if (nodeid == our_nodeid()) { + rcom_process_message(ls, nodeid, rc); + goto out; + } + + /* + * Send the message. + */ + + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid); + + error = midcomms_send_message(nodeid, (struct dlm_header *) rc, + GFP_KERNEL); + DLM_ASSERT(error >= 0, printk("error = %d\n", error);); + error = 0; + + /* + * Wait for a reply. Once a reply is processed from midcomms, the + * READY bit will be set and we'll be awoken (dlm_wait_function will + * return 0). + */ + + if (need_reply) { + error = dlm_wait_function(ls, &rcom_response); + if (error) + log_debug(ls, "rcom wait error %d", error); + } + + out: + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags); + + if (need_reply) + up(&ls->ls_rcom_lock); + + return error; +} + +/* + * Runs in same context as midcomms. 
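+ * Replies normally go back through midcomms; when the request originated on
+ * this node the reply data is copied straight into ls_rcom instead.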
+ */ + +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc) +{ + struct dlm_rcom rc_stack; + struct dlm_rcom *reply = NULL; + int status, datalen, maxlen; + uint32_t r_nodeid, be_nodeid; + + if (!ls) + return; + + if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) { + log_error(ls, "ignoring recovery message %x from %u", + rc->rc_subcmd, nodeid); + return; + } + + switch (rc->rc_subcmd) { + + case RECCOMM_STATUS: + + memset(&rc_stack, 0, sizeof(struct dlm_rcom)); + reply = &rc_stack; + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + reply->rc_buf[0] = 0; + + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= RESDIR_VALID; + + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= RESDIR_ALL_VALID; + + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= NODES_VALID; + + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags)) + reply->rc_buf[0] |= NODES_ALL_VALID; + + reply->rc_datalen = 1; + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid); + break; + + case RECCOMM_RECOVERNAMES: + + reply = allocate_rcom_buffer(ls); + DLM_ASSERT(reply,); + maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + + /* + * The other node wants a bunch of resource names. The name of + * the resource to begin with is in rc->rc_buf. + */ + + datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen, + reply->rc_buf, maxlen, nodeid); + + reply->rc_datalen = datalen; + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid, + reply->rc_msgid); + break; + + case RECCOMM_GETMASTER: + + reply = allocate_rcom_buffer(ls); + DLM_ASSERT(reply,); + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + + /* + * The other node wants to know the master of a named resource. 
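+		 * The reply is the master's nodeid as a single big-endian
+		 * uint32_t.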
+ */ + + status = dlm_dir_lookup(ls, nodeid, rc->rc_buf, rc->rc_datalen, + &r_nodeid); + if (status != 0) { + log_all(ls, "rcom lookup error %d", status); + free_rcom_buffer(reply); + reply = NULL; + return; + } + be_nodeid = cpu_to_be32(r_nodeid); + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t)); + reply->rc_datalen = sizeof(uint32_t); + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + break; + + case RECCOMM_BULKLOOKUP: + + reply = allocate_rcom_buffer(ls); + DLM_ASSERT(reply,); + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + + /* + * This is a bulk version of the above and just returns a + * buffer full of node ids to match the resources + */ + + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf, + rc->rc_datalen, reply->rc_buf); + if (datalen < 0) { + free_rcom_buffer(reply); + reply = NULL; + return; + } + + reply->rc_datalen = datalen; + reply->rc_header.rh_length = + sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + break; + + /* + * These RECCOMM messages don't need replies. + */ + + case RECCOMM_NEWLOCKS: + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen); + break; + + case RECCOMM_NEWLOCKIDS: + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen); + break; + + case RECCOMM_REMRESDATA: + dlm_dir_remove(ls, nodeid, rc->rc_buf, rc->rc_datalen); + break; + + default: + DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd);); + } + + if (reply) { + if (nodeid == our_nodeid()) { + DLM_ASSERT(rc == ls->ls_rcom,); + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen); + rc->rc_datalen = reply->rc_datalen; + } else { + midcomms_send_message(nodeid, + (struct dlm_header *) reply, + GFP_KERNEL); + } + + if (reply != &rc_stack) + free_rcom_buffer(reply); + } +} + +static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid, + struct dlm_rcom *reply) +{ + struct dlm_rcom *rc = ls->ls_rcom; + + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) { + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid); + return; + } + + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) { + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u", + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid); + return; + } + + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen); + rc->rc_datalen = reply->rc_datalen; + + /* + * Tell the thread waiting in rcom_send_message() that it can go ahead. + */ + + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags); + wake_up(&ls->ls_wait_general); +} + +static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid, + struct dlm_rcom *reply) +{ + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen, + reply->rc_msgid); +} + +/* + * Runs in same context as midcomms. 
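+ * GETMASTER replies are handled asynchronously; STATUS, RECOVERNAMES,
+ * NEWLOCKS and NEWLOCKIDS replies wake the thread waiting in
+ * rcom_send_message().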
+ */ + +static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid, + struct dlm_rcom *reply) +{ + if (dlm_recovery_stopped(ls)) { + log_error(ls, "ignoring recovery reply %x from %u", + reply->rc_subcmd, nodeid); + return; + } + + switch (reply->rc_subcmd) { + case RECCOMM_GETMASTER: + process_reply_async(ls, nodeid, reply); + break; + case RECCOMM_STATUS: + case RECCOMM_NEWLOCKS: + case RECCOMM_NEWLOCKIDS: + case RECCOMM_RECOVERNAMES: + process_reply_sync(ls, nodeid, reply); + break; + default: + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u", + reply->rc_subcmd, nodeid); + } +} + + +static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header) +{ + struct writequeue_entry *wq; + struct dlm_rcom *rc = (struct dlm_rcom *) header; + struct dlm_rcom *reply; + + wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL, + (char **)&reply); + if (!wq) + return -ENOMEM; + + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY; + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace; + reply->rc_subcmd = rc->rc_subcmd; + reply->rc_msgid = rc->rc_msgid; + reply->rc_buf[0] = 0; + + reply->rc_datalen = 1; + reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1; + + midcomms_send_buffer((struct dlm_header *)reply, wq); + return 0; +} + + +/* + * Runs in same context as midcomms. Both recovery requests and recovery + * replies come through this function. + */ + +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header) +{ + struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace); + struct dlm_rcom *rc = (struct dlm_rcom *) header; + + /* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + + if (!ls) { + send_ls_not_ready(nodeid, header); + return; + } + + switch (header->rh_cmd) { + case GDLM_REMCMD_RECOVERMESSAGE: + rcom_process_message(ls, nodeid, rc); + break; + + case GDLM_REMCMD_RECOVERREPLY: + rcom_process_reply(ls, nodeid, rc); + break; + + default: + DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd);); + } + + put_lockspace(ls); +} + diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/reccomms.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,36 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECCOMMS_DOT_H__ +#define __RECCOMMS_DOT_H__ + +/* Bit flags */ + +#define RESDIR_VALID (1) +#define RESDIR_ALL_VALID (2) +#define NODES_VALID (4) +#define NODES_ALL_VALID (8) + +#define RECCOMM_STATUS (1) +#define RECCOMM_RECOVERNAMES (2) +#define RECCOMM_GETMASTER (3) +#define RECCOMM_BULKLOOKUP (4) +#define RECCOMM_NEWLOCKS (5) +#define RECCOMM_NEWLOCKIDS (6) +#define RECCOMM_REMRESDATA (7) + +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type, + struct dlm_rcom *rc, int need_reply); +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header); + +#endif diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recover.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,611 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "reccomms.h" +#include "dir.h" +#include "locking.h" +#include "rsb.h" +#include "lockspace.h" +#include "lkb.h" +#include "nodes.h" +#include "config.h" +#include "ast.h" +#include "memory.h" + +/* + * Called in recovery routines to check whether the recovery process has been + * interrupted/stopped by another transition. A recovery in-process will abort + * if the lockspace is "stopped" so that a new recovery process can start from + * the beginning when the lockspace is "started" again. + */ + +int dlm_recovery_stopped(struct dlm_ls *ls) +{ + return test_bit(LSFL_LS_STOP, &ls->ls_flags); +} + +static void dlm_wait_timer_fn(unsigned long data) +{ + struct dlm_ls *ls = (struct dlm_ls *) data; + + wake_up(&ls->ls_wait_general); +} + +/* + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP + * set due to failure of a node in ls_nodes). When another function thinks it + * could have completed the waited-on task, they should wake up ls_wait_general + * to get an immediate response rather than waiting for the timer to detect the + * result. A timer wakes us up periodically while waiting to see if we should + * abort due to a node failure. 
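+ * The timer is re-armed every dlm_config.recover_timer seconds; -1 is
+ * returned if the lockspace was stopped while waiting.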
+ */ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) +{ + struct timer_list timer; + int error = 0; + + init_timer(&timer); + timer.function = dlm_wait_timer_fn; + timer.data = (long) ls; + + for (;;) { + mod_timer(&timer, jiffies + (dlm_config.recover_timer * HZ)); + + wchan_cond_sleep_intr(ls->ls_wait_general, + !testfn(ls) && + !test_bit(LSFL_LS_STOP, &ls->ls_flags)); + + if (timer_pending(&timer)) + del_timer(&timer); + + if (testfn(ls)) + break; + + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) { + error = -1; + break; + } + } + + return error; +} + +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status) +{ + struct dlm_rcom rc_stack, *rc; + struct dlm_csb *csb; + int status; + int error = 0; + + memset(&rc_stack, 0, sizeof(struct dlm_rcom)); + rc = &rc_stack; + rc->rc_datalen = 0; + + list_for_each_entry(csb, &ls->ls_nodes, list) { + for (;;) { + error = dlm_recovery_stopped(ls); + if (error) + goto out; + + error = rcom_send_message(ls, csb->node->nodeid, + RECCOMM_STATUS, rc, 1); + if (error) + goto out; + + status = rc->rc_buf[0]; + if (status & wait_status) + break; + else { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ >> 1); + } + } + } + + out: + return error; +} + +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status) +{ + struct dlm_rcom rc_stack, *rc; + uint32_t nodeid = ls->ls_low_nodeid; + int status; + int error = 0; + + memset(&rc_stack, 0, sizeof(struct dlm_rcom)); + rc = &rc_stack; + rc->rc_datalen = 0; + + for (;;) { + error = dlm_recovery_stopped(ls); + if (error) + goto out; + + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1); + if (error) + break; + + status = rc->rc_buf[0]; + if (status & wait_status) + break; + else { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ >> 1); + } + } + + out: + return error; +} + +static int purge_queue(struct dlm_ls *ls, struct list_head *queue) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_rsb *rsb; + int count = 0; + + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { + if (!lkb->lkb_nodeid) + continue; + + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,); + + if (in_nodes_gone(ls, lkb->lkb_nodeid)) { + list_del(&lkb->lkb_statequeue); + + rsb = lkb->lkb_resource; + lkb->lkb_status = 0; + + if (lkb->lkb_status == GDLM_LKSTS_CONVERT + && &lkb->lkb_duetime) + remove_from_deadlockqueue(lkb); + + release_lkb(ls, lkb); + release_rsb_locked(rsb); + count++; + } + } + + return count; +} + +/* + * Go through local restbl and for each rsb we're master of, clear out any + * lkb's held by departed nodes. + */ + +int restbl_lkb_purge(struct dlm_ls *ls) +{ + struct list_head *tmp2, *safe2; + int count = 0; + struct dlm_rsb *rootrsb, *safe, *rsb; + + log_all(ls, "purge locks of departed nodes"); + down_write(&ls->ls_root_lock); + + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) { + + if (rootrsb->res_nodeid) + continue; + + hold_rsb(rootrsb); + down_write(&rootrsb->res_lock); + + /* This traverses the subreslist in reverse order so we purge + * the children before their parents. 
*/ + + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev; + tmp2 != &rootrsb->res_subreslist; + tmp2 = safe2, safe2 = safe2->prev) { + rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist); + + hold_rsb(rsb); + purge_queue(ls, &rsb->res_grantqueue); + purge_queue(ls, &rsb->res_convertqueue); + purge_queue(ls, &rsb->res_waitqueue); + release_rsb_locked(rsb); + } + count += purge_queue(ls, &rootrsb->res_grantqueue); + count += purge_queue(ls, &rootrsb->res_convertqueue); + count += purge_queue(ls, &rootrsb->res_waitqueue); + + up_write(&rootrsb->res_lock); + release_rsb_locked(rootrsb); + } + + up_write(&ls->ls_root_lock); + log_all(ls, "purged %d locks", count); + + return 0; +} + +/* + * Grant any locks that have become grantable after a purge + */ + +int restbl_grant_after_purge(struct dlm_ls *ls) +{ + struct dlm_rsb *root, *rsb, *safe; + int error = 0; + + down_read(&ls->ls_root_lock); + + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) { + /* only the rsb master grants locks */ + if (root->res_nodeid) + continue; + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + log_debug(ls, "restbl_grant_after_purge aborted"); + error = -EINTR; + up_read(&ls->ls_root_lock); + goto out; + } + + down_write(&root->res_lock); + grant_pending_locks(root); + up_write(&root->res_lock); + + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){ + down_write(&rsb->res_lock); + grant_pending_locks(rsb); + up_write(&rsb->res_lock); + } + } + up_read(&ls->ls_root_lock); + wake_astd(); + out: + return error; +} + +/* + * Set the lock master for all LKBs in a lock queue + */ + +static void set_lock_master(struct list_head *queue, int nodeid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, queue, lkb_statequeue) { + /* Don't muck around with pre-exising sublocks */ + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY)) + lkb->lkb_nodeid = nodeid; + } +} + +static void set_master_lkbs(struct dlm_rsb *rsb) +{ + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid); + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid); + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid); +} + +/* + * This rsb struct is now the master so it is responsible for keeping the + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for + * this rsb in deserialise_lkb. + */ + +static void set_rsb_lvb(struct dlm_rsb *rsb) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) { + + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) && + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) && + (lkb->lkb_grmode > DLM_LOCK_NL)) + { + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); + + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + return; + } + } + + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) { + + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) && + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) && + (lkb->lkb_grmode > DLM_LOCK_NL)) + { + if (!rsb->res_lvbptr) + rsb->res_lvbptr = allocate_lvb(rsb->res_ls); + + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN); + return; + } + } +} + +/* + * Propogate the new master nodeid to locks, subrsbs, sublocks. + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider. 
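+ * If we are the new master, res_nodeid becomes 0, the MASTER flag is set and
+ * the rsb lvb is refreshed from an lkb holding an up to date copy; otherwise
+ * res_nodeid is simply set to the new remote master.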
+ */ + +static void set_new_master(struct dlm_rsb *rsb, uint32_t nodeid) +{ + struct dlm_rsb *subrsb; + + down_write(&rsb->res_lock); + + if (nodeid == our_nodeid()) { + set_bit(RESFL_MASTER, &rsb->res_flags); + rsb->res_nodeid = 0; + set_rsb_lvb(rsb); + } else + rsb->res_nodeid = nodeid; + + set_master_lkbs(rsb); + + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) { + subrsb->res_nodeid = rsb->res_nodeid; + set_master_lkbs(subrsb); + } + + up_write(&rsb->res_lock); + + set_bit(RESFL_NEW_MASTER, &rsb->res_flags); +} + +/* + * The recover_list contains all the rsb's for which we've requested the new + * master nodeid. As replies are returned from the resource directories the + * rsb's are removed from the list. When the list is empty we're done. + * + * The recover_list is later similarly used for all rsb's for which we've sent + * new lkb's and need to receive new corresponding lkid's. + */ + +int recover_list_empty(struct dlm_ls *ls) +{ + int empty; + + spin_lock(&ls->ls_recover_list_lock); + empty = list_empty(&ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); + + return empty; +} + +int recover_list_count(struct dlm_ls *ls) +{ + int count; + + spin_lock(&ls->ls_recover_list_lock); + count = ls->ls_recover_list_count; + spin_unlock(&ls->ls_recover_list_lock); + + return count; +} + +void recover_list_add(struct dlm_rsb *rsb) +{ + struct dlm_ls *ls = rsb->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) { + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list); + ls->ls_recover_list_count++; + hold_rsb(rsb); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +void recover_list_del(struct dlm_rsb *rsb) +{ + struct dlm_ls *ls = rsb->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags); + list_del(&rsb->res_recover_list); + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_list_lock); + + release_rsb(rsb); +} + +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid) +{ + struct dlm_rsb *rsb = NULL; + + spin_lock(&ls->ls_recover_list_lock); + + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) { + if (rsb->res_recover_msgid == msgid) + goto rec_found; + } + rsb = NULL; + + rec_found: + spin_unlock(&ls->ls_recover_list_lock); + return rsb; +} + +static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc) +{ + struct dlm_ls *ls = rsb->res_ls; + uint32_t dir_nodeid, r_nodeid; + int error; + + dir_nodeid = get_directory_nodeid(rsb); + + if (dir_nodeid == our_nodeid()) { + error = dlm_dir_lookup(ls, dir_nodeid, rsb->res_name, + rsb->res_length, &r_nodeid); + if (error == -EEXIST) { + log_all(ls, "rsb_master_lookup %u EEXIST %s", + r_nodeid, rsb->res_name); + } else if (error) + goto fail; + + set_new_master(rsb, r_nodeid); + } else { + /* As we are the only thread doing recovery this + should be safe. if not then we need to use a different + ID somehow. We must set it in the RSB before rcom_send_msg + completes cos we may get a reply quite quickly. 
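+		   The reply is matched back to this rsb by
+		   restbl_rsb_update_recv() via recover_list_find() on
+		   res_recover_msgid.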
+ */ + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1; + + recover_list_add(rsb); + + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length); + rc->rc_datalen = rsb->res_length; + + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER, + rc, 0); + if (error) + goto fail; + } + + fail: + return error; +} + +static int needs_update(struct dlm_ls *ls, struct dlm_rsb *r) +{ + if (!r->res_nodeid) + return FALSE; + + if (r->res_nodeid == -1) + return FALSE; + + if (in_nodes_gone(ls, r->res_nodeid)) + return TRUE; + + return FALSE; +} + +/* + * Go through local root resources and for each rsb which has a master which + * has departed, get the new master nodeid from the resdir. The resdir will + * assign mastery to the first node to look up the new master. That means + * we'll discover in this lookup if we're the new master of any rsb's. + * + * We fire off all the resdir requests individually and asynchronously to the + * correct resdir node. The replies are processed in rsb_master_recv(). + */ + +int restbl_rsb_update(struct dlm_ls *ls) +{ + struct dlm_rsb *rsb, *safe; + struct dlm_rcom *rc; + int error = -ENOMEM; + int count = 0; + + log_all(ls, "update remastered resources"); + + rc = allocate_rcom_buffer(ls); + if (!rc) + goto out; + + down_read(&ls->ls_root_lock); + + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) { + error = dlm_recovery_stopped(ls); + if (error) { + up_read(&ls->ls_root_lock); + goto out_free; + } + + if (needs_update(ls, rsb)) { + error = rsb_master_lookup(rsb, rc); + if (error) { + up_read(&ls->ls_root_lock); + goto out_free; + } + count++; + } + } + up_read(&ls->ls_root_lock); + + error = dlm_wait_function(ls, &recover_list_empty); + + log_all(ls, "updated %d resources", count); + out_free: + free_rcom_buffer(rc); + out: + return error; +} + +int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf, + int length, int msgid) +{ + struct dlm_rsb *rsb; + uint32_t be_nodeid; + + rsb = recover_list_find(ls, msgid); + if (!rsb) { + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid); + goto out; + } + + memcpy(&be_nodeid, buf, sizeof(uint32_t)); + set_new_master(rsb, be32_to_cpu(be_nodeid)); + recover_list_del(rsb); + + if (recover_list_empty(ls)) + wake_up(&ls->ls_wait_general); + + out: + return 0; +} + +/* + * This function not used any longer. + */ + +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen, + char *outbuf) +{ + char *inbufptr, *outbufptr; + + /* + * The other node wants nodeids matching the resource names in inbuf. + * The resource names are packed into inbuf as + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is + * lenX bytes. Matching nodeids are packed into outbuf in order + * [nodeid1][nodeid2]... + */ + + inbufptr = inbuf; + outbufptr = outbuf; + + while (inbufptr < inbuf + inlen) { + uint32_t r_nodeid, be_nodeid; + int status; + + status = dlm_dir_lookup(ls, nodeid, inbufptr + 1, *inbufptr, + &r_nodeid); + if (status != 0) + goto fail; + + inbufptr += *inbufptr + 1; + + be_nodeid = cpu_to_be32(r_nodeid); + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t)); + outbufptr += sizeof(uint32_t); + + /* add assertion that outbufptr - outbuf is not > than ... 
*/ + } + + return (outbufptr - outbuf); + fail: + return -1; +} diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recover.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,33 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVER_DOT_H__ +#define __RECOVER_DOT_H__ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls)); +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status); +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status); +int dlm_recovery_stopped(struct dlm_ls *ls); +int recover_list_empty(struct dlm_ls *ls); +int recover_list_count(struct dlm_ls *ls); +void recover_list_add(struct dlm_rsb *rsb); +void recover_list_del(struct dlm_rsb *rsb); +int restbl_lkb_purge(struct dlm_ls *ls); +void restbl_grant_after_purge(struct dlm_ls *ls); +int restbl_rsb_update(struct dlm_ls *ls); +int restbl_rsb_update_recv(struct dlm_ls *ls, int nodeid, char *buf, int len, + int msgid); +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen, + char *outbuf); + +#endif /* __RECOVER_DOT_H__ */ diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recoverd.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,713 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "nodes.h" +#include "dir.h" +#include "ast.h" +#include "recover.h" +#include "lockspace.h" +#include "lowcomms.h" +#include "lockqueue.h" +#include "lkb.h" +#include "rebuild.h" + +/* + * next_move actions + */ + +#define DO_STOP (1) +#define DO_START (2) +#define DO_FINISH (3) +#define DO_FINISH_STOP (4) +#define DO_FINISH_START (5) + +/* + * Queue of lockspaces (dlm_recover structs) which need to be + * started/recovered + */ + +static int enable_locking(struct dlm_ls *ls, int event_id) +{ + int error = 0; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_last_stop < event_id) { + set_bit(LSFL_LS_RUN, &ls->ls_flags); + up_write(&ls->ls_in_recovery); + } else { + error = -EINTR; + log_debug(ls, "enable_locking: abort %d", event_id); + } + spin_unlock(&ls->ls_recover_lock); + return error; +} + +static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv) +{ + int error; + + log_all(ls, "recover event %u (first)", rv->event_id); + + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id); + + error = ls_nodes_init(ls, rv); + if (error) { + log_error(ls, "nodes_init failed %d", error); + goto out; + } + + error = dlm_dir_rebuild_local(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_local failed %d", error); + goto out; + } + + error = dlm_dir_rebuild_wait(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_wait failed %d", error); + goto out; + } + + log_all(ls, "recover event %u done", rv->event_id); + kcl_start_done(ls->ls_local_id, rv->event_id); + + out: + return error; +} + +/* + * We are given here a new group of nodes which are in the lockspace. We first + * figure out the differences in ls membership from when we were last running. + * If nodes from before are gone, then there will be some lock recovery to do. + * If there are only nodes which have joined, then there's no lock recovery. + * + * note: cman requires an rc to finish starting on an revent (where nodes die) + * before it allows an sevent (where nodes join) to be processed. This means + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA + * joined. + */ + +static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv) +{ + int error, neg = 0; + + log_all(ls, "recover event %u", rv->event_id); + + /* + * this list may be left over from a previous aborted recovery + */ + + rebuild_freemem(ls); + + /* + * Add or remove nodes from the lockspace's ls_nodes list. + */ + + error = ls_nodes_reconfig(ls, rv, &neg); + if (error) { + log_error(ls, "nodes_reconfig failed %d", error); + goto fail; + } + + /* + * Rebuild our own share of the resdir by collecting from all other + * nodes rsb name/master pairs for which the name hashes to us. + */ + + error = dlm_dir_rebuild_local(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_local failed %d", error); + goto fail; + } + + /* + * Purge resdir-related requests that are being held in requestqueue. + * All resdir requests from before recovery started are invalid now due + * to the resdir rebuild and will be resent by the requesting nodes. + */ + + purge_requestqueue(ls); + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags); + + /* + * Wait for all nodes to complete resdir rebuild. 
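+	 * A failure here aborts this recovery attempt; the lockspace waits
+	 * for a later start event to retry the reconfiguration.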
+ */ + + error = dlm_dir_rebuild_wait(ls); + if (error) { + log_error(ls, "dlm_dir_rebuild_wait failed %d", error); + goto fail; + } + + /* + * Mark our own lkb's waiting in the lockqueue for remote replies from + * nodes that are now departed. These will be resent to the new + * masters in resend_cluster_requests. Also mark resdir lookup + * requests for resending. + */ + + lockqueue_lkb_mark(ls); + + error = dlm_recovery_stopped(ls); + if (error) + goto fail; + + if (neg) { + /* + * Clear lkb's for departed nodes. This can't fail since it + * doesn't involve communicating with other nodes. + */ + + restbl_lkb_purge(ls); + + /* + * Get new master id's for rsb's of departed nodes. This fails + * if we can't communicate with other nodes. + */ + + error = restbl_rsb_update(ls); + if (error) { + log_error(ls, "restbl_rsb_update failed %d", error); + goto fail; + } + + /* + * Send our lkb info to new masters. This fails if we can't + * communicate with a node. + */ + + error = rebuild_rsbs_send(ls); + if (error) { + log_error(ls, "rebuild_rsbs_send failed %d", error); + goto fail; + } + } + + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags); + + log_all(ls, "recover event %u done", rv->event_id); + kcl_start_done(ls->ls_local_id, rv->event_id); + return 0; + + fail: + log_all(ls, "recover event %d error %d", rv->event_id, error); + return error; +} + +static void clear_finished_nodes(struct dlm_ls *ls, int finish_event) +{ + struct dlm_csb *csb, *safe; + + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) { + if (csb->gone_event <= finish_event) { + list_del(&csb->list); + release_csb(csb); + } + } +} + +/* + * Between calls to this routine for a ls, there can be multiple stop/start + * events from cman where every start but the latest is cancelled by stops. + * There can only be a single finish from cman because every finish requires us + * to call start_done. A single finish event could be followed by multiple + * stop/start events. This routine takes any combination of events from cman + * and boils them down to one course of action. + */ + +static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out, + int *finish_out) +{ + LIST_HEAD(events); + unsigned int cmd = 0, stop, start, finish; + unsigned int last_stop, last_start, last_finish; + struct dlm_recover *rv = NULL, *start_rv = NULL; + + /* + * Grab the current state of cman/sm events. + */ + + spin_lock(&ls->ls_recover_lock); + + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0; + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0; + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0; + + last_stop = ls->ls_last_stop; + last_start = ls->ls_last_start; + last_finish = ls->ls_last_finish; + + while (!list_empty(&ls->ls_recover)) { + rv = list_entry(ls->ls_recover.next, struct dlm_recover, list); + list_del(&rv->list); + list_add_tail(&rv->list, &events); + } + + /* + * There are two cases where we need to adjust these event values: + * 1. - we get a first start + * - we get a stop + * - we process the start + stop here and notice this special case + * + * 2. - we get a first start + * - we process the start + * - we get a stop + * - we process the stop here and notice this special case + * + * In both cases, the first start we received was aborted by a + * stop before we received a finish. last_finish being zero is the + * indication that this is the "first" start, i.e. we've not yet + * finished a start; if we had, last_finish would be non-zero. 
+ * Part of the problem arises from the fact that when we initially + * get start/stop/start, SM uses the same event id for both starts + * (since the first was cancelled). + * + * In both cases, last_start and last_stop will be equal. + * In both cases, finish=0. + * In the first case start=1 && stop=1. + * In the second case start=0 && stop=1. + * + * In both cases, we need to make adjustments to values so: + * - we process the current event (now) as a normal stop + * - the next start we receive will be processed normally + * (taking into account the assertions below) + * + * In the first case, dlm_ls_start() will have printed the + * "repeated start" warning. + * + * In the first case we need to get rid of the recover event struct. + * + * - set stop=1, start=0, finish=0 for case 4 below + * - last_stop and last_start must be set equal per the case 4 assert + * - ls_last_stop = 0 so the next start will be larger + * - ls_last_start = 0 not really necessary (avoids dlm_ls_start print) + */ + + if (!last_finish && (last_start == last_stop)) { + log_all(ls, "move reset %u,%u,%u ids %u,%u,%u", stop, + start, finish, last_stop, last_start, last_finish); + stop = 1; + start = 0; + finish = 0; + last_stop = 0; + last_start = 0; + ls->ls_last_stop = 0; + ls->ls_last_start = 0; + + while (!list_empty(&events)) { + rv = list_entry(events.next, struct dlm_recover, list); + list_del(&rv->list); + kfree(rv->nodeids); + kfree(rv); + } + } + spin_unlock(&ls->ls_recover_lock); + + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish, + last_stop, last_start, last_finish); + + /* + * Toss start events which have since been cancelled. + */ + + while (!list_empty(&events)) { + DLM_ASSERT(start,); + rv = list_entry(events.next, struct dlm_recover, list); + list_del(&rv->list); + + if (rv->event_id <= last_stop) { + log_debug(ls, "move skip event %u", rv->event_id); + kfree(rv->nodeids); + kfree(rv); + rv = NULL; + } else { + log_debug(ls, "move use event %u", rv->event_id); + DLM_ASSERT(!start_rv,); + start_rv = rv; + } + } + + /* + * Eight possible combinations of events. 
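+	 * The case numbers below are the (stop, start, finish) flags read as
+	 * a binary number: case = stop*4 + start*2 + finish.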
+ */ + + /* 0 */ + if (!stop && !start && !finish) { + DLM_ASSERT(!start_rv,); + cmd = 0; + goto out; + } + + /* 1 */ + if (!stop && !start && finish) { + DLM_ASSERT(!start_rv,); + DLM_ASSERT(last_start > last_stop,); + DLM_ASSERT(last_finish == last_start,); + cmd = DO_FINISH; + *finish_out = last_finish; + goto out; + } + + /* 2 */ + if (!stop && start && !finish) { + DLM_ASSERT(start_rv,); + DLM_ASSERT(last_start > last_stop,); + cmd = DO_START; + *rv_out = start_rv; + goto out; + } + + /* 3 */ + if (!stop && start && finish) { + DLM_ASSERT(0, printk("finish and start with no stop\n");); + } + + /* 4 */ + if (stop && !start && !finish) { + DLM_ASSERT(!start_rv,); + DLM_ASSERT(last_start == last_stop,); + cmd = DO_STOP; + goto out; + } + + /* 5 */ + if (stop && !start && finish) { + DLM_ASSERT(!start_rv,); + DLM_ASSERT(last_finish == last_start,); + DLM_ASSERT(last_stop == last_start,); + cmd = DO_FINISH_STOP; + *finish_out = last_finish; + goto out; + } + + /* 6 */ + if (stop && start && !finish) { + if (start_rv) { + DLM_ASSERT(last_start > last_stop,); + cmd = DO_START; + *rv_out = start_rv; + } else { + DLM_ASSERT(last_stop == last_start,); + cmd = DO_STOP; + } + goto out; + } + + /* 7 */ + if (stop && start && finish) { + if (start_rv) { + DLM_ASSERT(last_start > last_stop,); + DLM_ASSERT(last_start > last_finish,); + cmd = DO_FINISH_START; + *finish_out = last_finish; + *rv_out = start_rv; + } else { + DLM_ASSERT(last_start == last_stop,); + DLM_ASSERT(last_start > last_finish,); + cmd = DO_FINISH_STOP; + *finish_out = last_finish; + } + goto out; + } + + out: + return cmd; +} + +/* + * This function decides what to do given every combination of current + * lockspace state and next lockspace state. + */ + +static void do_ls_recovery(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL; + int error, cur_state, next_state = 0, do_now, finish_event = 0; + + do_now = next_move(ls, &rv, &finish_event); + if (!do_now) + goto out; + + cur_state = ls->ls_state; + next_state = 0; + + DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags), + log_error(ls, "curstate=%d donow=%d", cur_state, do_now);); + + /* + * LSST_CLEAR - we're not in any recovery state. We can get a stop or + * a stop and start which equates with a START. + */ + + if (cur_state == LSST_CLEAR) { + switch (do_now) { + case DO_STOP: + next_state = LSST_WAIT_START; + break; + + case DO_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + case DO_FINISH: /* invalid */ + case DO_FINISH_STOP: /* invalid */ + case DO_FINISH_START: /* invalid */ + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_WAIT_START - we're not running because of getting a stop or + * failing a start. We wait in this state for another stop/start or + * just the next start to begin another reconfig attempt. + */ + + if (cur_state == LSST_WAIT_START) { + switch (do_now) { + case DO_STOP: + break; + + case DO_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + case DO_FINISH: /* invalid */ + case DO_FINISH_STOP: /* invalid */ + case DO_FINISH_START: /* invalid */ + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_RECONFIG_DONE - we entered this state after successfully + * completing ls_reconfig and calling kcl_start_done. We expect to get + * a finish if everything goes ok. A finish could be followed by stop + * or stop/start before we get here to check it. 
Or a finish may never + * happen, only stop or stop/start. + */ + + if (cur_state == LSST_RECONFIG_DONE) { + switch (do_now) { + case DO_FINISH: + rebuild_freemem(ls); + + clear_finished_nodes(ls, finish_event); + next_state = LSST_CLEAR; + + error = enable_locking(ls, finish_event); + if (error) + break; + + error = process_requestqueue(ls); + if (error) + break; + + error = resend_cluster_requests(ls); + if (error) + break; + + restbl_grant_after_purge(ls); + + log_all(ls, "recover event %u finished", finish_event); + break; + + case DO_STOP: + next_state = LSST_WAIT_START; + break; + + case DO_FINISH_STOP: + clear_finished_nodes(ls, finish_event); + next_state = LSST_WAIT_START; + break; + + case DO_FINISH_START: + clear_finished_nodes(ls, finish_event); + /* fall into DO_START */ + + case DO_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_INIT - state after ls is created and before it has been + * started. A start operation will cause the ls to be started for the + * first time. A failed start will cause to just wait in INIT for + * another stop/start. + */ + + if (cur_state == LSST_INIT) { + switch (do_now) { + case DO_START: + error = ls_first_start(ls, rv); + if (!error) + next_state = LSST_INIT_DONE; + break; + + case DO_STOP: + break; + + case DO_FINISH: /* invalid */ + case DO_FINISH_STOP: /* invalid */ + case DO_FINISH_START: /* invalid */ + default: + DLM_ASSERT(0,); + } + goto out; + } + + /* + * LSST_INIT_DONE - after the first start operation is completed + * successfully and kcl_start_done() called. If there are no errors, a + * finish will arrive next and we'll move to LSST_CLEAR. + */ + + if (cur_state == LSST_INIT_DONE) { + switch (do_now) { + case DO_STOP: + case DO_FINISH_STOP: + next_state = LSST_WAIT_START; + break; + + case DO_START: + case DO_FINISH_START: + error = ls_reconfig(ls, rv); + if (error) + next_state = LSST_WAIT_START; + else + next_state = LSST_RECONFIG_DONE; + break; + + case DO_FINISH: + next_state = LSST_CLEAR; + + enable_locking(ls, finish_event); + + process_requestqueue(ls); + + log_all(ls, "recover event %u finished", finish_event); + break; + + default: + DLM_ASSERT(0,); + } + goto out; + } + + out: + if (next_state) + ls->ls_state = next_state; + + if (rv) { + kfree(rv->nodeids); + kfree(rv); + } +} + +int dlm_recoverd(void *arg) +{ + struct dlm_ls *ls = arg; + + hold_lockspace(ls); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(LSFL_WORK, &ls->ls_flags)) + schedule(); + set_current_state(TASK_RUNNING); + + if (test_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags)) { + down(&ls->ls_recoverd_lock); + ls->ls_recoverd_task = NULL; + up(&ls->ls_recoverd_lock); + goto out; + } + + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) { + do_ls_recovery(ls); + + down(&ls->ls_recoverd_lock); + if (ls->ls_state == LSST_CLEAR && + !test_bit(LSFL_WORK, &ls->ls_flags)) { + ls->ls_recoverd_task = NULL; + up(&ls->ls_recoverd_lock); + goto out; + } + up(&ls->ls_recoverd_lock); + } + } + + out: + put_lockspace(ls); + return 0; +} + +void dlm_recoverd_kick(struct dlm_ls *ls) +{ + struct task_struct *p; + + down(&ls->ls_recoverd_lock); + set_bit(LSFL_WORK, &ls->ls_flags); + + if (!ls->ls_recoverd_task) { + p = kthread_run(dlm_recoverd, (void *) ls, 0, "dlm_recoverd"); + if (IS_ERR(p)) { + log_error(ls, "can't start dlm_recoverd %ld", + PTR_ERR(p)); + goto out; + } + ls->ls_recoverd_task = p; + } 
else + wake_up_process(ls->ls_recoverd_task); + out: + up(&ls->ls_recoverd_lock); +} + +void dlm_recoverd_stop(struct dlm_ls *ls) +{ + set_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags); + + for (;;) { + down(&ls->ls_recoverd_lock); + if (!ls->ls_recoverd_task) { + up(&ls->ls_recoverd_lock); + break; + } + wake_up_process(ls->ls_recoverd_task); + up(&ls->ls_recoverd_lock); + msleep(100); + } +} + diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/recoverd.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,21 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVERD_DOT_H__ +#define __RECOVERD_DOT_H__ + +int dlm_recoverd(void *arg); +void dlm_recoverd_kick(struct dlm_ls *ls); +void dlm_recoverd_stop(struct dlm_ls *ls); + +#endif /* __RECOVERD_DOT_H__ */ diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rsb.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,329 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "locking.h" +#include "memory.h" +#include "lockqueue.h" +#include "nodes.h" +#include "dir.h" +#include "util.h" +#include "rsb.h" + +static struct dlm_rsb *search_hashchain(struct list_head *head, + struct dlm_rsb *parent, + char *name, int namelen) +{ + struct dlm_rsb *r; + + list_for_each_entry(r, head, res_hashchain) { + if ((parent == r->res_parent) && (namelen == r->res_length) && + (memcmp(name, r->res_name, namelen) == 0)) { + return r; + } + } + + return NULL; +} + +/* + * A way to arbitrarily hold onto an rsb which we already have a reference to + * to make sure it doesn't go away. Opposite of release_rsb(). + */ + +void hold_rsb(struct dlm_rsb *r) +{ + atomic_inc(&r->res_ref); +} + +/* + * release_rsb() - Decrement reference count on rsb struct. Free the rsb + * struct when there are zero references. Every lkb for the rsb adds a + * reference. When ref is zero there can be no more lkb's for the rsb, on the + * queue's or anywhere else. 
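+ *
+ * For example, a caller that obtains an rsb reference through find_rsb()
+ * or hold_rsb() is expected to drop it again with release_rsb() when done.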
+ */ + +static void _release_rsb(struct dlm_rsb *r, int locked) +{ + struct dlm_ls *ls = r->res_ls; + uint32_t nodeid; + int removed = FALSE; + + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock); + if (atomic_dec_and_test(&r->res_ref)) { + DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r);); + DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r);); + DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r);); + removed = TRUE; + list_del(&r->res_hashchain); + } + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock); + + if (!removed) + return; + + if (!locked) + down_write(&ls->ls_root_lock); + if (r->res_parent) + list_del(&r->res_subreslist); + else + list_del(&r->res_rootlist); + if (!locked) + up_write(&ls->ls_root_lock); + + if (r->res_parent || !test_bit(RESFL_MASTER, &r->res_flags)) + goto out; + + nodeid = get_directory_nodeid(r); + + if (nodeid != our_nodeid()) + remote_remove_direntry(ls, nodeid, r->res_name, r->res_length); + else + dlm_dir_remove(ls, nodeid, r->res_name, r->res_length); + out: + if (r->res_lvbptr) + free_lvb(r->res_lvbptr); + + free_rsb(r); +} + +void release_rsb(struct dlm_rsb *r) +{ + _release_rsb(r, 0); +} + +void release_rsb_locked(struct dlm_rsb *r) +{ + _release_rsb(r, 1); +} + +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_rsb *r = lkb->lkb_resource; + return r; +} + +/* + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist. + * If the rsb exists, its ref count is incremented by this function. If it + * doesn't exist, it's created with a ref count of one. + */ + +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, int len, + int flags, struct dlm_rsb **rp) +{ + uint32_t bucket; + struct dlm_rsb *r, *tmp; + int error = -ENOMEM; + + DLM_ASSERT(len <= DLM_RESNAME_MAXLEN,); + + bucket = dlm_hash(name, len); + bucket &= (ls->ls_rsbtbl_size - 1); + + read_lock(&ls->ls_rsbtbl[bucket].lock); + r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len); + if (r) { + if (r->res_nodeid != 0 && (flags & MASTER)) + r = NULL; + else + atomic_inc(&r->res_ref); + } + read_unlock(&ls->ls_rsbtbl[bucket].lock); + + if (r) + goto out_set; + + /* Always create sublocks */ + if (!(flags & CREATE) && !parent) { + *rp = NULL; + goto out; + } + + r = allocate_rsb(ls, len); + if (!r) + goto fail; + + INIT_LIST_HEAD(&r->res_subreslist); + INIT_LIST_HEAD(&r->res_grantqueue); + INIT_LIST_HEAD(&r->res_convertqueue); + INIT_LIST_HEAD(&r->res_waitqueue); + + memcpy(r->res_name, name, len); + r->res_length = len; + r->res_ls = ls; + init_rwsem(&r->res_lock); + atomic_set(&r->res_ref, 1); + r->res_bucket = bucket; + + if (parent) { + r->res_parent = parent; + r->res_depth = parent->res_depth + 1; + r->res_root = parent->res_root; + r->res_nodeid = parent->res_nodeid; + } else { + r->res_parent = NULL; + r->res_depth = 1; + r->res_root = r; + r->res_nodeid = -1; + } + + write_lock(&ls->ls_rsbtbl[bucket].lock); + tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len); + if (tmp) { + atomic_inc(&tmp->res_ref); + write_unlock(&ls->ls_rsbtbl[bucket].lock); + free_rsb(r); + r = tmp; + } else { + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); + write_unlock(&ls->ls_rsbtbl[bucket].lock); + + down_write(&ls->ls_root_lock); + if (parent) + list_add_tail(&r->res_subreslist, + &r->res_root->res_subreslist); + else + list_add(&r->res_rootlist, &ls->ls_rootres); + up_write(&ls->ls_root_lock); + } + + out_set: + *rp = r; + + out: + error = 0; + + fail: + return error; +} + +/* 
+ * Add a LKB to a resource's grant/convert/wait queue. in order + */ + +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode) +{ + struct dlm_lkb *lkb = NULL; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_rqmode < mode) + break; + } + + if (!lkb) { + /* No entries in the queue, we are alone */ + list_add_tail(new, head); + } else { + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); + } +} + +/* + * The rsb res_lock must be held in write when this function is called. + */ + +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + DLM_ASSERT(!lkb->lkb_status, + print_lkb(lkb); + print_rsb(r);); + + lkb->lkb_status = type; + + switch (type) { + case GDLM_LKSTS_WAITING: + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); + else + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); + break; + + case GDLM_LKSTS_GRANTED: + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, + lkb->lkb_grmode); + break; + + case GDLM_LKSTS_CONVERT: + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); + else + list_add_tail(&lkb->lkb_statequeue, + &r->res_convertqueue); + break; + + default: + DLM_ASSERT(0,); + } +} + +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + down_write(&r->res_lock); + lkb_enqueue(r, lkb, type); + up_write(&r->res_lock); +} + +/* + * The rsb res_lock must be held in write when this function is called. + */ + +int lkb_dequeue(struct dlm_lkb *lkb) +{ + int status = lkb->lkb_status; + + if (!status) + goto out; + + lkb->lkb_status = 0; + list_del(&lkb->lkb_statequeue); + + out: + return status; +} + +int res_lkb_dequeue(struct dlm_lkb *lkb) +{ + int status; + + down_write(&lkb->lkb_resource->res_lock); + status = lkb_dequeue(lkb); + up_write(&lkb->lkb_resource->res_lock); + + return status; +} + +/* + * The rsb res_lock must be held in write when this function is called. + */ + +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + int status; + + status = lkb_dequeue(lkb); + lkb_enqueue(r, lkb, type); + + return status; +} + +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type) +{ + int status; + + down_write(&r->res_lock); + status = lkb_swqueue(r, lkb, type); + up_write(&r->res_lock); + + return status; +} diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/rsb.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RSB_DOT_H__ +#define __RSB_DOT_H__ + +#define CREATE 1 +#define MASTER 2 + +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode); +void release_rsb(struct dlm_rsb *r); +void release_rsb_locked(struct dlm_rsb *r); +void hold_rsb(struct dlm_rsb *r); +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, + int namelen, int flags, struct dlm_rsb **rp); +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb); +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); +int lkb_dequeue(struct dlm_lkb *lkb); +int res_lkb_dequeue(struct dlm_lkb *lkb); +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type); + +#endif /* __RSB_DOT_H__ */ diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/util.c 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,183 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" + +static const uint32_t crc_32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, + 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, + 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, + 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, + 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, + 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, + 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, + 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, + 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 
0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, + 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, + 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, + 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, + 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, + 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, + 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, + 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, + 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, + 0x5a05df1b, 0x2d02ef8d +}; + +/** + * dlm_hash - hash an array of data + * @data: the data to be hashed + * @len: the length of data to be hashed + * + * Copied from GFS. + * + * Take some data and convert it to a 32-bit hash. + * + * The hash function is a 32-bit CRC of the data. The algorithm uses + * the crc_32_tab table above. + * + * This may not be the fastest hash function, but it does a fair bit better + * at providing uniform results than the others I've looked at. That's + * really important for efficient directories. 
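+ *
+ * For example, find_rsb() uses the result to pick a resource table bucket:
+ *
+ *   bucket = dlm_hash(name, len) & (ls->ls_rsbtbl_size - 1);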
+ * + * Returns: the hash + */ + +uint32_t dlm_hash(const char *data, int len) +{ + uint32_t hash = 0xFFFFFFFF; + + for (; len--; data++) + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8); + + hash = ~hash; + + return hash; +} + +void print_lkb(struct dlm_lkb *lkb) +{ + printk("dlm: lkb\n" + "id %x\n" + "remid %x\n" + "flags %x\n" + "status %x\n" + "rqmode %d\n" + "grmode %d\n" + "nodeid %d\n" + "lqstate %x\n" + "lqflags %x\n", + lkb->lkb_id, + lkb->lkb_remid, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_rqmode, + lkb->lkb_grmode, + lkb->lkb_nodeid, + lkb->lkb_lockqueue_state, + lkb->lkb_lockqueue_flags); +} + +void print_rsb(struct dlm_rsb *r) +{ + printk("dlm: rsb\n" + "name \"%s\"\n" + "nodeid %d\n" + "flags %lx\n" + "ref %u\n", + r->res_name, + r->res_nodeid, + r->res_flags, + atomic_read(&r->res_ref)); +} + +void print_request(struct dlm_request *req) +{ + printk("dlm: request\n" + "rh_cmd %u\n" + "rh_lkid %x\n" + "remlkid %x\n" + "flags %x\n" + "status %u\n" + "rqmode %u\n", + req->rr_header.rh_cmd, + req->rr_header.rh_lkid, + req->rr_remlkid, + req->rr_flags, + req->rr_status, + req->rr_rqmode); +} + +void print_reply(struct dlm_reply *rp) +{ + printk("dlm: reply\n" + "rh_cmd %u\n" + "rh_lkid %x\n" + "lockstate %u\n" + "nodeid %u\n" + "status %u\n" + "lkid %x\n", + rp->rl_header.rh_cmd, + rp->rl_header.rh_lkid, + rp->rl_lockstate, + rp->rl_nodeid, + rp->rl_status, + rp->rl_lkid); +} + diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/dlm/util.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,24 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +uint32_t dlm_hash(const char *data, int len); + +void print_lkb(struct dlm_lkb *lkb); +void print_rsb(struct dlm_rsb *r); +void print_request(struct dlm_request *req); +void print_reply(struct dlm_reply *rp); + +#endif diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/dlm.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,416 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_DOT_H__ +#define __DLM_DOT_H__ + +/* + * Interface to DLM - routines and structures to use DLM lockspaces. + */ + +/* + * Lock Modes + */ + +#define DLM_LOCK_IV (-1) /* invalid */ +#define DLM_LOCK_NL (0) /* null */ +#define DLM_LOCK_CR (1) /* concurrent read */ +#define DLM_LOCK_CW (2) /* concurrent write */ +#define DLM_LOCK_PR (3) /* protected read */ +#define DLM_LOCK_PW (4) /* protected write */ +#define DLM_LOCK_EX (5) /* exclusive */ + +/* + * Maximum size in bytes of a dlm_lock name + */ + +#define DLM_RESNAME_MAXLEN (64) + +/* + * Size in bytes of Lock Value Block + */ + +#define DLM_LVB_LEN (32) + +/* + * Flags to dlm_new_lockspace + * + * DLM_LSF_NOTIMERS + * + * Do not subject locks in this lockspace to time-outs. + */ + +#define DLM_LSF_NOTIMERS (1) + +/* + * Flags to dlm_lock + * + * DLM_LKF_NOQUEUE + * + * Do not queue the lock request on the wait queue if it cannot be granted + * immediately. If the lock cannot be granted because of this flag, DLM will + * either return -EAGAIN from the dlm_lock call or will return 0 from + * dlm_lock and -EAGAIN in the lock status block when the AST is executed. + * + * DLM_LKF_CONVERT + * + * Indicates a lock conversion request. For conversions the name and namelen + * are ignored and the lock ID in the LKSB is used to identify the lock. + * + * DLM_LKF_VALBLK + * + * Requests DLM to return the current contents of the lock value block in the + * lock status block. When this flag is set in a lock conversion from PW or EX + * modes, DLM assigns the value specified in the lock status block to the lock + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array + * containing application-specific information. + * + * DLM_LKF_QUECVT + * + * Force a conversion request to be queued, even if it is compatible with + * the granted modes of other locks on the same resource. + * + * DLM_LKF_CANCEL + * + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to + * previously granted mode. + * + * DLM_LKF_IVVALBLK + * + * Invalidate/clear the lock value block. + * + * DLM_LKF_CONVDEADLK + * + * The granted mode of a lock being converted (from a non-NL mode) can be + * changed to NL in the process of acquiring the requested mode to avoid + * conversion deadlock. + * + * DLM_LKF_PERSISTENT + * + * Only relevant to locks originating in userspace. Signals to the ioctl.c code + * that this lock should not be unlocked when the process exits. + * + * DLM_LKF_NODLKWT + * + * This lock is not to be checked for conversion deadlocks. + * + * DLM_LKF_NODLCKBLK + * + * not yet implemented + * + * DLM_LKF_EXPEDITE + * + * Used only with new requests for NL mode locks. Tells the lock manager + * to grant the lock, ignoring other locks in convert and wait queues. + * + * DLM_LKF_NOQUEUEBAST + * + * Send blocking AST's before returning -EAGAIN to the caller. It is only + * used along with the NOQUEUE flag. Blocking AST's are not sent for failed + * NOQUEUE requests otherwise. + * + * DLM_LKF_HEADQUE + * + * Add a lock to the head of the convert or wait queue rather than the tail. + * + * DLM_LKF_NOORDER + * + * Disregard the standard grant order rules and grant a lock as soon as it + * is compatible with other granted locks. 
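+ *
+ * Flags may be combined; for instance, a conversion of an existing lock that
+ * also needs the lock value block and must not wait could pass
+ * DLM_LKF_CONVERT | DLM_LKF_VALBLK | DLM_LKF_NOQUEUE to dlm_lock().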
+ */ + +#define DLM_LKF_NOQUEUE (0x00000001) +#define DLM_LKF_CANCEL (0x00000002) +#define DLM_LKF_CONVERT (0x00000004) +#define DLM_LKF_VALBLK (0x00000008) +#define DLM_LKF_QUECVT (0x00000010) +#define DLM_LKF_IVVALBLK (0x00000020) +#define DLM_LKF_CONVDEADLK (0x00000040) +#define DLM_LKF_PERSISTENT (0x00000080) +#define DLM_LKF_NODLCKWT (0x00000100) +#define DLM_LKF_NODLCKBLK (0x00000200) +#define DLM_LKF_EXPEDITE (0x00000400) +#define DLM_LKF_NOQUEUEBAST (0x00000800) +#define DLM_LKF_HEADQUE (0x00001000) +#define DLM_LKF_NOORDER (0x00002000) +#define DLM_LKF_ORPHAN (0x00004000) + +/* + * Some return codes that are not in errno.h + */ + +#define DLM_ECANCEL (0x10001) +#define DLM_EUNLOCK (0x10002) + +typedef void dlm_lockspace_t; + +/* + * Lock range structure + */ + +struct dlm_range { + uint64_t ra_start; + uint64_t ra_end; +}; + +/* + * Lock status block + * + * Use this structure to specify the contents of the lock value block. For a + * conversion request, this structure is used to specify the lock ID of the + * lock. DLM writes the status of the lock request and the lock ID assigned + * to the request in the lock status block. + * + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests. + * It is available when dlm_lock returns. + * + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules + * shown for the DLM_LKF_VALBLK flag. + * + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock, + * it was first demoted to NL to avoid conversion deadlock. + * + * sb_status: the returned status of the lock request set prior to AST + * execution. Possible return values: + * + * 0 if lock request was successful + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE + * -ENOMEM if there is no memory to process request + * -EINVAL if there are invalid parameters + * -DLM_EUNLOCK if unlock request was successful + * -DLM_ECANCEL ? + */ + +#define DLM_SBF_DEMOTED (0x01) + +struct dlm_lksb { + int sb_status; + uint32_t sb_lkid; + char sb_flags; + char * sb_lvbptr; +}; + +/* + * These defines are the bits that make up the query code. + */ + +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */ +#define DLM_LOCK_THIS 0x0007 +#define DLM_QUERY_MODE_MASK 0x0007 + +/* Bits 3, 4, 5 bitmap of queue(s) to query */ +#define DLM_QUERY_QUEUE_WAIT 0x0008 +#define DLM_QUERY_QUEUE_CONVERT 0x0010 +#define DLM_QUERY_QUEUE_GRANT 0x0020 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */ +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */ + +/* Bit 6, Return only the information that can be established without a network + * round-trip. The caller must be aware of the implications of this. Useful for + * just getting the master node id or resource name. 
*/ +#define DLM_QUERY_LOCAL 0x0040 + +/* Bits 8 up, query type */ +#define DLM_QUERY_LOCKS_HIGHER 0x0100 +#define DLM_QUERY_LOCKS_LOWER 0x0200 +#define DLM_QUERY_LOCKS_EQUAL 0x0300 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500 +#define DLM_QUERY_LOCKS_ALL 0x0600 +#define DLM_QUERY_LOCKS_ORPHAN 0x0700 +#define DLM_QUERY_MASK 0x0F00 + +/* GRMODE is the default for mode comparisons, + RQMODE might also be handy */ +#define DLM_QUERY_GRMODE 0x0000 +#define DLM_QUERY_RQMODE 0x1000 + +/* Structures passed into and out of the query */ + +struct dlm_lockinfo { + int lki_lkid; /* Lock ID on originating node */ + int lki_mstlkid; /* Lock ID on master node */ + int lki_parent; + int lki_node; /* Originating node (not master) */ + int lki_ownpid; /* Owner pid on originating node */ + uint8_t lki_state; /* Queue the lock is on */ + uint8_t lki_grmode; /* Granted mode */ + uint8_t lki_rqmode; /* Requested mode */ + struct dlm_range lki_grrange; /* Granted range, if applicable */ + struct dlm_range lki_rqrange; /* Requested range, if applicable */ +}; + +struct dlm_resinfo { + int rsi_length; + int rsi_grantcount; /* No. of nodes on grant queue */ + int rsi_convcount; /* No. of nodes on convert queue */ + int rsi_waitcount; /* No. of nodes on wait queue */ + int rsi_masternode; /* Master for this resource */ + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */ + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable + */ +}; + +struct dlm_queryinfo { + struct dlm_resinfo *gqi_resinfo; + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array + * of structs */ + int gqi_locksize; /* input */ + int gqi_lockcount; /* output */ +}; + +#ifdef __KERNEL__ +/* + * dlm_init + * + * Starts and initializes DLM threads and structures. Creation of the first + * lockspace will call this if it has not been called already. + * + * Returns: 0 if successful, -EXXX on error + */ + +int dlm_init(void); + +/* + * dlm_release + * + * Stops DLM threads. + * + * Returns: 0 if successful, -EXXX on error + */ + +int dlm_release(void); + +/* + * dlm_new_lockspace + * + * Starts a lockspace with the given name. If the named lockspace exists in + * the cluster, the calling node joins it. + */ + +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace, + int flags); + +/* + * dlm_release_lockspace + * + * Stop a lockspace. + */ + +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); + +/* + * dlm_lock + * + * Make an asyncronous request to acquire or convert a lock on a named + * resource. 
+ * + * lockspace: context for the request + * mode: the requested mode of the lock (DLM_LOCK_) + * lksb: lock status block for input and async return values + * flags: input flags (DLM_LKF_) + * name: name of the resource to lock, can be binary + * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN) + * parent: the lock ID of a parent lock or 0 if none + * lockast: function DLM executes when it completes processing the request + * astarg: argument passed to lockast and bast functions + * bast: function DLM executes when this lock later blocks another request + * + * Returns: + * 0 if request is successfully queued for processing + * -EINVAL if any input parameters are invalid + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE + * -ENOMEM if there is no memory to process request + * -ENOTCONN if there is a communication error + * + * If the call to dlm_lock returns an error then the operation has failed and + * the AST routine will not be called. If dlm_lock returns 0 it is still + * possible that the lock operation will fail. The AST routine will be called + * when the locking is complete and the status is returned in the lksb. + * + * If the AST routines or parameter are passed to a conversion operation then + * they will overwrite those values that were passed to a previous dlm_lock + * call. + * + * AST routines should not block (at least not for long), but may make + * any locking calls they please. + */ + +int dlm_lock(dlm_lockspace_t *lockspace, + uint32_t mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent, + void (*lockast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode), + struct dlm_range *range); + +/* + * dlm_unlock + * + * Asynchronously release a lock on a resource. The AST routine is called + * when the resource is successfully unlocked. + * + * lockspace: context for the request + * lkid: the lock ID as returned in the lksb + * flags: input flags (DLM_LKF_) + * lksb: if NULL the lksb parameter passed to last lock request is used + * astarg: the arg used with the completion ast for the unlock + * + * Returns: + * 0 if request is successfully queued for processing + * -EINVAL if any input parameters are invalid + * -ENOTEMPTY if the lock still has sublocks + * -EBUSY if the lock is waiting for a remote lock operation + * -ENOTCONN if there is a communication error + */ + +extern int dlm_unlock(dlm_lockspace_t *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg); + +/* Query interface + * + * Query the other holders of a resource, given a known lock ID + * + * lockspace: context for the request + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock + * on the resource. sb_status will contain the status + * of the request on completion. + * query: query bitmap see DLM_QUERY_* above + * qinfo: pointer to dlm_queryinfo structure + * ast_routine: AST routine to call on completion + * artarg: argument to AST routine. It is "traditional" + * to put the qinfo pointer into lksb->sb_lvbptr + * and pass the lksb in here. 
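+ *
+ * A query code combines a lock mode (or DLM_LOCK_THIS), a queue bitmap and a
+ * query type from the DLM_QUERY_* values above, e.g.
+ *
+ *   DLM_LOCK_THIS | DLM_QUERY_QUEUE_GRANTED | DLM_QUERY_LOCKS_BLOCKING
+ *
+ * would ask for the granted or converting locks that block the lock given
+ * in sb_lkid.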
+ */ +extern int dlm_query(dlm_lockspace_t *lockspace, + struct dlm_lksb *lksb, + int query, + struct dlm_queryinfo *qinfo, + void (ast_routine(void *)), + void *astarg); + + +void dlm_debug_dump(void); +void dlm_locks_dump(void); + +#endif /* __KERNEL__ */ + +#endif /* __DLM_DOT_H__ */ diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/dlm_device.h 2004-11-03 11:31:56.000000000 +0800 @@ -0,0 +1,64 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* This is the device interface for dlm, most users will use a library + * interface. + */ + +/* Version of the device interface */ +#define DLM_DEVICE_VERSION_MAJOR 2 +#define DLM_DEVICE_VERSION_MINOR 0 +#define DLM_DEVICE_VERSION_PATCH 0 + +/* struct passed to the lock write */ +struct dlm_lock_params { + uint32_t version[3]; + uint8_t cmd; + uint8_t mode; + uint16_t flags; + uint32_t lkid; + uint32_t parent; + struct dlm_range range; + uint8_t namelen; + void *castparam; + void *castaddr; + void *bastparam; + void *bastaddr; + struct dlm_lksb *lksb; + char name[1]; +}; + + +/* struct read from the "device" fd, + consists mainly of userspace pointers for the library to use */ +struct dlm_lock_result { + uint8_t cmd; + void *astparam; + void (*astaddr)(void *astparam); + struct dlm_lksb *user_lksb; + struct dlm_lksb lksb; /* But this has real data in it */ + uint8_t bast_mode; /* Not yet used */ +}; + +/* commands passed to the device */ +#define DLM_USER_LOCK 1 +#define DLM_USER_UNLOCK 2 +#define DLM_USER_QUERY 3 + +/* Arbitrary length restriction */ +#define MAX_LS_NAME_LEN 64 + +/* ioctls on the device */ +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *) +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *) +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
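
For reference, here is a minimal sketch of how a kernel caller might drive the in-kernel interface declared in include/cluster/dlm.h above. It is not part of the patch: the lockspace name "example", the resource name "res1", the helper names example_ast() and example_lock(), and the use of a completion to wait for the AST are illustrative assumptions only.

/* Illustrative sketch only -- not part of the patch. */
#include <linux/string.h>
#include <linux/completion.h>
#include <cluster/dlm.h>

static struct dlm_lksb lksb;
static DECLARE_COMPLETION(ast_done);

/* Completion AST: called by the DLM when a request finishes; the final
   status of the operation is found in lksb.sb_status. */
static void example_ast(void *astarg)
{
	complete(&ast_done);
}

static int example_lock(void)
{
	dlm_lockspace_t *ls;
	int error;

	/* Join (or create) the lockspace named "example". */
	error = dlm_new_lockspace("example", strlen("example"), &ls, 0);
	if (error)
		return error;

	/* Request an EX lock on resource "res1": no parent lock, no
	   blocking AST and no range in this sketch. */
	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, DLM_LKF_NOQUEUE,
			 "res1", strlen("res1"), 0,
			 example_ast, &lksb, NULL, NULL);
	if (error)
		goto out;

	wait_for_completion(&ast_done);
	if (lksb.sb_status) {	/* e.g. -EAGAIN because of DLM_LKF_NOQUEUE */
		error = lksb.sb_status;
		goto out;
	}

	/* ... the resource is now locked exclusively ... */

	/* Release the lock; the same completion AST reports the result
	   (-DLM_EUNLOCK is expected in sb_status on success). */
	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);
	if (!error)
		wait_for_completion(&ast_done);
 out:
	dlm_release_lockspace(ls, 0);
	return error;
}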