1 # Add DLM to the build system
2 diff -urN -p linux-2.6.8.1/cluster/Kconfig linux/cluster/Kconfig
3 --- linux-2.6.8.1/cluster/Kconfig 2004-08-24 13:23:09.000000000 +0800
4 +++ linux/cluster/Kconfig 2004-08-24 13:23:32.000000000 +0800
5 @@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
10 + tristate "Distributed Lock Manager"
13 + A fully distributed lock manager, providing cluster-wide locking services
14 + and protected lock namespaces for kernel and userland applications.
16 +config CLUSTER_DLM_PROCLOCKS
17 + boolean "/proc/locks support for DLM"
18 + depends on CLUSTER_DLM
21 + If this option is enabled a file will appear in /proc/cluster/dlm_locks.
22 + Write into this "file" the name of a lockspace known to the DLM and then
23 + read out a list of all the resources and locks in that lockspace that are
24 + known to the local node. Note that because the DLM is distributed this may
25 + not be the full lock picture.
28 diff -urN -p linux-2.6.8.1/cluster/Makefile linux/cluster/Makefile
29 --- linux-2.6.8.1/cluster/Makefile 2004-08-24 13:23:09.000000000 +0800
30 +++ linux/cluster/Makefile 2004-08-24 13:23:32.000000000 +0800
34 obj-$(CONFIG_CLUSTER) += cman/
35 +obj-$(CONFIG_CLUSTER_DLM) += dlm/
36 diff -urN -p linux-2.6.8.1/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37 --- linux-2.6.8.1/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38 +++ linux/cluster/dlm/Makefile 2004-08-24 13:23:32.000000000 +0800
62 +obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63 diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64 --- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65 +++ linux-patched/cluster/dlm/ast.c 2004-11-03 11:31:56.000000000 +0800
67 +/******************************************************************************
68 +*******************************************************************************
70 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
73 +** This copyrighted material is made available to anyone wishing to use,
74 +** modify, copy, or redistribute it subject to the terms and conditions
75 +** of the GNU General Public License v.2.
77 +*******************************************************************************
78 +******************************************************************************/
81 + * This delivers ASTs and checks for dead remote requests and deadlocks.
84 +#include <linux/timer.h>
86 +#include "dlm_internal.h"
88 +#include "lockqueue.h"
92 +#include "lowcomms.h"
93 +#include "midcomms.h"
99 +/* Wake up flags for astd */
101 +#define WAKE_TIMER 2
103 +static struct list_head ast_queue;
104 +static struct semaphore ast_queue_lock;
105 +static wait_queue_head_t astd_waitchan;
106 +struct task_struct * astd_task;
107 +static unsigned long astd_wakeflags;
109 +static struct list_head _deadlockqueue;
110 +static struct semaphore _deadlockqueue_lock;
111 +static struct list_head _lockqueue;
112 +static struct semaphore _lockqueue_lock;
113 +static struct timer_list _lockqueue_timer;
115 +void add_to_lockqueue(struct dlm_lkb *lkb)
117 + /* Time stamp the entry so we know if it's been waiting too long */
118 + lkb->lkb_lockqueue_time = jiffies;
120 + down(&_lockqueue_lock);
121 + list_add(&lkb->lkb_lockqueue, &_lockqueue);
122 + up(&_lockqueue_lock);
125 +void remove_from_lockqueue(struct dlm_lkb *lkb)
127 + down(&_lockqueue_lock);
128 + list_del(&lkb->lkb_lockqueue);
129 + up(&_lockqueue_lock);
131 +#ifdef CONFIG_DLM_STATS
132 + dlm_stats.lockqueue_time[lkb->lkb_lockqueue_state] += (jiffies - lkb->lkb_lockqueue_time);
133 + dlm_stats.lockqueue_locks[lkb->lkb_lockqueue_state]++;
135 + lkb->lkb_lockqueue_state = 0;
138 +void add_to_deadlockqueue(struct dlm_lkb *lkb)
140 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
142 + lkb->lkb_duetime = jiffies;
143 + down(&_deadlockqueue_lock);
144 + list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
145 + up(&_deadlockqueue_lock);
148 +void remove_from_deadlockqueue(struct dlm_lkb *lkb)
150 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
153 + down(&_deadlockqueue_lock);
154 + list_del(&lkb->lkb_deadlockq);
155 + up(&_deadlockqueue_lock);
157 + /* Invalidate the due time */
158 + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
162 + * Queue an AST for delivery, this will only deal with
163 + * kernel ASTs, usermode API will piggyback on top of this.
165 + * This can be called in either the user or DLM context.
166 + * ASTs are queued EVEN IF we are already running in dlm_astd
167 + * context as we don't know what other locks are held (eg we could
168 + * be being called from a lock operation that was called from
170 + * If the AST is to be queued remotely then a message is sent to
171 + * the target system via midcomms.
174 +void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode)
176 + struct dlm_request req;
178 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
180 + * Send a message to have an ast queued remotely. Note: we do
181 + * not send remote completion asts, they are handled as part of
182 + * remote lock granting.
184 + if (flags & AST_BAST) {
185 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
186 + req.rr_header.rh_length = sizeof(req);
187 + req.rr_header.rh_flags = 0;
188 + req.rr_header.rh_lkid = lkb->lkb_id;
189 + req.rr_header.rh_lockspace =
190 + lkb->lkb_resource->res_ls->ls_global_id;
191 + req.rr_status = lkb->lkb_retstatus;
192 + req.rr_remlkid = lkb->lkb_remid;
193 + req.rr_rqmode = rqmode;
195 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
196 + lkb->lkb_resource->res_ls->ls_allocation);
197 + } else if (lkb->lkb_retstatus == -EDEADLOCK) {
199 + * We only queue remote Completion ASTs here for error
200 + * completions that happen out of band.
201 + * DEADLOCK is one such.
203 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
204 + req.rr_header.rh_length = sizeof(req);
205 + req.rr_header.rh_flags = 0;
206 + req.rr_header.rh_lkid = lkb->lkb_id;
207 + req.rr_header.rh_lockspace =
208 + lkb->lkb_resource->res_ls->ls_global_id;
209 + req.rr_status = lkb->lkb_retstatus;
210 + req.rr_remlkid = lkb->lkb_remid;
211 + req.rr_rqmode = rqmode;
213 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
214 + lkb->lkb_resource->res_ls->ls_allocation);
218 + * Prepare info that will be returned in ast/bast.
221 + if (flags & AST_BAST) {
222 + lkb->lkb_bastmode = rqmode;
224 + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
225 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
226 + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
228 + lkb->lkb_lksb->sb_flags = 0;
231 + down(&ast_queue_lock);
232 + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
233 + list_add_tail(&lkb->lkb_astqueue, &ast_queue);
234 + lkb->lkb_astflags |= flags;
235 + up(&ast_queue_lock);
237 + /* It is the responsibility of the caller to call wake_astd()
238 + * after it has finished other locking operations that request
239 + * the ASTs to be delivered after */
244 + * Process any LKBs on the AST queue.
247 +static void process_asts(void)
250 + struct dlm_rsb *rsb;
251 + struct dlm_lkb *lkb;
252 + void (*cast) (long param);
253 + void (*bast) (long param, int mode);
258 + down(&ast_queue_lock);
259 + if (list_empty(&ast_queue)) {
260 + up(&ast_queue_lock);
264 + lkb = list_entry(ast_queue.next, struct dlm_lkb, lkb_astqueue);
265 + list_del(&lkb->lkb_astqueue);
266 + flags = lkb->lkb_astflags;
267 + lkb->lkb_astflags = 0;
268 + up(&ast_queue_lock);
270 + cast = lkb->lkb_astaddr;
271 + bast = lkb->lkb_bastaddr;
272 + astparam = lkb->lkb_astparam;
273 + rsb = lkb->lkb_resource;
276 + if (flags & AST_COMP) {
277 + if (flags & AST_DEL) {
278 + DLM_ASSERT(lkb->lkb_astflags == 0,);
280 + /* FIXME: we don't want to block asts for other
281 + lockspaces while one is being recovered */
283 + down_read(&ls->ls_in_recovery);
284 + release_lkb(ls, lkb);
286 + up_read(&ls->ls_in_recovery);
290 +#ifdef CONFIG_DLM_STATS
297 + if (flags & AST_BAST && !(flags & AST_DEL)) {
298 + int bmode = lkb->lkb_bastmode;
300 + /* gr or rq mode of the lock may have changed since the
301 + ast was queued making the delivery unnecessary */
303 + if (!bast || dlm_modes_compat(lkb->lkb_grmode, bmode))
306 + if (lkb->lkb_rqmode == DLM_LOCK_IV ||
307 + !dlm_modes_compat(lkb->lkb_rqmode, bmode)) {
308 + bast(astparam, bmode);
309 +#ifdef CONFIG_DLM_STATS
319 +void lockqueue_lkb_mark(struct dlm_ls *ls)
321 + struct dlm_lkb *lkb, *safe;
324 + log_all(ls, "mark waiting requests");
326 + down(&_lockqueue_lock);
328 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
330 + if (lkb->lkb_resource->res_ls != ls)
333 + log_debug(ls, "mark %x lq %d nodeid %d", lkb->lkb_id,
334 + lkb->lkb_lockqueue_state, lkb->lkb_nodeid);
337 + * These lkb's are new and the master is being looked up. Mark
338 + * the lkb request to be resent. Even if the destination node
339 + * for the request is still living and has our request, it will
340 + * purge all resdir requests in purge_requestqueue. If there's
341 + * a reply to the LOOKUP request in our requestqueue (the reply
342 + * arrived after ls_stop), it is invalid and will be discarded
343 + * in purge_requestqueue, too.
346 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
347 + DLM_ASSERT(lkb->lkb_nodeid == -1,
349 + print_rsb(lkb->lkb_resource););
351 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
357 + * We're waiting for an unlock reply and the master node from
358 + * whom we're expecting the reply has failed. If there's a
359 + * reply in the requestqueue do nothing and process it later in
360 + * process_requestqueue. If there's no reply, don't rebuild
361 + * the lkb on a new master, but just assume we've gotten an
362 + * unlock completion reply from the prev master (this also
363 + * means not resending the unlock request). If the unlock is
364 + * for the last lkb on the rsb, the rsb has nodeid of -1 and
365 + * the rsb won't be rebuilt on the new master either.
367 + * If we're waiting for an unlock reply and the master node is
368 + * still alive, we should either have a reply in the
369 + * requestqueue from the master already, or we should get one
370 + * from the master once recovery is complete. There is no
371 + * rebuilding of the rsb/lkb in this case and no resending of
375 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) {
376 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
377 + if (reply_in_requestqueue(ls, lkb->lkb_id)) {
378 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
379 + log_debug(ls, "mark %x unlock have rep",
382 + /* assume we got reply fr old master */
383 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
384 + lkb->lkb_flags |= GDLM_LKFLG_UNLOCKDONE;
385 + log_debug(ls, "mark %x unlock no rep",
394 + * These lkb's have an outstanding request to a bygone node.
395 + * The request will be redirected to the new master node in
396 + * resend_cluster_requests(). Don't mark the request for
397 + * resending if there's a reply for it saved in the
401 + if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
402 + !reply_in_requestqueue(ls, lkb->lkb_id)) {
404 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
407 + * Don't rebuild this lkb on a new rsb in
408 + * rebuild_rsbs_send().
411 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) {
412 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING,
414 + print_rsb(lkb->lkb_resource););
415 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
419 + * This flag indicates to the new master that his lkb
420 + * is in the midst of a convert request and should be
421 + * placed on the granted queue rather than the convert
422 + * queue. We will resend this convert request to the
426 + else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) {
427 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,
429 + print_rsb(lkb->lkb_resource););
430 + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
436 + up(&_lockqueue_lock);
438 + log_all(ls, "marked %d requests", count);
441 +int resend_cluster_requests(struct dlm_ls *ls)
443 + struct dlm_lkb *lkb, *safe;
445 + int error = 0, state, count = 0;
447 + log_all(ls, "resend marked requests");
449 + down(&_lockqueue_lock);
451 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
453 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
454 + log_debug(ls, "resend_cluster_requests: aborted");
459 + r = lkb->lkb_resource;
461 + if (r->res_ls != ls)
464 + log_debug(ls, "resend %x lq %d flg %x node %d/%d \"%s\"",
465 + lkb->lkb_id, lkb->lkb_lockqueue_state, lkb->lkb_flags,
466 + lkb->lkb_nodeid, r->res_nodeid, r->res_name);
468 + if (lkb->lkb_flags & GDLM_LKFLG_UNLOCKDONE) {
469 + log_debug(ls, "unlock done %x", lkb->lkb_id);
470 + list_del(&lkb->lkb_lockqueue);
471 + res_lkb_dequeue(lkb);
472 + lkb->lkb_retstatus = -DLM_EUNLOCK;
473 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
479 + * Resend/process the lockqueue lkb's (in-progress requests)
480 + * that were flagged at the start of recovery in
481 + * lockqueue_lkb_mark().
484 + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
485 + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
486 + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
487 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
489 + if (lkb->lkb_nodeid == -1) {
491 + * Send lookup to new resdir node.
493 + lkb->lkb_lockqueue_time = jiffies;
494 + send_cluster_request(lkb,
495 + lkb->lkb_lockqueue_state);
498 + else if (lkb->lkb_nodeid != 0) {
500 + * There's a new RSB master (that's not us.)
502 + lkb->lkb_lockqueue_time = jiffies;
503 + send_cluster_request(lkb,
504 + lkb->lkb_lockqueue_state);
509 + * We are the new RSB master for this lkb
512 + state = lkb->lkb_lockqueue_state;
513 + lkb->lkb_lockqueue_state = 0;
514 + /* list_del equals remove_from_lockqueue() */
515 + list_del(&lkb->lkb_lockqueue);
516 + process_remastered_lkb(ls, lkb, state);
522 + up(&_lockqueue_lock);
524 + log_all(ls, "resent %d requests", count);
529 + * Process any LKBs on the Lock queue, this
530 + * just looks at the entries to see if they have been
531 + * on the queue too long and fails the requests if so.
534 +static void process_lockqueue(void)
536 + struct dlm_lkb *lkb, *safe;
540 + down(&_lockqueue_lock);
542 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
543 + ls = lkb->lkb_resource->res_ls;
545 + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
548 + /* Don't time out locks that are in transition */
549 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
552 + if (check_timeout(lkb->lkb_lockqueue_time,
553 + dlm_config.lock_timeout)) {
555 + list_del(&lkb->lkb_lockqueue);
556 + up(&_lockqueue_lock);
557 + cancel_lockop(lkb, -ETIMEDOUT);
558 + down(&_lockqueue_lock);
561 + up(&_lockqueue_lock);
566 + mod_timer(&_lockqueue_timer,
567 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
570 +/* Look for deadlocks */
571 +static void process_deadlockqueue(void)
573 + struct dlm_lkb *lkb, *safe;
575 + down(&_deadlockqueue_lock);
577 + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
578 + struct dlm_lkb *kill_lkb;
580 + /* Only look at "due" locks */
581 + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
584 + /* Don't look at locks that are in transition */
585 + if (!test_bit(LSFL_LS_RUN,
586 + &lkb->lkb_resource->res_ls->ls_flags))
589 + up(&_deadlockqueue_lock);
591 + /* Lock has hit due time, check for conversion deadlock */
592 + kill_lkb = conversion_deadlock_check(lkb);
594 + cancel_conversion(kill_lkb, -EDEADLOCK);
596 + down(&_deadlockqueue_lock);
598 + up(&_deadlockqueue_lock);
601 +static __inline__ int no_asts(void)
605 + down(&ast_queue_lock);
606 + ret = list_empty(&ast_queue);
607 + up(&ast_queue_lock);
611 +static void lockqueue_timer_fn(unsigned long arg)
613 + set_bit(WAKE_TIMER, &astd_wakeflags);
614 + wake_up(&astd_waitchan);
618 + * DLM daemon which delivers asts.
621 +static int dlm_astd(void *data)
624 + * Set a timer to check the lockqueue for dead locks (and deadlocks).
626 + INIT_LIST_HEAD(&_lockqueue);
627 + init_MUTEX(&_lockqueue_lock);
628 + INIT_LIST_HEAD(&_deadlockqueue);
629 + init_MUTEX(&_deadlockqueue_lock);
630 + init_timer(&_lockqueue_timer);
631 + _lockqueue_timer.function = lockqueue_timer_fn;
632 + _lockqueue_timer.data = 0;
633 + mod_timer(&_lockqueue_timer,
634 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
636 + while (!kthread_should_stop()) {
637 + wchan_cond_sleep_intr(astd_waitchan, !test_bit(WAKE_ASTS, &astd_wakeflags));
639 + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
642 + if (test_and_clear_bit(WAKE_TIMER, &astd_wakeflags)) {
643 + process_lockqueue();
644 + if (dlm_config.deadlocktime)
645 + process_deadlockqueue();
649 + if (timer_pending(&_lockqueue_timer))
650 + del_timer(&_lockqueue_timer);
655 +void wake_astd(void)
658 + set_bit(WAKE_ASTS, &astd_wakeflags);
659 + wake_up(&astd_waitchan);
663 +int astd_start(void)
665 + struct task_struct *p;
668 + INIT_LIST_HEAD(&ast_queue);
669 + init_MUTEX(&ast_queue_lock);
670 + init_waitqueue_head(&astd_waitchan);
672 + p = kthread_run(dlm_astd, NULL, 0, "dlm_astd");
674 + error = PTR_ERR(p);
680 +void astd_stop(void)
682 + kthread_stop(astd_task);
683 + wake_up(&astd_waitchan);
685 diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
686 --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
687 +++ linux-patched/cluster/dlm/ast.h 2004-11-03 11:31:56.000000000 +0800
689 +/******************************************************************************
690 +*******************************************************************************
692 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
693 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
695 +** This copyrighted material is made available to anyone wishing to use,
696 +** modify, copy, or redistribute it subject to the terms and conditions
697 +** of the GNU General Public License v.2.
699 +*******************************************************************************
700 +******************************************************************************/
702 +#ifndef __AST_DOT_H__
703 +#define __AST_DOT_H__
705 +void lockqueue_lkb_mark(struct dlm_ls *ls);
706 +int resend_cluster_requests(struct dlm_ls *ls);
707 +void add_to_lockqueue(struct dlm_lkb *lkb);
708 +void remove_from_lockqueue(struct dlm_lkb *lkb);
709 +void add_to_deadlockqueue(struct dlm_lkb *lkb);
710 +void remove_from_deadlockqueue(struct dlm_lkb *lkb);
711 +void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode);
712 +void wake_astd(void);
713 +int astd_start(void);
714 +void astd_stop(void);
716 +#endif /* __AST_DOT_H__ */
717 diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
718 --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
719 +++ linux-patched/cluster/dlm/config.c 2004-11-03 11:31:56.000000000 +0800
721 +/******************************************************************************
722 +*******************************************************************************
724 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
725 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
727 +** This copyrighted material is made available to anyone wishing to use,
728 +** modify, copy, or redistribute it subject to the terms and conditions
729 +** of the GNU General Public License v.2.
731 +*******************************************************************************
732 +******************************************************************************/
734 +#include <linux/module.h>
735 +#include <linux/proc_fs.h>
737 +#include "dlm_internal.h"
738 +#include "lowcomms.h"
741 +/* Config file defaults */
742 +#define DEFAULT_TCP_PORT 21064
743 +#define DEFAULT_LOCK_TIMEOUT 30
744 +#define DEFAULT_BUFFER_SIZE 4096
745 +#define DEFAULT_RSBTBL_SIZE 256
746 +#define DEFAULT_LKBTBL_SIZE 1024
747 +#define DEFAULT_DIRTBL_SIZE 512
748 +#define DEFAULT_CONN_INCREMENT 32
749 +#define DEFAULT_DEADLOCKTIME 10
750 +#define DEFAULT_RECOVER_TIMER 5
752 +struct config_info dlm_config = {
753 + .tcp_port = DEFAULT_TCP_PORT,
754 + .lock_timeout = DEFAULT_LOCK_TIMEOUT,
755 + .buffer_size = DEFAULT_BUFFER_SIZE,
756 + .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
757 + .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
758 + .dirtbl_size = DEFAULT_DIRTBL_SIZE,
759 + .conn_increment = DEFAULT_CONN_INCREMENT,
760 + .deadlocktime = DEFAULT_DEADLOCKTIME,
761 + .recover_timer = DEFAULT_RECOVER_TIMER
765 +static struct config_proc_info {
770 + .name = "tcp_port",
771 + .value = &dlm_config.tcp_port,
774 + .name = "lock_timeout",
775 + .value = &dlm_config.lock_timeout,
778 + .name = "buffer_size",
779 + .value = &dlm_config.buffer_size,
782 + .name = "rsbtbl_size",
783 + .value = &dlm_config.rsbtbl_size,
786 + .name = "lkbtbl_size",
787 + .value = &dlm_config.lkbtbl_size,
790 + .name = "dirtbl_size",
791 + .value = &dlm_config.dirtbl_size,
794 + .name = "conn_increment",
795 + .value = &dlm_config.conn_increment,
798 + .name = "deadlocktime",
799 + .value = &dlm_config.deadlocktime,
802 + .name = "recover_timer",
803 + .value = &dlm_config.recover_timer,
806 +static struct proc_dir_entry *dlm_dir;
808 +static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
809 + int *eof, void *data)
811 + struct config_proc_info *cinfo = data;
812 + return snprintf(page, count, "%d\n", *cinfo->value);
815 +static int dlm_config_write_proc(struct file *file, const char *buffer,
816 + unsigned long count, void *data)
818 + struct config_proc_info *cinfo = data;
822 + value = simple_strtoul(buffer, &end, 10);
824 + *cinfo->value = value;
828 +int dlm_config_init(void)
831 + struct proc_dir_entry *pde;
833 + dlm_dir = proc_mkdir("cluster/config/dlm", 0);
837 + dlm_dir->owner = THIS_MODULE;
839 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
840 + pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
842 + pde->data = &config_proc[i];
843 + pde->write_proc = dlm_config_write_proc;
844 + pde->read_proc = dlm_config_read_proc;
850 +void dlm_config_exit(void)
854 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
855 + remove_proc_entry(config_proc[i].name, dlm_dir);
856 + remove_proc_entry("cluster/config/dlm", NULL);
858 diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
859 --- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
860 +++ linux-patched/cluster/dlm/config.h 2004-11-03 11:31:56.000000000 +0800
862 +/******************************************************************************
863 +*******************************************************************************
865 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
866 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
868 +** This copyrighted material is made available to anyone wishing to use,
869 +** modify, copy, or redistribute it subject to the terms and conditions
870 +** of the GNU General Public License v.2.
872 +*******************************************************************************
873 +******************************************************************************/
875 +#ifndef __CONFIG_DOT_H__
876 +#define __CONFIG_DOT_H__
878 +struct config_info {
885 + int conn_increment;
890 +extern struct config_info dlm_config;
891 +extern int dlm_config_init(void);
892 +extern void dlm_config_exit(void);
894 +#endif /* __CONFIG_DOT_H__ */
895 diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
896 --- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
897 +++ linux-patched/cluster/dlm/device.c 2004-11-03 11:31:56.000000000 +0800
899 +/******************************************************************************
900 +*******************************************************************************
902 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
903 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
905 +** This copyrighted material is made available to anyone wishing to use,
906 +** modify, copy, or redistribute it subject to the terms and conditions
907 +** of the GNU General Public License v.2.
909 +*******************************************************************************
910 +******************************************************************************/
915 + * This is the userland interface to the DLM.
917 + * The locking is done via a misc char device (find the
918 + * registered minor number in /proc/misc).
920 + * User code should not use this interface directly but
921 + * call the library routines in libdlm.a instead.
925 +#include <linux/miscdevice.h>
926 +#include <linux/init.h>
927 +#include <linux/wait.h>
928 +#include <linux/module.h>
929 +#include <linux/file.h>
930 +#include <linux/fs.h>
931 +#include <linux/poll.h>
932 +#include <linux/signal.h>
933 +#include <linux/spinlock.h>
934 +#include <asm/ioctls.h>
936 +#include "dlm_internal.h"
939 +extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int);
940 +static struct file_operations _dlm_fops;
941 +static const char *name_prefix="dlm";
942 +static struct list_head user_ls_list;
943 +static struct semaphore user_ls_lock;
945 +/* Flags in li_flags */
946 +#define LI_FLAG_COMPLETE 1
947 +#define LI_FLAG_FIRSTLOCK 2
949 +#define LOCKINFO_MAGIC 0x53595324
954 + struct dlm_lksb li_lksb;
955 + wait_queue_head_t li_waitq;
956 + unsigned long li_flags;
957 + void __user *li_castparam;
958 + void __user *li_castaddr;
959 + void __user *li_bastparam;
960 + void __user *li_bastaddr;
961 + void __user *li_pend_bastparam;
962 + void __user *li_pend_bastaddr;
963 + void __user *li_user_lvbptr;
964 + struct list_head li_ownerqueue;
965 + struct file_info *li_file;
966 + struct dlm_lksb __user *li_user_lksb;
967 + struct semaphore li_firstlock;
968 + struct dlm_queryinfo *li_queryinfo;
969 + struct dlm_queryinfo __user *li_user_queryinfo;
972 +/* A queued AST no less */
974 + struct dlm_lock_result result;
975 + struct dlm_queryinfo *queryinfo;
976 + struct dlm_queryinfo __user *user_queryinfo;
977 + struct list_head list;
978 + void __user *user_lvbptr;
979 + uint32_t ast_reason; /* AST_COMP or AST_BAST from dlm_internal.h */
982 +/* One of these per userland lockspace */
984 + void *ls_lockspace;
985 + atomic_t ls_refcnt;
986 + long ls_flags; /* bit 1 means LS has been deleted */
988 + /* Passed into misc_register() */
989 + struct miscdevice ls_miscinfo;
990 + struct list_head ls_list;
993 +/* misc_device info for the control device */
994 +static struct miscdevice ctl_device;
997 + * Stuff we hang off the file struct.
998 + * The first two are to cope with unlocking all the
999 + * locks held by a process when it dies.
1002 + struct list_head fi_lkb_list; /* List of active lkbs */
1003 + spinlock_t fi_lkb_lock;
1004 + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
1005 + spinlock_t fi_ast_lock;
1006 + wait_queue_head_t fi_wait;
1007 + struct user_ls *fi_ls;
1008 + atomic_t fi_refcnt; /* Number of users */
1009 + unsigned long fi_flags; /* Bit 1 means the device is open */
1013 +/* get and put ops for file_info.
1014 + Actually I don't really like "get" and "put", but everyone
1015 + else seems to use them and I can't think of anything
1016 + nicer at the moment */
1017 +static void get_file_info(struct file_info *f)
1019 + atomic_inc(&f->fi_refcnt);
1022 +static void put_file_info(struct file_info *f)
1024 + if (atomic_dec_and_test(&f->fi_refcnt))
1028 +static void release_lockinfo(struct lock_info *li)
1030 + put_file_info(li->li_file);
1031 + if (li->li_lksb.sb_lvbptr && li->li_cmd != DLM_USER_QUERY)
1032 + kfree(li->li_lksb.sb_lvbptr);
1036 +static struct user_ls *__find_lockspace(int minor)
1038 + struct user_ls *lsinfo;
1040 + list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
1042 + if (lsinfo->ls_miscinfo.minor == minor)
1048 +/* Find a lockspace struct given the device minor number */
1049 +static struct user_ls *find_lockspace(int minor)
1051 + struct user_ls *lsinfo;
1053 + down(&user_ls_lock);
1054 + lsinfo = __find_lockspace(minor);
1055 + up(&user_ls_lock);
1060 +static void add_lockspace_to_list(struct user_ls *lsinfo)
1062 + down(&user_ls_lock);
1063 + list_add(&lsinfo->ls_list, &user_ls_list);
1064 + up(&user_ls_lock);
1067 +/* Register a lockspace with the DLM and create a misc
1068 + device for userland to access it */
1069 +static int register_lockspace(char *name, struct user_ls **ls)
1071 + struct user_ls *newls;
1075 + namelen = strlen(name)+strlen(name_prefix)+2;
1077 + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
1080 + memset(newls, 0, sizeof(struct user_ls));
1082 + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
1083 + if (!newls->ls_miscinfo.name) {
1087 + status = dlm_new_lockspace(name, strlen(name),
1088 + &newls->ls_lockspace, 0);
1090 + if (status != 0) {
1091 + kfree(newls->ls_miscinfo.name);
1096 + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
1098 + newls->ls_miscinfo.fops = &_dlm_fops;
1099 + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1101 + status = misc_register(&newls->ls_miscinfo);
1103 + log_print("failed to register misc device for %s", name);
1104 + dlm_release_lockspace(newls->ls_lockspace, 0);
1105 + kfree(newls->ls_miscinfo.name);
1111 + add_lockspace_to_list(newls);
1116 +/* Called with the user_ls_lock semaphore held */
1117 +static int unregister_lockspace(struct user_ls *lsinfo, int force)
1121 + status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1125 + status = misc_deregister(&lsinfo->ls_miscinfo);
1129 + list_del(&lsinfo->ls_list);
1130 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1131 + lsinfo->ls_lockspace = NULL;
1132 + if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
1133 + kfree(lsinfo->ls_miscinfo.name);
1140 +/* Add it to userland's AST queue */
1141 +static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam, uint32_t reason)
1143 + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1147 + ast->result.astparam = astparam;
1148 + ast->result.astaddr = astaddr;
1149 + ast->result.user_lksb = li->li_user_lksb;
1150 + ast->result.cmd = li->li_cmd;
1151 + ast->user_lvbptr = li->li_user_lvbptr;
1152 + ast->ast_reason = reason;
1153 + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1155 + /* These two will both be NULL for anything other than queries */
1156 + ast->queryinfo = li->li_queryinfo;
1157 + ast->user_queryinfo = li->li_user_queryinfo;
1159 + spin_lock(&li->li_file->fi_ast_lock);
1160 + list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1161 + spin_unlock(&li->li_file->fi_ast_lock);
1162 + wake_up_interruptible(&li->li_file->fi_wait);
1165 +static void bast_routine(void *param, int mode)
1167 + struct lock_info *li = param;
1169 + if (li && li->li_bastaddr) {
1170 + add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, AST_BAST);
1175 + * This is the kernel's AST routine.
1176 + * All lock, unlock & query operations complete here.
1177 + * The only synchronous ops are those done during device close.
1179 +static void ast_routine(void *param)
1181 + struct lock_info *li = param;
1183 + /* Param may be NULL if a persistent lock is unlocked by someone else */
1187 + /* If this is a successful conversion then activate the blocking ast
1188 + * args from the conversion request */
1189 + if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1190 + li->li_lksb.sb_status == 0) {
1192 + li->li_bastparam = li->li_pend_bastparam;
1193 + li->li_bastaddr = li->li_pend_bastaddr;
1194 + li->li_pend_bastaddr = NULL;
1197 + /* If it's an async request then post data to the user's AST queue. */
1198 + if (li->li_castaddr) {
1200 + /* Only queue AST if the device is still open */
1201 + if (test_bit(1, &li->li_file->fi_flags))
1202 + add_to_astqueue(li, li->li_castaddr, li->li_castparam, AST_COMP);
1204 + /* If it's a new lock operation that failed, then
1205 + * remove it from the owner queue and free the
1206 + * lock_info. The DLM will not free the LKB until this
1207 + * AST has completed.
1209 + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1210 + li->li_lksb.sb_status != 0) {
1211 + struct dlm_lkb *lkb;
1213 + /* Wait till dlm_lock() has finished */
1214 + down(&li->li_firstlock);
1215 + up(&li->li_firstlock);
1217 + /* If the LKB has been freed then we need to tidy up too */
1218 + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1220 + spin_lock(&li->li_file->fi_lkb_lock);
1221 + list_del(&li->li_ownerqueue);
1222 + spin_unlock(&li->li_file->fi_lkb_lock);
1224 + release_lockinfo(li);
1228 + /* Free unlocks & queries */
1229 + if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1230 + li->li_cmd == DLM_USER_QUERY) {
1231 + release_lockinfo(li);
1235 + /* Synchronous request, just wake up the caller */
1236 + set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1237 + wake_up_interruptible(&li->li_waitq);
1242 + * Wait for the lock op to complete and return the status.
1244 +static int wait_for_ast(struct lock_info *li)
1246 + /* Wait for the AST routine to complete */
1247 + set_task_state(current, TASK_INTERRUPTIBLE);
1248 + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1251 + set_task_state(current, TASK_RUNNING);
1253 + return li->li_lksb.sb_status;
1257 +/* Open on control device */
1258 +static int dlm_ctl_open(struct inode *inode, struct file *file)
1263 +/* Close on control device */
1264 +static int dlm_ctl_close(struct inode *inode, struct file *file)
1269 +/* Open on lockspace device */
1270 +static int dlm_open(struct inode *inode, struct file *file)
1272 + struct file_info *f;
1273 + struct user_ls *lsinfo;
1275 + lsinfo = find_lockspace(iminor(inode));
1279 + f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1283 + atomic_inc(&lsinfo->ls_refcnt);
1284 + INIT_LIST_HEAD(&f->fi_lkb_list);
1285 + INIT_LIST_HEAD(&f->fi_ast_list);
1286 + spin_lock_init(&f->fi_ast_lock);
1287 + spin_lock_init(&f->fi_lkb_lock);
1288 + init_waitqueue_head(&f->fi_wait);
1289 + f->fi_ls = lsinfo;
1290 + atomic_set(&f->fi_refcnt, 1);
1291 + set_bit(1, &f->fi_flags);
1293 + file->private_data = f;
1298 +/* Check the user's version matches ours */
1299 +static int check_version(struct dlm_lock_params *params)
1301 + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1302 + (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1303 + params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1305 + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1306 + params->version[0],
1307 + params->version[1],
1308 + params->version[2],
1309 + DLM_DEVICE_VERSION_MAJOR,
1310 + DLM_DEVICE_VERSION_MINOR,
1311 + DLM_DEVICE_VERSION_PATCH);
1317 +/* Close on lockspace device */
1318 +static int dlm_close(struct inode *inode, struct file *file)
1320 + struct file_info *f = file->private_data;
1321 + struct lock_info li;
1322 + struct lock_info *old_li, *safe;
1325 + struct user_ls *lsinfo;
1326 + DECLARE_WAITQUEUE(wq, current);
1328 + lsinfo = find_lockspace(iminor(inode));
1332 + /* Mark this closed so that ASTs will not be delivered any more */
1333 + clear_bit(1, &f->fi_flags);
1335 + /* Block signals while we are doing this */
1336 + sigfillset(&allsigs);
1337 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1339 + /* We use our own lock_info struct here, so that any
1340 + * outstanding "real" ASTs will be delivered with the
1341 + * corresponding "real" params, thus freeing the lock_info
1342 + * that belongs to the lock. This catches the corner case where
1343 + * a lock is BUSY when we try to unlock it here
1345 + memset(&li, 0, sizeof(li));
1346 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1347 + init_waitqueue_head(&li.li_waitq);
1348 + add_wait_queue(&li.li_waitq, &wq);
1351 + * Free any outstanding locks, they are on the
1352 + * list in LIFO order so there should be no problems
1353 + * about unlocking parents before children.
1354 + * Although we don't remove the lkbs from the list here
1355 + * (what would be the point?), foreach_safe is needed
1356 + * because the lkbs are freed during dlm_unlock operations
1358 + list_for_each_entry_safe(old_li, safe, &f->fi_lkb_list, li_ownerqueue) {
1362 + struct dlm_lkb *lkb;
1364 + lkb = dlm_get_lkb(f->fi_ls->ls_lockspace, old_li->li_lksb.sb_lkid);
1366 + /* Don't unlock persistent locks */
1367 + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1368 + list_del(&old_li->li_ownerqueue);
1370 + /* Update master copy */
1371 + if (lkb->lkb_resource->res_nodeid) {
1372 + li.li_lksb.sb_lkid = lkb->lkb_id;
1373 + status = dlm_lock(f->fi_ls->ls_lockspace,
1374 + lkb->lkb_grmode, &li.li_lksb,
1375 + DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
1376 + NULL, 0, 0, ast_routine, &li,
1379 + wait_for_ast(&li);
1381 + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN;
1383 + /* But tidy our references in it */
1385 + lkb->lkb_astparam = (long)NULL;
1391 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1393 + /* If it's not granted then cancel the request.
1394 + * If the lock was WAITING then it will be dropped,
1395 + * if it was converting then it will be reverted to GRANTED,
1396 + * then we will unlock it.
1398 + lock_status = lkb->lkb_status;
1400 + if (lock_status != GDLM_LKSTS_GRANTED)
1401 + flags = DLM_LKF_CANCEL;
1403 + if (lkb->lkb_grmode >= DLM_LOCK_PW)
1404 + flags |= DLM_LKF_IVVALBLK;
1406 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1408 + /* Must wait for it to complete as the next lock could be its
1411 + wait_for_ast(&li);
1413 + /* If it was waiting for a conversion, it will
1414 + now be granted so we can unlock it properly */
1415 + if (lock_status == GDLM_LKSTS_CONVERT) {
1416 + flags &= ~DLM_LKF_CANCEL;
1417 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1418 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1421 + wait_for_ast(&li);
1423 + /* Unlock succeeded, free the lock_info struct. */
1424 + if (status == 0) {
1430 + remove_wait_queue(&li.li_waitq, &wq);
1432 + /* If this is the last reference, and the lockspace has been deleted
1433 + then free the struct */
1434 + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1435 + kfree(lsinfo->ls_miscinfo.name);
1439 + /* Restore signals */
1440 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1441 + recalc_sigpending();
1447 + * ioctls to create/remove lockspaces, and check how many
1448 + * outstanding ASTs there are against a particular LS.
1450 +static int dlm_ioctl(struct inode *inode, struct file *file,
1451 + uint command, ulong u)
1453 + struct file_info *fi = file->private_data;
1454 + int status = -EINVAL;
1456 + struct list_head *tmp_list;
1458 + switch (command) {
1460 + /* Are there any ASTs for us to read?
1461 + * Warning, this returns the number of messages (ASTs)
1462 + * in the queue, NOT the number of bytes to read
1466 + spin_lock(&fi->fi_ast_lock);
1467 + list_for_each(tmp_list, &fi->fi_ast_list)
1469 + spin_unlock(&fi->fi_ast_lock);
1470 + status = put_user(count, (int *)u);
1481 + * ioctls to create/remove lockspaces.
1483 +static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1484 + uint command, ulong u)
1486 + int status = -EINVAL;
1487 + char ls_name[MAX_LS_NAME_LEN];
1488 + struct user_ls *lsinfo;
1491 + switch (command) {
1492 + case DLM_CREATE_LOCKSPACE:
1493 + if (!capable(CAP_SYS_ADMIN))
1496 + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1498 + status = register_lockspace(ls_name, &lsinfo);
1500 + /* If it succeeded then return the minor number */
1502 + status = lsinfo->ls_miscinfo.minor;
1505 + case DLM_FORCE_RELEASE_LOCKSPACE:
1508 + case DLM_RELEASE_LOCKSPACE:
1509 + if (!capable(CAP_SYS_ADMIN))
1512 + down(&user_ls_lock);
1513 + lsinfo = __find_lockspace(u);
1515 + up(&user_ls_lock);
1519 + status = unregister_lockspace(lsinfo, force);
1520 + up(&user_ls_lock);
1530 +/* Deal with the messy stuff of copying a web of structs
1531 + from kernel space to userspace */
1532 +static int copy_query_result(struct ast_info *ast)
1534 + int status = -EFAULT;
1535 + struct dlm_queryinfo qi;
1537 + /* Get the pointers to userspace structs */
1538 + if (copy_from_user(&qi, ast->user_queryinfo,
1539 + sizeof(struct dlm_queryinfo)))
1542 + if (put_user(ast->queryinfo->gqi_lockcount,
1543 + &ast->user_queryinfo->gqi_lockcount))
1546 + if (qi.gqi_resinfo) {
1547 + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1548 + sizeof(struct dlm_resinfo)))
1552 + if (qi.gqi_lockinfo) {
1553 + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1554 + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1560 + if (ast->queryinfo->gqi_lockinfo)
1561 + kfree(ast->queryinfo->gqi_lockinfo);
1563 + if (ast->queryinfo->gqi_resinfo)
1564 + kfree(ast->queryinfo->gqi_resinfo);
1566 + kfree(ast->queryinfo);
1572 +/* Read call, might block if no ASTs are waiting.
1573 + * It will only ever return one message at a time, regardless
1574 + * of how many are pending.
1576 +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1578 + struct file_info *fi = file->private_data;
1579 + struct ast_info *ast;
1581 + DECLARE_WAITQUEUE(wait, current);
1583 + if (count < sizeof(struct dlm_lock_result))
1586 + spin_lock(&fi->fi_ast_lock);
1587 + if (list_empty(&fi->fi_ast_list)) {
1589 + /* No waiting ASTs.
1590 + * Return EOF if the lockspace has been deleted.
1592 + if (test_bit(1, &fi->fi_ls->ls_flags))
1595 + if (file->f_flags & O_NONBLOCK) {
1596 + spin_unlock(&fi->fi_ast_lock);
1600 + add_wait_queue(&fi->fi_wait, &wait);
1603 + set_current_state(TASK_INTERRUPTIBLE);
1604 + if (list_empty(&fi->fi_ast_list) &&
1605 + !signal_pending(current)) {
1607 + spin_unlock(&fi->fi_ast_lock);
1609 + spin_lock(&fi->fi_ast_lock);
1613 + current->state = TASK_RUNNING;
1614 + remove_wait_queue(&fi->fi_wait, &wait);
1616 + if (signal_pending(current)) {
1617 + spin_unlock(&fi->fi_ast_lock);
1618 + return -ERESTARTSYS;
1622 + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1623 + list_del(&ast->list);
1624 + spin_unlock(&fi->fi_ast_lock);
1626 + ret = sizeof(struct dlm_lock_result);
1627 + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1630 + if (ast->ast_reason == AST_COMP &&
1631 + ast->result.cmd == DLM_USER_LOCK && ast->user_lvbptr) {
1632 + if (copy_to_user(ast->user_lvbptr, ast->result.lksb.sb_lvbptr, DLM_LVB_LEN))
1636 + /* If it was a query then copy the result block back here */
1637 + if (ast->queryinfo) {
1638 + int status = copy_query_result(ast);
1647 +static unsigned int dlm_poll(struct file *file, poll_table *wait)
1649 + struct file_info *fi = file->private_data;
1651 + poll_wait(file, &fi->fi_wait, wait);
1653 + spin_lock(&fi->fi_ast_lock);
1654 + if (!list_empty(&fi->fi_ast_list)) {
1655 + spin_unlock(&fi->fi_ast_lock);
1656 + return POLLIN | POLLRDNORM;
1659 + spin_unlock(&fi->fi_ast_lock);
1663 +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1665 + struct lock_info *li;
1668 + if (!kparams->castaddr)
1671 + if (!kparams->lksb)
1674 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1678 + get_file_info(fi);
1679 + li->li_user_lksb = kparams->lksb;
1680 + li->li_bastparam = kparams->bastparam;
1681 + li->li_bastaddr = kparams->bastaddr;
1682 + li->li_castparam = kparams->castparam;
1683 + li->li_castaddr = kparams->castaddr;
1686 + li->li_cmd = kparams->cmd;
1687 + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1689 + if (copy_from_user(&li->li_lksb, kparams->lksb,
1690 + sizeof(struct dlm_lksb))) {
1694 + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1696 + /* Allocate query structs */
1698 + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1699 + if (!li->li_queryinfo)
1702 + /* Mainly to get gqi_lock buffer size */
1703 + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1704 + sizeof(struct dlm_queryinfo))) {
1709 + /* Overwrite userspace pointers we just copied with kernel space ones */
1710 + if (li->li_queryinfo->gqi_resinfo) {
1711 + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1712 + if (!li->li_queryinfo->gqi_resinfo)
1715 + if (li->li_queryinfo->gqi_lockinfo) {
1716 + li->li_queryinfo->gqi_lockinfo =
1717 + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1719 + if (!li->li_queryinfo->gqi_lockinfo)
1723 + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1725 + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1726 + kparams->flags, /* query */
1731 + kfree(li->li_queryinfo);
1738 +static struct lock_info *allocate_lockinfo(struct file_info *fi, struct dlm_lock_params *kparams)
1740 + struct lock_info *li;
1742 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1744 + li->li_magic = LOCKINFO_MAGIC;
1746 + li->li_cmd = kparams->cmd;
1747 + li->li_queryinfo = NULL;
1749 + li->li_pend_bastparam = NULL;
1750 + li->li_pend_bastaddr = NULL;
1751 + li->li_lksb.sb_lvbptr = NULL;
1752 + li->li_bastaddr = kparams->bastaddr;
1753 + li->li_bastparam = kparams->bastparam;
1755 + get_file_info(fi);
1760 +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1761 + const char *buffer)
1763 + struct lock_info *li;
1765 + char name[DLM_RESNAME_MAXLEN];
1769 + * Validate things that we need to have correct.
1771 + if (!kparams->castaddr)
1774 + if (!kparams->lksb)
1777 + if (!access_ok(VERIFY_WRITE, kparams->lksb, sizeof(struct dlm_lksb)))
1780 + /* Persistent child locks are not available yet */
1781 + if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
1784 + /* For conversions, the lock will already have a lock_info
1785 + block squirreled away in astparam
1786 + if (kparams->flags & DLM_LKF_CONVERT) {
1787 + struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1792 + li = (struct lock_info *)lkb->lkb_astparam;
1794 + /* li may be NULL if the lock was PERSISTENT and the process went
1795 + away, so we need to allocate a new one */
1797 + li = allocate_lockinfo(fi, kparams);
1799 + spin_lock(&fi->fi_lkb_lock);
1800 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1801 + spin_unlock(&fi->fi_lkb_lock);
1808 + if (li->li_magic != LOCKINFO_MAGIC)
1811 + /* For conversions don't overwrite the current blocking AST
1813 + a) if a blocking AST fires before the conversion is queued
1814 + it runs the current handler
1815 + b) if the conversion is cancelled, the original blocking AST
1816 + declaration is active
1817 + The pend_ info is made active when the conversion
1820 + li->li_pend_bastaddr = kparams->bastaddr;
1821 + li->li_pend_bastparam = kparams->bastparam;
1824 + li = allocate_lockinfo(fi, kparams);
1828 + /* Get the lock name */
1829 + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1830 + kparams->namelen)) {
1834 + /* semaphore to allow us to complete our work before
1835 + the AST routine runs. In fact we only need (and use) this
1836 + when the initial lock fails */
1837 + init_MUTEX_LOCKED(&li->li_firstlock);
1838 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1841 + li->li_user_lksb = kparams->lksb;
1842 + li->li_castaddr = kparams->castaddr;
1843 + li->li_castparam = kparams->castparam;
1845 + /* Copy the user's LKSB into kernel space,
1846 + needed for conversions & value block operations.
1847 + Save our kernel-space lvbptr first */
1848 + lvbptr = li->li_lksb.sb_lvbptr;
1849 + if (copy_from_user(&li->li_lksb, kparams->lksb, sizeof(struct dlm_lksb))) {
1853 + /* Store new userland LVBptr and restore kernel one */
1854 + li->li_user_lvbptr = li->li_lksb.sb_lvbptr;
1855 + li->li_lksb.sb_lvbptr = lvbptr;
1857 + /* Copy in the value block */
1858 + if (kparams->flags & DLM_LKF_VALBLK) {
1859 + if (!li->li_lksb.sb_lvbptr) {
1860 + li->li_lksb.sb_lvbptr = kmalloc(DLM_LVB_LEN, GFP_KERNEL);
1861 + if (!li->li_lksb.sb_lvbptr) {
1867 + if (copy_from_user(li->li_lksb.sb_lvbptr, kparams->lksb->sb_lvbptr,
1874 + li->li_user_lvbptr = NULL;
1878 + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1879 + kparams->flags, name, kparams->namelen,
1883 + (li->li_pend_bastaddr || li->li_bastaddr) ?
1884 + bast_routine : NULL,
1885 + kparams->range.ra_end ? &kparams->range : NULL);
1887 + /* If it succeeded (this far) with a new lock then keep track of
1888 + it on the file's lkb list */
1889 + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1891 + spin_lock(&fi->fi_lkb_lock);
1892 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1893 + spin_unlock(&fi->fi_lkb_lock);
1895 + up(&li->li_firstlock);
1897 + /* Copy the lkid back to userspace in case they want to cancel.
1898 + This address has already been tested so /should/ be OK, if not:
1899 + tough - we've taken the lock! */
1900 + copy_to_user(&kparams->lksb->sb_lkid,
1901 + &li->li_lksb.sb_lkid,
1902 + sizeof(li->li_lksb.sb_lkid));
1908 + if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
1910 + release_lockinfo(li);
1916 +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1918 + struct lock_info *li;
1919 + struct dlm_lkb *lkb;
1921 + int convert_cancel = 0;
1923 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1928 + /* Cancelling a conversion doesn't remove the lock...*/
1929 + if (kparams->flags & DLM_LKF_CANCEL &&
1930 + lkb->lkb_status == GDLM_LKSTS_CONVERT) {
1931 + convert_cancel = 1;
1934 + li = (struct lock_info *)lkb->lkb_astparam;
1936 + li = allocate_lockinfo(fi, kparams);
1937 + spin_lock(&fi->fi_lkb_lock);
1938 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1939 + spin_unlock(&fi->fi_lkb_lock);
1944 + if (li->li_magic != LOCKINFO_MAGIC)
1947 + li->li_user_lksb = kparams->lksb;
1948 + li->li_castparam = kparams->castparam;
1949 + li->li_cmd = kparams->cmd;
1951 + /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
1952 + the existing li_castaddr as that's the completion routine for
1953 + unlocks. dlm_unlock_wait() specifies a new AST routine to be
1954 + executed when the unlock completes. */
1955 + if (kparams->castaddr)
1956 + li->li_castaddr = kparams->castaddr;
1958 + /* Have to do it here because the lkb may not exist after
1960 + if (!convert_cancel) {
1961 + spin_lock(&fi->fi_lkb_lock);
1962 + list_del(&li->li_ownerqueue);
1963 + spin_unlock(&fi->fi_lkb_lock);
1966 + /* Use existing lksb & astparams */
1967 + status = dlm_unlock(fi->fi_ls->ls_lockspace,
1969 + kparams->flags, &li->li_lksb, li);
1970 + if (status && !convert_cancel) {
1971 + /* It failed, put it back on the list */
1972 + spin_lock(&fi->fi_lkb_lock);
1973 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1974 + spin_unlock(&fi->fi_lkb_lock);
1980 +/* Write call, submit a locking request */
1981 +static ssize_t dlm_write(struct file *file, const char __user *buffer,
1982 + size_t count, loff_t *ppos)
1984 + struct file_info *fi = file->private_data;
1985 + struct dlm_lock_params kparams;
1990 + if (count < sizeof(kparams)-1) /* -1 because lock name is optional */
1993 + /* Has the lockspace been deleted */
1994 + if (test_bit(1, &fi->fi_ls->ls_flags))
1997 + /* Get the command info */
1998 + if (copy_from_user(&kparams, buffer, sizeof(kparams)))
2001 + if (check_version(&kparams))
2004 + /* Block signals while we are doing this */
2005 + sigfillset(&allsigs);
2006 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
2008 + switch (kparams.cmd)
2010 + case DLM_USER_LOCK:
2011 + status = do_user_lock(fi, &kparams, buffer);
2014 + case DLM_USER_UNLOCK:
2015 + status = do_user_unlock(fi, &kparams);
2018 + case DLM_USER_QUERY:
2019 + status = do_user_query(fi, &kparams);
2026 + /* Restore signals */
2027 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
2028 + recalc_sigpending();
2036 +/* Called when the cluster is shutdown uncleanly, all lockspaces
2037 + have been summarily removed */
2038 +void dlm_device_free_devices()
2040 + struct user_ls *tmp;
2041 + struct user_ls *lsinfo;
2043 + down(&user_ls_lock);
2044 + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
2045 + misc_deregister(&lsinfo->ls_miscinfo);
2047 + /* Tidy up, but don't delete the lsinfo struct until
2048 + all the users have closed their devices */
2049 + list_del(&lsinfo->ls_list);
2050 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
2051 + lsinfo->ls_lockspace = NULL;
2053 + up(&user_ls_lock);
2056 +static struct file_operations _dlm_fops = {
2058 + .release = dlm_close,
2059 + .ioctl = dlm_ioctl,
2061 + .write = dlm_write,
2063 + .owner = THIS_MODULE,
2066 +static struct file_operations _dlm_ctl_fops = {
2067 + .open = dlm_ctl_open,
2068 + .release = dlm_ctl_close,
2069 + .ioctl = dlm_ctl_ioctl,
2070 + .owner = THIS_MODULE,
2074 + * Create control device
2076 +int dlm_device_init(void)
2080 + INIT_LIST_HEAD(&user_ls_list);
2081 + init_MUTEX(&user_ls_lock);
2083 + ctl_device.name = "dlm-control";
2084 + ctl_device.fops = &_dlm_ctl_fops;
2085 + ctl_device.minor = MISC_DYNAMIC_MINOR;
2087 + r = misc_register(&ctl_device);
2089 + log_print("misc_register failed for DLM control device");
2096 +void dlm_device_exit(void)
2098 + misc_deregister(&ctl_device);
2102 + * Overrides for Emacs so that we follow Linus's tabbing style.
2103 + * Emacs will notice this stuff at the end of the file and automatically
2104 + * adjust the settings for this buffer only. This must remain at the end
2106 + * ---------------------------------------------------------------------------
2107 + * Local variables:
2108 + * c-file-style: "linux"
2111 diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
2112 --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
2113 +++ linux-patched/cluster/dlm/device.h 2004-11-03 11:31:56.000000000 +0800
2115 +/******************************************************************************
2116 +*******************************************************************************
2118 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2119 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2121 +** This copyrighted material is made available to anyone wishing to use,
2122 +** modify, copy, or redistribute it subject to the terms and conditions
2123 +** of the GNU General Public License v.2.
2125 +*******************************************************************************
2126 +******************************************************************************/
2128 +#ifndef __DEVICE_DOT_H__
2129 +#define __DEVICE_DOT_H__
2131 +extern void dlm_device_free_devices(void);
2133 +#endif /* __DEVICE_DOT_H__ */
2134 diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
2135 --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
2136 +++ linux-patched/cluster/dlm/dir.c 2004-11-03 11:31:56.000000000 +0800
2138 +/******************************************************************************
2139 +*******************************************************************************
2141 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2142 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2144 +** This copyrighted material is made available to anyone wishing to use,
2145 +** modify, copy, or redistribute it subject to the terms and conditions
2146 +** of the GNU General Public License v.2.
2148 +*******************************************************************************
2149 +******************************************************************************/
2151 +#include "dlm_internal.h"
2153 +#include "lockspace.h"
2154 +#include "lowcomms.h"
2155 +#include "reccomms.h"
2157 +#include "config.h"
2158 +#include "memory.h"
2159 +#include "recover.h"
2163 + uint32_t rm_nodeid;
2164 + uint16_t rm_length;
2168 +void print_name(char *b, int len)
2171 + for (i = 0; i < len; i++)
2172 + printk("%c", b[i]);
2176 +static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
2178 + spin_lock(&ls->ls_recover_list_lock);
2179 + list_add(&de->list, &ls->ls_recover_list);
2180 + spin_unlock(&ls->ls_recover_list_lock);
2183 +static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
2185 + int found = FALSE;
2186 + struct dlm_direntry *de;
2188 + spin_lock(&ls->ls_recover_list_lock);
2189 + list_for_each_entry(de, &ls->ls_recover_list, list) {
2190 + if (de->length == len) {
2191 + list_del(&de->list);
2192 + de->master_nodeid = 0;
2193 + memset(de->name, 0, len);
2198 + spin_unlock(&ls->ls_recover_list_lock);
2201 + de = allocate_direntry(ls, len);
2205 +void clear_free_de(struct dlm_ls *ls)
2207 + struct dlm_direntry *de;
2209 + spin_lock(&ls->ls_recover_list_lock);
2210 + while (!list_empty(&ls->ls_recover_list)) {
2211 + de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
2213 + list_del(&de->list);
2214 + free_direntry(de);
2216 + spin_unlock(&ls->ls_recover_list_lock);
2220 + * We use the upper 16 bits of the hash value to select the directory node.
2221 + * Low bits are used for distribution of rsb's among hash buckets on each node.
2223 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
2224 + * num_nodes to the hash value. This value in the desired range is used as an
2225 + * offset into the sorted list of nodeid's to give the particular nodeid of the
2229 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length)
2231 + struct list_head *tmp;
2232 + struct dlm_csb *csb = NULL;
2233 + uint32_t hash, node, n = 0, nodeid;
2235 + if (ls->ls_num_nodes == 1) {
2236 + nodeid = our_nodeid();
2240 + hash = dlm_hash(name, length);
2241 + node = (hash >> 16) % ls->ls_num_nodes;
2243 + if (ls->ls_node_array) {
2244 + nodeid = ls->ls_node_array[node];
2248 + list_for_each(tmp, &ls->ls_nodes) {
2251 + csb = list_entry(tmp, struct dlm_csb, list);
2255 + DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u\n",
2256 + ls->ls_num_nodes, n, node););
2257 + nodeid = csb->node->nodeid;
2262 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb)
2264 + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
2268 +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
2272 + val = dlm_hash(name, len);
2273 + val &= (ls->ls_dirtbl_size - 1);
2278 +static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
2282 + bucket = dir_hash(ls, de->name, de->length);
2283 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2286 +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
2287 + int namelen, uint32_t bucket)
2289 + struct dlm_direntry *de;
2291 + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
2292 + if (de->length == namelen && !memcmp(name, de->name, namelen))
2300 +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen)
2302 + struct dlm_direntry *de;
2305 + bucket = dir_hash(ls, name, namelen);
2307 + write_lock(&ls->ls_dirtbl[bucket].lock);
2309 + de = search_bucket(ls, name, namelen, bucket);
2312 + log_all(ls, "remove fr %u none", nodeid);
2313 + print_name(name, namelen);
2317 + if (de->master_nodeid != nodeid) {
2318 + log_all(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
2319 + print_name(name, namelen);
2323 + list_del(&de->list);
2324 + free_direntry(de);
2326 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2329 +void dlm_dir_clear(struct dlm_ls *ls)
2331 + struct list_head *head;
2332 + struct dlm_direntry *de;
2335 + for (i = 0; i < ls->ls_dirtbl_size; i++) {
2336 + write_lock(&ls->ls_dirtbl[i].lock);
2337 + head = &ls->ls_dirtbl[i].list;
2338 + while (!list_empty(head)) {
2339 + de = list_entry(head->next, struct dlm_direntry, list);
2340 + list_del(&de->list);
2341 + put_free_de(ls, de);
2343 + write_unlock(&ls->ls_dirtbl[i].lock);
2347 +static void resmov_in(struct resmov *rm, char *buf)
2349 + struct resmov tmp;
2351 + memcpy(&tmp, buf, sizeof(struct resmov));
2353 + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2354 + rm->rm_length = be16_to_cpu(tmp.rm_length);
2357 +int dlm_dir_rebuild_local(struct dlm_ls *ls)
2359 + struct dlm_csb *csb;
2360 + struct dlm_direntry *de;
2361 + struct dlm_rcom *rc;
2362 + struct resmov mov, last_mov;
2363 + char *b, *last_name;
2364 + int error = -ENOMEM, count = 0;
2366 + log_all(ls, "rebuild resource directory");
2368 + dlm_dir_clear(ls);
2370 + rc = allocate_rcom_buffer(ls);
2374 + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2378 + list_for_each_entry(csb, &ls->ls_nodes, list) {
2379 + last_mov.rm_length = 0;
2381 + error = dlm_recovery_stopped(ls);
2385 + memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2386 + rc->rc_datalen = last_mov.rm_length;
2388 + error = rcom_send_message(ls, csb->node->nodeid,
2389 + RECCOMM_RECOVERNAMES, rc, 1);
2396 + * pick each res out of buffer
2402 + resmov_in(&mov, b);
2403 + b += sizeof(struct resmov);
2405 + /* Length of 0 with a non-zero nodeid marks the
2406 + * end of the list */
2407 + if (!mov.rm_length && mov.rm_nodeid)
2410 + /* This is just the end of the block */
2411 + if (!mov.rm_length)
2414 + DLM_ASSERT(mov.rm_nodeid == csb->node->nodeid,);
2417 + de = get_free_de(ls, mov.rm_length);
2421 + de->master_nodeid = mov.rm_nodeid;
2422 + de->length = mov.rm_length;
2423 + memcpy(de->name, b, mov.rm_length);
2424 + b += mov.rm_length;
2426 + add_entry_to_hash(ls, de);
2430 + memset(last_name, 0, DLM_RESNAME_MAXLEN);
2431 + memcpy(last_name, de->name, de->length);
2438 + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2441 + log_all(ls, "rebuilt %d resources", count);
2447 + free_rcom_buffer(rc);
2450 + clear_free_de(ls);
2455 + * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as
2456 + * many resource names as can fit in the buffer.
2459 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2460 + char *outbuf, int outlen, uint32_t nodeid)
2462 + struct list_head *list;
2463 + struct dlm_rsb *start_rsb = NULL, *rsb;
2464 + int offset = 0, start_namelen, error;
2466 + struct resmov tmp;
2467 + uint32_t dir_nodeid;
2470 + * Find the rsb where we left off (or start again)
2473 + start_namelen = inlen;
2474 + start_name = inbuf;
2476 + if (start_namelen > 1) {
2477 + error = find_rsb(ls, NULL, start_name, start_namelen, 0,
2479 + DLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2480 + release_rsb(start_rsb);
2484 + * Send rsb names for rsb's we're master of and whose directory node
2485 + * matches the requesting node.
2488 + down_read(&ls->ls_root_lock);
2490 + list = start_rsb->res_rootlist.next;
2492 + list = ls->ls_rootres.next;
2494 + for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2495 + rsb = list_entry(list, struct dlm_rsb, res_rootlist);
2496 + if (rsb->res_nodeid)
2499 + dir_nodeid = get_directory_nodeid(rsb);
2500 + if (dir_nodeid != nodeid)
2503 + if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) {
2504 + /* Write end-of-block record */
2505 + memset(&tmp, 0, sizeof(struct resmov));
2506 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2507 + offset += sizeof(struct resmov);
2511 + memset(&tmp, 0, sizeof(struct resmov));
2512 + tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2513 + tmp.rm_length = cpu_to_be16(rsb->res_length);
2515 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2516 + offset += sizeof(struct resmov);
2518 + memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2519 + offset += rsb->res_length;
2523 + * If we've reached the end of the list (and there's room) write a
2524 + * terminating record.
2527 + if ((list == &ls->ls_rootres) &&
2528 + (offset + sizeof(struct resmov) <= outlen)) {
2530 + memset(&tmp, 0, sizeof(struct resmov));
2531 + /* This only needs to be non-zero */
2532 + tmp.rm_nodeid = cpu_to_be32(1);
2533 + /* and this must be zero */
2534 + tmp.rm_length = 0;
2535 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2536 + offset += sizeof(struct resmov);
2540 + up_read(&ls->ls_root_lock);
2544 +static int get_entry(struct dlm_ls *ls, uint32_t nodeid, char *name,
2545 + int namelen, uint32_t *r_nodeid)
2547 + struct dlm_direntry *de, *tmp;
2550 + bucket = dir_hash(ls, name, namelen);
2552 + write_lock(&ls->ls_dirtbl[bucket].lock);
2553 + de = search_bucket(ls, name, namelen, bucket);
2555 + *r_nodeid = de->master_nodeid;
2556 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2557 + if (*r_nodeid == nodeid)
2562 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2564 + de = allocate_direntry(ls, namelen);
2568 + de->master_nodeid = nodeid;
2569 + de->length = namelen;
2570 + memcpy(de->name, name, namelen);
2572 + write_lock(&ls->ls_dirtbl[bucket].lock);
2573 + tmp = search_bucket(ls, name, namelen, bucket);
2575 + free_direntry(de);
2578 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2580 + *r_nodeid = de->master_nodeid;
2581 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2585 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2586 + uint32_t *r_nodeid)
2588 + return get_entry(ls, nodeid, name, namelen, r_nodeid);
2592 + * The node with lowest id queries all nodes to determine when all are done.
2593 + * All other nodes query the low nodeid for this.
2596 +int dlm_dir_rebuild_wait(struct dlm_ls *ls)
2600 + if (ls->ls_low_nodeid == our_nodeid()) {
2601 + error = dlm_wait_status_all(ls, RESDIR_VALID);
2603 + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2605 + error = dlm_wait_status_low(ls, RESDIR_ALL_VALID);
2609 diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2610 --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
2611 +++ linux-patched/cluster/dlm/dir.h 2004-11-03 11:31:56.000000000 +0800
2613 +/******************************************************************************
2614 +*******************************************************************************
2616 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2617 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2619 +** This copyrighted material is made available to anyone wishing to use,
2620 +** modify, copy, or redistribute it subject to the terms and conditions
2621 +** of the GNU General Public License v.2.
2623 +*******************************************************************************
2624 +******************************************************************************/
2626 +#ifndef __DIR_DOT_H__
2627 +#define __DIR_DOT_H__
2629 +void print_name(char *b, int len);
2630 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length);
2631 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb);
2633 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2634 + uint32_t *r_nodeid);
2635 +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name,
2637 +int dlm_dir_rebuild_local(struct dlm_ls *ls);
2638 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2639 + char *outbuf, int outlen, uint32_t nodeid);
2640 +int dlm_dir_rebuild_wait(struct dlm_ls * ls);
2641 +void dlm_dir_clear(struct dlm_ls *ls);
2642 +void dlm_dir_dump(struct dlm_ls *ls);
2643 +void clear_free_de(struct dlm_ls *ls);
2645 +#endif /* __DIR_DOT_H__ */
2646 diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2647 --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
2648 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-11-03 11:31:56.000000000 +0800
2650 +/******************************************************************************
2651 +*******************************************************************************
2653 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2654 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2656 +** This copyrighted material is made available to anyone wishing to use,
2657 +** modify, copy, or redistribute it subject to the terms and conditions
2658 +** of the GNU General Public License v.2.
2660 +*******************************************************************************
2661 +******************************************************************************/
2663 +#ifndef __DLM_INTERNAL_DOT_H__
2664 +#define __DLM_INTERNAL_DOT_H__
2667 + * This is the main header file to be included in each DLM source file.
2670 +#define DLM_RELEASE_NAME "<CVS>"
2672 +#include <linux/slab.h>
2673 +#include <linux/sched.h>
2674 +#include <asm/semaphore.h>
2675 +#include <linux/types.h>
2676 +#include <linux/spinlock.h>
2677 +#include <linux/vmalloc.h>
2678 +#include <asm/uaccess.h>
2679 +#include <linux/list.h>
2680 +#include <linux/errno.h>
2681 +#include <linux/random.h>
2682 +#include <linux/delay.h>
2683 +#include <linux/interrupt.h>
2684 +#include <linux/kthread.h>
2686 +#include <cluster/dlm.h>
2687 +#include <cluster/dlm_device.h>
2688 +#include <cluster/service.h>
2698 +#if (BITS_PER_LONG == 64)
2699 +#define PRIu64 "lu"
2700 +#define PRId64 "ld"
2701 +#define PRIo64 "lo"
2702 +#define PRIx64 "lx"
2703 +#define PRIX64 "lX"
2704 +#define SCNu64 "lu"
2705 +#define SCNd64 "ld"
2706 +#define SCNo64 "lo"
2707 +#define SCNx64 "lx"
2708 +#define SCNX64 "lX"
2710 +#define PRIu64 "Lu"
2711 +#define PRId64 "Ld"
2712 +#define PRIo64 "Lo"
2713 +#define PRIx64 "Lx"
2714 +#define PRIX64 "LX"
2715 +#define SCNu64 "Lu"
2716 +#define SCNd64 "Ld"
2717 +#define SCNo64 "Lo"
2718 +#define SCNx64 "Lx"
2719 +#define SCNX64 "LX"
2722 +#define wchan_cond_sleep_intr(chan, sleep_cond) \
2725 + DECLARE_WAITQUEUE(__wait_chan, current); \
2726 + current->state = TASK_INTERRUPTIBLE; \
2727 + add_wait_queue(&chan, &__wait_chan); \
2728 + if ((sleep_cond)) \
2730 + remove_wait_queue(&chan, &__wait_chan); \
2731 + current->state = TASK_RUNNING; \
2735 +static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2737 + return time_after(jiffies, stamp + seconds * HZ);
2741 +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2743 +#define log_all(ls, fmt, args...) \
2745 + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2746 + dlm_debug_log(ls, fmt, ##args); \
2749 +#define log_error log_all
2751 +#if defined(DLM_DEBUG2)
2752 +int nibbler_printf(const char *fmt, ...);
2753 +#define log_debug2(fmt, args...) nibbler_printf(fmt"\n", ##args)
2755 +#define log_debug2(fmt, args...)
2759 +#if defined(DLM_DEBUG)
2760 +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2762 +#define log_debug(ls, fmt, args...)
2765 +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2767 +#define log_debug log_all
2771 +#define DLM_ASSERT(x, do) \
2775 + dlm_locks_dump(); \
2776 + dlm_debug_dump(); \
2777 + printk("\nDLM: Assertion failed on line %d of file %s\n" \
2778 + "DLM: assertion: \"%s\"\n" \
2779 + "DLM: time = %lu\n", \
2780 + __LINE__, __FILE__, #x, jiffies); \
2784 + panic("DLM: Record message above and reboot.\n"); \
2794 +struct dlm_lkbtable;
2795 +struct dlm_rsbtable;
2796 +struct dlm_dirtable;
2797 +struct dlm_direntry;
2798 +struct dlm_recover;
2800 +struct dlm_request;
2803 +struct dlm_query_request;
2804 +struct dlm_query_reply;
2807 +struct dlm_direntry {
2808 + struct list_head list;
2809 + uint32_t master_nodeid;
2814 +struct dlm_dirtable {
2815 + struct list_head list;
2819 +struct dlm_rsbtable {
2820 + struct list_head list;
2824 +struct dlm_lkbtable {
2825 + struct list_head list;
2831 + * Cluster node (per node in cluster)
2835 + struct list_head list;
2837 + atomic_t refcount; /* num csb's referencing */
2841 + * Cluster System Block (per node in a ls)
2845 + struct list_head list; /* per-lockspace node list */
2846 + struct dlm_node * node; /* global node structure */
2847 + int gone_event; /* event id when node removed */
2851 + * Used to save and manage recovery state for a lockspace.
2854 +struct dlm_recover {
2855 + struct list_head list;
2856 + uint32_t * nodeids;
2862 + * Elements in the range array
2865 +#define GR_RANGE_START (0)
2866 +#define GR_RANGE_END (1)
2867 +#define RQ_RANGE_START (2)
2868 +#define RQ_RANGE_END (3)
2871 + * Lockspace structure
2874 +#define LSFL_WORK (0)
2875 +#define LSFL_LS_RUN (1)
2876 +#define LSFL_LS_STOP (2)
2877 +#define LSFL_LS_START (3)
2878 +#define LSFL_LS_FINISH (4)
2879 +#define LSFL_RECCOMM_WAIT (5)
2880 +#define LSFL_RECCOMM_READY (6)
2881 +#define LSFL_NOTIMERS (7)
2882 +#define LSFL_FINISH_RECOVERY (8)
2883 +#define LSFL_RESDIR_VALID (9)
2884 +#define LSFL_ALL_RESDIR_VALID (10)
2885 +#define LSFL_NODES_VALID (11)
2886 +#define LSFL_ALL_NODES_VALID (12)
2887 +#define LSFL_REQUEST_WARN (13)
2888 +#define LSFL_RECOVERD_EXIT (14)
2890 +#define LSST_NONE (0)
2891 +#define LSST_INIT (1)
2892 +#define LSST_INIT_DONE (2)
2893 +#define LSST_CLEAR (3)
2894 +#define LSST_WAIT_START (4)
2895 +#define LSST_RECONFIG_DONE (5)
2898 + struct list_head ls_list; /* list of lockspaces */
2899 + uint32_t ls_local_id; /* local unique lockspace ID */
2900 + uint32_t ls_global_id; /* global unique lockspace ID */
2901 + int ls_allocation; /* Memory allocation policy */
2902 + int ls_count; /* reference count */
2903 + unsigned long ls_flags; /* LSFL_ */
2905 + struct dlm_rsbtable * ls_rsbtbl;
2906 + uint32_t ls_rsbtbl_size;
2908 + struct dlm_lkbtable * ls_lkbtbl;
2909 + uint32_t ls_lkbtbl_size;
2911 + struct dlm_dirtable * ls_dirtbl;
2912 + uint32_t ls_dirtbl_size;
2914 + struct list_head ls_nodes; /* current nodes in ls */
2915 + struct list_head ls_nodes_gone; /* dead node list, recovery */
2916 + uint32_t ls_num_nodes; /* number of nodes in ls */
2917 + uint32_t ls_low_nodeid;
2918 + uint32_t * ls_node_array;
2920 + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2921 + parent lock racing with a
2924 + struct list_head ls_deadlockq; /* List of locks in conversion
2925 + ordered by duetime. for
2926 + deadlock detection */
2928 + /* recovery related */
2930 + struct task_struct * ls_recoverd_task;
2931 + struct semaphore ls_recoverd_lock;
2932 + struct list_head ls_recover; /* dlm_recover structs */
2933 + spinlock_t ls_recover_lock;
2935 + int ls_last_start;
2936 + int ls_last_finish;
2937 + int ls_state; /* recovery states */
2939 + struct rw_semaphore ls_in_recovery; /* block local requests */
2940 + struct list_head ls_requestqueue;/* queue remote requests */
2941 + struct semaphore ls_requestqueue_lock;
2943 + struct dlm_rcom * ls_rcom; /* recovery comms */
2944 + uint32_t ls_rcom_msgid;
2945 + struct semaphore ls_rcom_lock;
2947 + struct list_head ls_recover_list;
2948 + spinlock_t ls_recover_list_lock;
2949 + int ls_recover_list_count;
2950 + wait_queue_head_t ls_wait_general;
2952 + struct list_head ls_rootres; /* root resources */
2953 + struct rw_semaphore ls_root_lock; /* protect rootres list */
2955 + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2956 + we're deserialising */
2965 +#define RESFL_NEW_MASTER (0)
2966 +#define RESFL_RECOVER_LIST (1)
2967 +#define RESFL_MASTER (2)
2970 + struct list_head res_hashchain;
2971 + uint32_t res_bucket;
2973 + struct dlm_ls * res_ls; /* The owning lockspace */
2975 + struct list_head res_rootlist; /* List of root rsb's */
2977 + struct list_head res_subreslist; /* List of all sub-resources
2978 + for this root rsb */
2980 + uint8_t res_depth; /* Depth in resource tree */
2981 + unsigned long res_flags; /* Flags, RESFL_ */
2983 + struct list_head res_grantqueue;
2984 + struct list_head res_convertqueue;
2985 + struct list_head res_waitqueue;
2987 + uint32_t res_nodeid; /* nodeid of master node */
2989 + struct dlm_rsb * res_root; /* root rsb if a subresource */
2990 + struct dlm_rsb * res_parent; /* parent rsb (if any) */
2992 + atomic_t res_ref; /* Number of lkb's */
2993 + uint16_t res_remasterid; /* ID used during remaster */
2995 + struct list_head res_recover_list; /* General list for use
2996 + during recovery */
2997 + int res_recover_msgid;
2998 + int res_newlkid_expect;
3000 + struct rw_semaphore res_lock;
3002 + char * res_lvbptr; /* Lock value block */
3004 + uint8_t res_length;
3005 + char res_name[1]; /* <res_length> bytes */
3009 + * Lock block. To avoid confusion, where flags mirror the public flags, they
3010 + * should have the same value.
3012 + * In general, DLM_LKF flags from dlm.h apply only to lkb_lockqueue_flags
3013 + * and GDLM_LKFLG flags from dlm_internal.h apply only to lkb_flags.
3014 + * The rr_flags field in the request struct is a copy of lkb_lockqueue_flags.
3015 + * There is one dangerous exception: GDLM_LKFLG_RANGE is set in rr_flags
3016 + * when sending a remote range lock request. This value is then copied into
3017 + * the remote lkb_lockqueue_flags field. This means GDLM_LKFLG_RANGE must
3018 + * not have the same value as any external DLM_LKF flag.
3021 +#define GDLM_LKSTS_NEW (0)
3022 +#define GDLM_LKSTS_WAITING (1)
3023 +#define GDLM_LKSTS_GRANTED (2)
3024 +#define GDLM_LKSTS_CONVERT (3)
3026 +/* mirror external flags */
3027 +#define GDLM_LKFLG_VALBLK (0x00000008)
3028 +#define GDLM_LKFLG_PERSISTENT (0x00000080)
3029 +#define GDLM_LKFLG_NODLCKWT (0x00000100)
3030 +#define GDLM_LKFLG_EXPEDITE (0x00000400)
3031 +#define GDLM_LKFLG_ORPHAN (0x00004000)
3032 +/* external flags now go up to: (0x00004000) : DLM_LKF_ORPHAN */
3034 +/* internal-only flags */
3035 +#define GDLM_LKFLG_RANGE (0x00010000)
3036 +#define GDLM_LKFLG_MSTCPY (0x00020000)
3037 +#define GDLM_LKFLG_DELETED (0x00040000)
3038 +#define GDLM_LKFLG_LQCONVERT (0x00080000)
3039 +#define GDLM_LKFLG_LQRESEND (0x00100000)
3040 +#define GDLM_LKFLG_DEMOTED (0x00200000)
3041 +#define GDLM_LKFLG_RESENT (0x00400000)
3042 +#define GDLM_LKFLG_NOREBUILD (0x00800000)
3043 +#define GDLM_LKFLG_UNLOCKDONE (0x01000000)
3045 +#define AST_COMP (1)
3046 +#define AST_BAST (2)
3047 +#define AST_DEL (4)
3050 + uint32_t lkb_flags;
3051 + uint16_t lkb_status; /* grant, wait, convert */
3052 + int8_t lkb_rqmode; /* requested lock mode */
3053 + int8_t lkb_grmode; /* granted lock mode */
3054 + uint32_t lkb_retstatus; /* status to return in lksb */
3055 + uint32_t lkb_id; /* our lock ID */
3056 + struct dlm_lksb * lkb_lksb; /* status block of caller */
3057 + struct list_head lkb_idtbl_list; /* lockidtbl */
3058 + struct list_head lkb_statequeue; /* rsb's g/c/w queue */
3059 + struct dlm_rsb * lkb_resource;
3060 + struct dlm_lkb * lkb_parent; /* parent lock if any */
3061 + atomic_t lkb_childcnt; /* number of children */
3063 + struct list_head lkb_lockqueue; /* queue of locks waiting
3064 + for remote reply */
3065 + int lkb_lockqueue_state; /* reason on lockqueue */
3066 + uint32_t lkb_lockqueue_flags; /* as passed into
3068 + int lkb_ownpid; /* pid of lock owner */
3069 + unsigned long lkb_lockqueue_time; /* time lkb went on the
3071 + unsigned long lkb_duetime; /* for deadlock detection */
3073 + uint32_t lkb_remid; /* id on remote partner */
3074 + uint32_t lkb_nodeid; /* id of remote partner */
3075 + void * lkb_astaddr;
3076 + void * lkb_bastaddr;
3077 + long lkb_astparam;
3078 + struct list_head lkb_astqueue; /* locks with asts to deliver */
3079 + uint16_t lkb_astflags; /* COMP, BAST, DEL */
3080 + uint8_t lkb_bastmode; /* requested mode */
3081 + uint8_t lkb_highbast; /* highest mode bast sent for */
3083 + struct dlm_request * lkb_request;
3085 + struct list_head lkb_deadlockq; /* ls_deadlockq list */
3087 + char * lkb_lvbptr; /* points to lksb lvb on local
3088 + lock, allocated lvb on
3090 + uint64_t * lkb_range; /* Points to an array of 64 bit
3091 + numbers that represent the
3092 + requested and granted ranges
3093 + of the lock. NULL implies
3094 + 0-ffffffffffffffff */
3098 + * Header part of the mid-level comms system. All packets start with
3099 + * this header so we can identify them. The comms packet can
3100 + * contain many of these structs but the are split into individual
3101 + * work units before being passed to the lockqueue routines.
3102 + * below this are the structs that this is a header for
3105 +struct dlm_header {
3106 + uint8_t rh_cmd; /* What we are */
3107 + uint8_t rh_flags; /* maybe just a pad */
3108 + uint16_t rh_length; /* Length of struct (so we can
3109 + send many in 1 message) */
3110 + uint32_t rh_lkid; /* Lock ID tag: ie the local
3111 + (requesting) lock ID */
3112 + uint32_t rh_lockspace; /* Lockspace ID */
3113 +} __attribute__((packed));
3116 + * This is the struct used in a remote lock/unlock/convert request
3117 + * The mid-level comms API should turn this into native byte order.
3118 + * Most "normal" lock operations will use these two structs for
3119 + * communications. Recovery operations use their own structs
3120 + * but still with the gd_req_header on the front.
3123 +struct dlm_request {
3124 + struct dlm_header rr_header;
3125 + uint32_t rr_remlkid; /* Remote lock ID */
3126 + uint32_t rr_remparid; /* Parent's remote lock ID */
3127 + uint32_t rr_flags; /* Flags from lock/convert req*/
3128 + uint64_t rr_range_start; /* Yes, these are in the right
3130 + uint64_t rr_range_end;
3131 + uint32_t rr_status; /* Status to return if this is
3133 + uint32_t rr_pid; /* Owner PID of lock */
3134 + uint8_t rr_rqmode; /* Requested lock mode */
3135 + uint8_t rr_asts; /* Whether the LKB has ASTs */
3136 + char rr_lvb[DLM_LVB_LEN];
3137 + char rr_name[1]; /* As long as needs be. Only
3138 + used for directory lookups.
3139 + The length of this can be
3140 + worked out from the packet
3142 +} __attribute__((packed));
3145 + * This is the struct returned by a remote lock/unlock/convert request
3146 + * The mid-level comms API should turn this into native byte order.
3150 + struct dlm_header rl_header;
3151 + uint32_t rl_lockstate; /* Whether request was
3152 + queued/granted/waiting */
3153 + uint32_t rl_nodeid; /* nodeid of lock master */
3154 + uint32_t rl_status; /* Status to return to caller */
3155 + uint32_t rl_lkid; /* Remote lkid */
3156 + char rl_lvb[DLM_LVB_LEN];
3157 +} __attribute__((packed));
3160 + * Recovery comms message
3164 + struct dlm_header rc_header; /* 32 byte aligned */
3165 + uint32_t rc_msgid;
3166 + uint16_t rc_datalen;
3167 + uint8_t rc_expanded;
3168 + uint8_t rc_subcmd; /* secondary command */
3169 + char rc_buf[1]; /* first byte of data goes here
3170 + and extends beyond here for
3171 + another datalen - 1 bytes.
3172 + rh_length is set to sizeof
3173 + dlm_rcom + datalen - 1 */
3174 +} __attribute__((packed));
3177 +/* A remote query: GDLM_REMCMD_QUERY */
3179 +struct dlm_query_request {
3180 + struct dlm_header rq_header;
3181 + uint32_t rq_mstlkid; /* LockID on master node */
3182 + uint32_t rq_query; /* query from the user */
3183 + uint32_t rq_maxlocks; /* max number of locks we can
3185 +} __attribute__((packed));
3187 +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
3188 +/* There may be subsequent blocks of
3189 + lock info in GDLM_REMCMD_QUERYCONT messages which just have
3190 + a normal header. The last of these will have rh_flags set to
3191 + GDLM_REMFLAG_ENDQUERY
3194 +struct dlm_query_reply {
3195 + struct dlm_header rq_header;
3196 + uint32_t rq_numlocks; /* Number of locks in reply */
3197 + uint32_t rq_startlock; /* Which lock this block starts
3198 + at (for multi-block replies) */
3199 + uint32_t rq_status;
3201 + /* Resource information */
3202 + uint32_t rq_grantcount; /* No. of nodes on grantqueue */
3203 + uint32_t rq_convcount; /* No. of nodes on convertq */
3204 + uint32_t rq_waitcount; /* No. of nodes on waitqueue */
3205 + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB
3208 +} __attribute__((packed));
3211 + * Lockqueue wait lock states
3214 +#define GDLM_LQSTATE_WAIT_RSB 1
3215 +#define GDLM_LQSTATE_WAIT_CONVERT 2
3216 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3
3217 +#define GDLM_LQSTATE_WAIT_UNLOCK 4
3219 +/* Commands sent across the comms link */
3220 +#define GDLM_REMCMD_LOOKUP 1
3221 +#define GDLM_REMCMD_LOCKREQUEST 2
3222 +#define GDLM_REMCMD_UNLOCKREQUEST 3
3223 +#define GDLM_REMCMD_CONVREQUEST 4
3224 +#define GDLM_REMCMD_LOCKREPLY 5
3225 +#define GDLM_REMCMD_LOCKGRANT 6
3226 +#define GDLM_REMCMD_SENDBAST 7
3227 +#define GDLM_REMCMD_SENDCAST 8
3228 +#define GDLM_REMCMD_REM_RESDATA 9
3229 +#define GDLM_REMCMD_RECOVERMESSAGE 20
3230 +#define GDLM_REMCMD_RECOVERREPLY 21
3231 +#define GDLM_REMCMD_QUERY 30
3232 +#define GDLM_REMCMD_QUERYREPLY 31
3234 +/* Set in rh_flags when this is the last block of
3235 + query information. Note this could also be the first
3237 +#define GDLM_REMFLAG_ENDQUERY 1
3239 +#ifdef CONFIG_DLM_STATS
3240 +struct dlm_statinfo
3242 + unsigned int cast;
3243 + unsigned int bast;
3244 + unsigned int lockops;
3245 + unsigned int unlockops;
3246 + unsigned int convertops;
3247 + unsigned long lockqueue_time[5];
3248 + unsigned long lockqueue_locks[5];
3250 +extern struct dlm_statinfo dlm_stats;
3257 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...);
3258 +void dlm_debug_dump(void);
3259 +void dlm_locks_dump(void);
3261 +#endif /* __DLM_INTERNAL_DOT_H__ */
3262 diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
3263 --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
3264 +++ linux-patched/cluster/dlm/lkb.c 2004-11-03 11:31:56.000000000 +0800
3266 +/******************************************************************************
3267 +*******************************************************************************
3269 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3270 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3272 +** This copyrighted material is made available to anyone wishing to use,
3273 +** modify, copy, or redistribute it subject to the terms and conditions
3274 +** of the GNU General Public License v.2.
3276 +*******************************************************************************
3277 +******************************************************************************/
3282 + * Allocate and free locks on the lock ID table.
3284 + * This is slightly naff but I don't really like the
3285 + * VMS lockidtbl stuff as it uses a realloced array
3286 + * to hold the locks in. I think this is slightly better
3289 + * Any better suggestions gratefully received. Patrick
3293 +#include "dlm_internal.h"
3294 +#include "lockqueue.h"
3296 +#include "config.h"
3298 +#include "memory.h"
3299 +#include "lockspace.h"
3303 + * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3306 +static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3308 + uint16_t bucket = lkid & 0xFFFF;
3309 + struct dlm_lkb *lkb;
3311 + if (bucket >= ls->ls_lkbtbl_size)
3314 + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){
3315 + if (lkb->lkb_id == lkid)
3323 + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3324 + * random number between 0 and lockidtbl_size-1. This random number specifies
3325 + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3326 + * assigned per-bucket id.
3328 + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3329 + * against the lkid of all lkb's in the bucket to avoid duplication.
3333 +struct dlm_lkb *create_lkb(struct dlm_ls *ls)
3335 + struct dlm_lkb *lkb;
3339 + lkb = allocate_lkb(ls);
3344 + get_random_bytes(&bucket, sizeof(bucket));
3345 + bucket &= (ls->ls_lkbtbl_size - 1);
3347 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3349 + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
3351 + if (__find_lock_by_id(ls, lkid)) {
3352 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3356 + lkb->lkb_id = lkid;
3357 + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
3358 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3364 + * Free LKB and remove it from the lockidtbl.
3365 + * NB - this always frees the lkb whereas release_rsb doesn't free an
3366 + * rsb unless its reference count is zero.
3369 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
3371 + uint16_t bucket = lkb->lkb_id & 0xFFFF;
3373 + if (lkb->lkb_status) {
3374 + log_error(ls, "release lkb with status %u", lkb->lkb_status);
3379 + if (lkb->lkb_parent)
3380 + atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3382 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3383 + list_del(&lkb->lkb_idtbl_list);
3384 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3386 + /* if this is not a master copy then lvbptr points into the user's
3387 + * lksb, so don't free it */
3388 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3389 + free_lvb(lkb->lkb_lvbptr);
3391 + if (lkb->lkb_range)
3392 + free_range(lkb->lkb_range);
3397 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3399 + struct dlm_lkb *lkb;
3400 + uint16_t bucket = lkid & 0xFFFF;
3402 + read_lock(&ls->ls_lkbtbl[bucket].lock);
3403 + lkb = __find_lock_by_id(ls, lkid);
3404 + read_unlock(&ls->ls_lkbtbl[bucket].lock);
3409 +struct dlm_lkb *dlm_get_lkb(void *lockspace, uint32_t lkid)
3411 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
3412 + struct dlm_lkb *lkb = find_lock_by_id(ls, lkid);
3413 + put_lockspace(ls);
3418 + * Initialise the range parts of an LKB.
3421 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end)
3423 + int ret = -ENOMEM;
3426 + * if this wasn't already a range lock, make it one
3428 + if (!lkb->lkb_range) {
3429 + lkb->lkb_range = allocate_range(lspace);
3430 + if (!lkb->lkb_range)
3434 + * This is needed for conversions that contain ranges where the
3435 + * original lock didn't but it's harmless for new locks too.
3437 + lkb->lkb_range[GR_RANGE_START] = 0LL;
3438 + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3441 + lkb->lkb_range[RQ_RANGE_START] = start;
3442 + lkb->lkb_range[RQ_RANGE_END] = end;
3449 diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3450 --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
3451 +++ linux-patched/cluster/dlm/lkb.h 2004-11-03 11:31:56.000000000 +0800
3453 +/******************************************************************************
3454 +*******************************************************************************
3456 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3457 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3459 +** This copyrighted material is made available to anyone wishing to use,
3460 +** modify, copy, or redistribute it subject to the terms and conditions
3461 +** of the GNU General Public License v.2.
3463 +*******************************************************************************
3464 +******************************************************************************/
3466 +#ifndef __LKB_DOT_H__
3467 +#define __LKB_DOT_H__
3469 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid);
3470 +struct dlm_lkb *create_lkb(struct dlm_ls *ls);
3471 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb);
3472 +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid);
3473 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end);
3475 +#endif /* __LKB_DOT_H__ */
3476 diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3477 --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
3478 +++ linux-patched/cluster/dlm/locking.c 2004-11-03 11:31:56.000000000 +0800
3480 +/******************************************************************************
3481 +*******************************************************************************
3483 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3484 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3486 +** This copyrighted material is made available to anyone wishing to use,
3487 +** modify, copy, or redistribute it subject to the terms and conditions
3488 +** of the GNU General Public License v.2.
3490 +*******************************************************************************
3491 +******************************************************************************/
3496 + * This is where the main work of the DLM goes on
3500 +#include "dlm_internal.h"
3501 +#include "lockqueue.h"
3502 +#include "locking.h"
3503 +#include "lockspace.h"
3508 +#include "memory.h"
3511 +#include "lowcomms.h"
3513 +extern struct list_head lslist;
3515 +#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3518 + * Lock compatibilty matrix - thanks Steve
3519 + * UN = Unlocked state. Not really a state, used as a flag
3520 + * PD = Padding. Used to make the matrix a nice power of two in size
3521 + * Other states are the same as the VMS DLM.
3522 + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3525 +#define modes_compat(gr, rq) \
3526 + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3528 +const int __dlm_compat_matrix[8][8] = {
3529 + /* UN NL CR CW PR PW EX PD */
3530 + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3531 + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3532 + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3533 + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3534 + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3535 + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3536 + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3537 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3541 + * Compatibility matrix for conversions with QUECVT set.
3542 + * Granted mode is the row; requested mode is the column.
3543 + * Usage: matrix[grmode+1][rqmode+1]
3546 +const int __quecvt_compat_matrix[8][8] = {
3547 + /* UN NL CR CW PR PW EX PD */
3548 + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3549 + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3550 + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3551 + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3552 + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3553 + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3554 + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3555 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3559 + * This defines the direction of transfer of LVB data.
3560 + * Granted mode is the row; requested mode is the column.
3561 + * Usage: matrix[grmode+1][rqmode+1]
3562 + * 1 = LVB is returned to the caller
3563 + * 0 = LVB is written to the resource
3564 + * -1 = nothing happens to the LVB
3567 +const int __lvb_operations[8][8] = {
3568 + /* UN NL CR CW PR PW EX PD*/
3569 + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3570 + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3571 + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3572 + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3573 + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3574 + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3575 + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3576 + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3579 +static void grant_lock(struct dlm_lkb *lkb, int send_remote);
3580 +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3581 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3582 +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
3583 + uint32_t flags, void *ast, void *astarg, void *bast,
3584 + struct dlm_range *range);
3585 +static int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb,
3586 + uint32_t flags, char *name, int namelen);
3589 +inline int dlm_modes_compat(int mode1, int mode2)
3591 + return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
3594 +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
3596 + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
3598 + if (lkb->lkb_id == first->lkb_id)
3605 + * Return 1 if the locks' ranges overlap
3606 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3609 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
3611 + if (!lkb1->lkb_range || !lkb2->lkb_range)
3614 + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3615 + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3622 + * "A conversion deadlock arises with a pair of lock requests in the converting
3623 + * queue for one resource. The granted mode of each lock blocks the requested
3624 + * mode of the other lock."
3627 +static struct dlm_lkb *conversion_deadlock_detect(struct dlm_rsb *rsb,
3628 + struct dlm_lkb *lkb)
3630 + struct dlm_lkb *this;
3632 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3636 + if (!ranges_overlap(lkb, this))
3639 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3647 + * Check if the given lkb conflicts with another lkb on the queue.
3650 +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
3652 + struct dlm_lkb *this;
3654 + list_for_each_entry(this, head, lkb_statequeue) {
3657 + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3664 + * Return 1 if the lock can be granted, 0 otherwise.
3665 + * Also detect and resolve conversion deadlocks.
3667 + * lkb is the lock to be granted
3669 + * now is 1 if the function is being called in the context of the
3670 + * immediate request, it is 0 if called later, after the lock has been
3673 + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
3676 +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
3678 + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
3681 + * 6-10: Version 5.4 introduced an option to address the phenomenon of
3682 + * a new request for a NL mode lock being blocked.
3684 + * 6-11: If the optional EXPEDITE flag is used with the new NL mode
3685 + * request, then it would be granted. In essence, the use of this flag
3686 + * tells the Lock Manager to expedite this request by not considering
3687 + * what may be in the CONVERTING or WAITING queues... As of this
3688 + * writing, the EXPEDITE flag can be used only with new requests for NL
3689 + * mode locks. This flag is not valid for conversion requests.
3691 + * A shortcut. Earlier checks return an error if EXPEDITE is used in a
3692 + * conversion or used with a non-NL requested mode. We also know an
3693 + * EXPEDITE request is always granted immediately, so now must always
3694 + * be 1. The full condition to grant an expedite request: (now &&
3695 + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
3696 + * therefore be shortened to just checking the flag.
3699 + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
3703 + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
3704 + * added to the remaining conditions.
3707 + if (queue_conflict(&r->res_grantqueue, lkb))
3711 + * 6-3: By default, a conversion request is immediately granted if the
3712 + * requested mode is compatible with the modes of all other granted
3716 + if (queue_conflict(&r->res_convertqueue, lkb))
3720 + * 6-5: But the default algorithm for deciding whether to grant or
3721 + * queue conversion requests does not by itself guarantee that such
3722 + * requests are serviced on a "first come first serve" basis. This, in
3723 + * turn, can lead to a phenomenon known as "indefinite postponement".
3725 + * 6-7: This issue is dealt with by using the optional QUECVT flag with
3726 + * the system service employed to request a lock conversion. This flag
3727 + * forces certain conversion requests to be queued, even if they are
3728 + * compatible with the granted modes of other locks on the same
3729 + * resource. Thus, the use of this flag results in conversion requests
3730 + * being ordered on a "first come first serve" basis.
3733 + if (now && conv && !(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3737 + * When using range locks the NOORDER flag is set to avoid the standard
3738 + * vms rules on grant order.
3741 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOORDER)
3745 + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
3746 + * granted until all other conversion requests ahead of it are granted
3747 + * and/or canceled.
3750 + if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
3754 + * 6-4: By default, a new request is immediately granted only if all
3755 + * three of the following conditions are satisfied when the request is
3757 + * - The queue of ungranted conversion requests for the resource is
3759 + * - The queue of ungranted new requests for the resource is empty.
3760 + * - The mode of the new request is compatible with the most
3761 + * restrictive mode of all granted locks on the resource.
3764 + if (now && !conv && list_empty(&r->res_convertqueue) &&
3765 + list_empty(&r->res_waitqueue))
3769 + * 6-4: Once a lock request is in the queue of ungranted new requests,
3770 + * it cannot be granted until the queue of ungranted conversion
3771 + * requests is empty, all ungranted new requests ahead of it are
3772 + * granted and/or canceled, and it is compatible with the granted mode
3773 + * of the most restrictive lock granted on the resource.
3776 + if (!now && !conv && list_empty(&r->res_convertqueue) &&
3777 + first_in_list(lkb, &r->res_waitqueue))
3782 + * The following, enabled by CONVDEADLK, departs from VMS.
3785 + if (now && conv && (lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK) &&
3786 + conversion_deadlock_detect(r, lkb)) {
3787 + lkb->lkb_grmode = DLM_LOCK_NL;
3788 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
3794 +int dlm_lock(void *lockspace,
3796 + struct dlm_lksb *lksb,
3799 + unsigned int namelen,
3801 + void (*ast) (void *astarg),
3803 + void (*bast) (void *astarg, int mode),
3804 + struct dlm_range *range)
3806 + struct dlm_ls *lspace;
3807 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3808 + int ret = -EINVAL;
3810 + lspace = find_lockspace_by_local_id(lockspace);
3814 + if (mode < 0 || mode > DLM_LOCK_EX)
3817 + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
3820 + if (flags & DLM_LKF_CANCEL)
3823 + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3826 + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
3829 + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
3832 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
3835 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3838 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3841 + if (flags & DLM_LKF_EXPEDITE && (mode != DLM_LOCK_NL))
3844 + if (!ast || !lksb)
3847 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3851 + * Take conversion path.
3854 + if (flags & DLM_LKF_CONVERT) {
3855 + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3860 +#ifdef CONFIG_DLM_STATS
3861 + dlm_stats.lockops++;
3864 + * Take new lock path.
3868 + down_read(&lspace->ls_unlock_sem);
3870 + parent_lkb = find_lock_by_id(lspace, parent);
3872 + if (!parent_lkb ||
3873 + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3874 + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3875 + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3876 + up_read(&lspace->ls_unlock_sem);
3880 + atomic_inc(&parent_lkb->lkb_childcnt);
3881 + up_read(&lspace->ls_unlock_sem);
3884 + down_read(&lspace->ls_in_recovery);
3888 + lkb = create_lkb(lspace);
3891 + lkb->lkb_astaddr = ast;
3892 + lkb->lkb_astparam = (long) astarg;
3893 + lkb->lkb_bastaddr = bast;
3894 + lkb->lkb_rqmode = mode;
3895 + lkb->lkb_grmode = DLM_LOCK_IV;
3896 + lkb->lkb_nodeid = -1;
3897 + lkb->lkb_lksb = lksb;
3898 + lkb->lkb_parent = parent_lkb;
3899 + lkb->lkb_lockqueue_flags = flags;
3900 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
3902 + if (!in_interrupt() && current)
3903 + lkb->lkb_ownpid = (int) current->pid;
3905 + lkb->lkb_ownpid = 0;
3908 + if (range->ra_start > range->ra_end) {
3913 + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3917 + /* Convert relevant flags to internal numbers */
3918 + if (flags & DLM_LKF_VALBLK)
3919 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3920 + if (flags & DLM_LKF_PERSISTENT)
3921 + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3922 + if (flags & DLM_LKF_NODLCKWT)
3923 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3925 + lksb->sb_lkid = lkb->lkb_id;
3927 + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3931 + up_read(&lspace->ls_in_recovery);
3935 + put_lockspace(lspace);
3939 + release_lkb(lspace, lkb);
3944 + atomic_dec(&parent_lkb->lkb_childcnt);
3947 + up_read(&lspace->ls_in_recovery);
3950 + put_lockspace(lspace);
3954 +int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, uint32_t flags,
3955 + char *name, int namelen)
3957 + struct dlm_rsb *rsb, *parent_rsb = NULL;
3958 + struct dlm_lkb *parent_lkb = lkb->lkb_parent;
3960 + int error, dir_error = 0;
3963 + parent_rsb = parent_lkb->lkb_resource;
3965 + error = find_rsb(ls, parent_rsb, name, namelen, CREATE, &rsb);
3968 + lkb->lkb_resource = rsb;
3969 + down_write(&rsb->res_lock);
3971 + log_debug(ls, "(%d) rq %u %x \"%s\"", lkb->lkb_ownpid, lkb->lkb_rqmode,
3972 + lkb->lkb_id, rsb->res_name);
3974 + * Next stage, do we need to find the master or can
3975 + * we get on with the real locking work ?
3979 + if (rsb->res_nodeid == -1) {
3980 + if (get_directory_nodeid(rsb) != our_nodeid()) {
3981 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3982 + up_write(&rsb->res_lock);
3986 + error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
3987 + rsb->res_length, &nodeid);
3989 + DLM_ASSERT(error == -EEXIST,);
3991 + dir_error = error;
3995 + if (nodeid == our_nodeid()) {
3996 + set_bit(RESFL_MASTER, &rsb->res_flags);
3997 + rsb->res_nodeid = 0;
3999 + clear_bit(RESFL_MASTER, &rsb->res_flags);
4000 + rsb->res_nodeid = nodeid;
4004 + log_all(ls, "dir lookup retry %x %u", lkb->lkb_id,
4009 + lkb->lkb_nodeid = rsb->res_nodeid;
4010 + up_write(&rsb->res_lock);
4012 + error = dlm_lock_stage2(ls, lkb, rsb, flags);
4018 + * Locking routine called after we have an RSB, either a copy of a remote one
4019 + * or a local one, or perhaps a shiny new one all of our very own
4022 +int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb,
4027 + DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb););
4029 + if (rsb->res_nodeid) {
4030 + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4031 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
4033 + dlm_lock_stage3(lkb);
4040 + * Called on an RSB's master node to do stage2 locking for a remote lock
4041 + * request. Returns a proper lkb with rsb ready for lock processing.
4042 + * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
4045 +struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls,
4046 + struct dlm_request *freq)
4048 + struct dlm_rsb *rsb = NULL, *parent_rsb = NULL;
4049 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
4050 + int error, namelen;
4052 + if (freq->rr_remparid) {
4053 + parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
4057 + atomic_inc(&parent_lkb->lkb_childcnt);
4058 + parent_rsb = parent_lkb->lkb_resource;
4062 + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
4063 + * node actually holding the (non-MSTCPY) lkb. AST address are just
4064 + * flags in the master copy.
4067 + lkb = create_lkb(ls);
4070 + lkb->lkb_grmode = DLM_LOCK_IV;
4071 + lkb->lkb_rqmode = freq->rr_rqmode;
4072 + lkb->lkb_parent = parent_lkb;
4073 + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
4074 + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
4075 + lkb->lkb_nodeid = remote_nodeid;
4076 + lkb->lkb_remid = freq->rr_header.rh_lkid;
4077 + lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
4078 + lkb->lkb_lockqueue_flags = freq->rr_flags;
4080 + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
4081 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
4082 + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
4083 + if (!lkb->lkb_lvbptr)
4087 + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
4088 + error = lkb_set_range(ls, lkb, freq->rr_range_start,
4089 + freq->rr_range_end);
4095 + * Get the RSB which this lock is for. Create a new RSB if this is a
4096 + * new lock on a new resource. We must be the master of any new rsb.
4099 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
4101 + error = find_rsb(ls, parent_rsb, freq->rr_name, namelen, MASTER, &rsb);
4106 + log_debug(ls, "send einval to %u", remote_nodeid);
4107 + /* print_name(freq->rr_name, namelen); */
4108 + lkb->lkb_retstatus = -EINVAL;
4112 + lkb->lkb_resource = rsb;
4114 + log_debug(ls, "(%d) rq %u from %u %x \"%s\"",
4115 + lkb->lkb_ownpid, lkb->lkb_rqmode, remote_nodeid,
4116 + lkb->lkb_id, rsb->res_name);
4122 + /* release_lkb handles parent */
4123 + release_lkb(ls, lkb);
4124 + parent_lkb = NULL;
4128 + atomic_dec(&parent_lkb->lkb_childcnt);
4134 + * The final bit of lock request processing on the master node. Here the lock
4135 + * is granted and the completion ast is queued, or the lock is put on the
4136 + * waitqueue and blocking asts are sent.
4139 +void dlm_lock_stage3(struct dlm_lkb *lkb)
4141 + struct dlm_rsb *rsb = lkb->lkb_resource;
4144 + * This is a locally mastered lock on a resource that already exists,
4145 + * see if it can be granted or if it must wait. When this function is
4146 + * called for a remote lock request (process_cluster_request,
4147 + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
4148 + * requesting node at the end of process_cluster_request, not at the
4149 + * end of grant_lock.
4152 + down_write(&rsb->res_lock);
4154 + if (can_be_granted(rsb, lkb, TRUE)) {
4155 + grant_lock(lkb, 0);
4160 + * This request is not a conversion, so the lkb didn't exist other than
4161 + * for this request and should be freed after EAGAIN is returned in the
4165 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4166 + lkb->lkb_retstatus = -EAGAIN;
4167 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4168 + send_blocking_asts_all(rsb, lkb);
4169 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4174 + * The requested lkb must wait. Because the rsb of the requested lkb
4175 + * is mastered here, send blocking asts for the lkb's blocking the
4179 + log_debug2("w %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4180 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4181 + lkb->lkb_status, rsb->res_name);
4183 + lkb->lkb_retstatus = 0;
4184 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4186 + send_blocking_asts(rsb, lkb);
4189 + up_write(&rsb->res_lock);
4192 +int dlm_unlock(void *lockspace,
4195 + struct dlm_lksb *lksb,
4198 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
4199 + struct dlm_lkb *lkb;
4200 + struct dlm_rsb *rsb;
4201 + int ret = -EINVAL;
4204 + log_print("dlm_unlock: lkid %x lockspace not found", lkid);
4208 + lkb = find_lock_by_id(ls, lkid);
4210 + log_debug(ls, "unlock %x no id", lkid);
4214 + /* Can't dequeue a master copy (a remote node's mastered lock) */
4215 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4216 + log_debug(ls, "(%d) unlock %x lkb_flags %x",
4217 + lkb->lkb_ownpid, lkid, lkb->lkb_flags);
4221 + /* Already waiting for a remote lock operation */
4222 + if (lkb->lkb_lockqueue_state) {
4223 + log_debug(ls, "(%d) unlock %x lq%d",
4224 + lkb->lkb_ownpid, lkid, lkb->lkb_lockqueue_state);
4229 +#ifdef CONFIG_DLM_STATS
4230 + dlm_stats.unlockops++;
4232 + /* Can only cancel WAITING or CONVERTing locks.
4233 + * This is just a quick check - it is also checked in unlock_stage2()
4234 + * (which may be on the master) under the semaphore.
4236 + if ((flags & DLM_LKF_CANCEL) &&
4237 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
4238 + log_debug(ls, "(%d) unlock %x %x %d",
4239 + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4243 + /* "Normal" unlocks must operate on a granted lock */
4244 + if (!(flags & DLM_LKF_CANCEL) &&
4245 + (lkb->lkb_status != GDLM_LKSTS_GRANTED)) {
4246 + log_debug(ls, "(%d) unlock %x %x %d",
4247 + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4251 + if (lkb->lkb_flags & GDLM_LKFLG_DELETED) {
4252 + log_debug(ls, "(%d) unlock deleted %x %x %d",
4253 + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4257 + down_write(&ls->ls_unlock_sem);
4258 + /* Can't dequeue a lock with sublocks */
4259 + if (atomic_read(&lkb->lkb_childcnt)) {
4260 + up_write(&ls->ls_unlock_sem);
4264 + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
4265 + if (!(flags & DLM_LKF_CANCEL))
4266 + lkb->lkb_flags |= GDLM_LKFLG_DELETED;
4267 + up_write(&ls->ls_unlock_sem);
4269 + down_read(&ls->ls_in_recovery);
4270 + rsb = find_rsb_to_unlock(ls, lkb);
4272 + log_debug(ls, "(%d) un %x %x %d %d \"%s\"",
4280 + /* Save any new params */
4282 + lkb->lkb_lksb = lksb;
4283 + lkb->lkb_astparam = (long) astarg;
4284 + lkb->lkb_lockqueue_flags = flags;
4286 + if (lkb->lkb_nodeid)
4287 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
4289 + ret = dlm_unlock_stage2(lkb, rsb, flags);
4290 + up_read(&ls->ls_in_recovery);
4295 + put_lockspace(ls);
4299 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags)
4301 + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
4304 + down_write(&rsb->res_lock);
4306 + /* Can only cancel WAITING or CONVERTing locks */
4307 + if ((flags & DLM_LKF_CANCEL) &&
4308 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
4309 + lkb->lkb_retstatus = -EINVAL;
4310 + queue_ast(lkb, AST_COMP, 0);
4314 + log_debug2("u %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4315 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4316 + lkb->lkb_status, rsb->res_name);
4318 + old_status = lkb_dequeue(lkb);
4321 + * Cancelling a conversion
4324 + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
4325 + /* VMS semantics say we should send blocking ASTs again here */
4326 + send_blocking_asts(rsb, lkb);
4328 + /* Remove from deadlock detection */
4329 + if (lkb->lkb_duetime)
4330 + remove_from_deadlockqueue(lkb);
4332 + /* Stick it back on the granted queue */
4333 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4334 + lkb->lkb_rqmode = lkb->lkb_grmode;
4336 + /* Was it blocking any other locks? */
4337 + if (first_in_list(lkb, &rsb->res_convertqueue))
4338 + grant_pending_locks(rsb);
4340 + lkb->lkb_retstatus = -DLM_ECANCEL;
4341 + queue_ast(lkb, AST_COMP, 0);
4346 + * If was granted grant any converting or waiting locks
4347 + * and save or clear lvb
4350 + if (old_status == GDLM_LKSTS_GRANTED) {
4351 + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
4352 + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
4353 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr,
4355 + if (flags & DLM_LKF_IVVALBLK)
4356 + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4359 + grant_pending_locks(rsb);
4361 + DLM_ASSERT(0, print_lkb(lkb); print_rsb(rsb););
4363 + lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
4366 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4368 + up_write(&rsb->res_lock);
4369 + release_lkb(rsb->res_ls, lkb);
4375 + up_write(&rsb->res_lock);
4385 +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
4386 + uint32_t flags, void *ast, void *astarg, void *bast,
4387 + struct dlm_range *range)
4389 + struct dlm_lkb *lkb;
4390 + struct dlm_rsb *rsb;
4391 + int ret = -EINVAL;
4393 + lkb = find_lock_by_id(ls, lksb->sb_lkid);
4398 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4403 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4407 + if ((flags & DLM_LKF_QUECVT) &&
4408 + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4412 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4416 +#ifdef CONFIG_DLM_STATS
4417 + dlm_stats.convertops++;
4419 + /* Set up the ranges as appropriate */
4421 + if (range->ra_start > range->ra_end)
4424 + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4430 + rsb = lkb->lkb_resource;
4431 + down_read(&ls->ls_in_recovery);
4433 + log_debug(ls, "(%d) cv %u %x \"%s\"", lkb->lkb_ownpid, mode,
4434 + lkb->lkb_id, rsb->res_name);
4436 + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4437 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4439 + if (flags & DLM_LKF_NODLCKWT)
4440 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4441 + lkb->lkb_astaddr = ast;
4442 + lkb->lkb_astparam = (long) astarg;
4443 + lkb->lkb_bastaddr = bast;
4444 + lkb->lkb_rqmode = mode;
4445 + lkb->lkb_lockqueue_flags = flags;
4446 + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4447 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
4449 + if (rsb->res_nodeid) {
4450 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4451 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4453 + ret = dlm_convert_stage2(lkb, FALSE);
4456 + up_read(&ls->ls_in_recovery);
4465 + * For local conversion requests on locally mastered locks this is called
4466 + * directly from dlm_lock/convert_lock. This function is also called for
4467 + * remote conversion requests of MSTCPY locks (from process_cluster_request).
4470 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast)
4472 + struct dlm_rsb *rsb = lkb->lkb_resource;
4475 + down_write(&rsb->res_lock);
4477 + if (can_be_granted(rsb, lkb, TRUE)) {
4478 + grant_lock(lkb, 0);
4479 + grant_pending_locks(rsb);
4483 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4484 + ret = lkb->lkb_retstatus = -EAGAIN;
4486 + queue_ast(lkb, AST_COMP, 0);
4487 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4488 + send_blocking_asts_all(rsb, lkb);
4492 + log_debug2("c %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4493 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4494 + lkb->lkb_status, rsb->res_name);
4496 + lkb->lkb_retstatus = 0;
4497 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4500 + * The granted mode may have been reduced to NL by conversion deadlock
4501 + * avoidance in can_be_granted(). If so, try to grant other locks.
4504 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
4505 + grant_pending_locks(rsb);
4507 + send_blocking_asts(rsb, lkb);
4509 + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4510 + add_to_deadlockqueue(lkb);
4513 + up_write(&rsb->res_lock);
4518 + * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4519 + * completion ast. rsb res_lock must be held in write when this is called.
4522 +static void grant_lock(struct dlm_lkb *lkb, int send_remote)
4524 + struct dlm_rsb *rsb = lkb->lkb_resource;
4526 + if (lkb->lkb_duetime)
4527 + remove_from_deadlockqueue(lkb);
4529 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4531 + DLM_ASSERT(lkb->lkb_lvbptr,);
4533 + if (!rsb->res_lvbptr)
4534 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4536 + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4538 + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4540 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4543 + if (lkb->lkb_range) {
4544 + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4545 + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4548 + log_debug2("g %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4549 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4550 + lkb->lkb_status, rsb->res_name);
4552 + if (lkb->lkb_grmode != lkb->lkb_rqmode) {
4553 + lkb->lkb_grmode = lkb->lkb_rqmode;
4554 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4556 + lkb->lkb_rqmode = DLM_LOCK_IV;
4557 + lkb->lkb_highbast = 0;
4558 + lkb->lkb_retstatus = 0;
4559 + queue_ast(lkb, AST_COMP, 0);
4562 + * A remote conversion request has been granted, either immediately
4563 + * upon being requested or after waiting a bit. In the former case,
4564 + * reply_and_grant() is called. In the latter case send_remote is 1 and
4565 + * remote_grant() is called.
4567 + * The "send_remote" flag is set only for locks which are granted "out
4568 + * of band" - ie by another lock being converted or unlocked.
4570 + * The second case occurs when this lkb is granted right away as part
4571 + * of processing the initial request. In that case, we send a single
4572 + * message in reply_and_grant which combines the request reply with the
4576 + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4578 + remote_grant(lkb);
4579 + else if (lkb->lkb_request)
4580 + reply_and_grant(lkb);
4585 +static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb)
4587 + struct dlm_lkb *gr;
4589 + list_for_each_entry(gr, head, lkb_statequeue) {
4590 + if (gr->lkb_bastaddr &&
4591 + gr->lkb_highbast < lkb->lkb_rqmode &&
4592 + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4593 + queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4594 + gr->lkb_highbast = lkb->lkb_rqmode;
4600 + * Notify granted locks if they are blocking a newly forced-to-wait lock.
4603 +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4605 + send_bast_queue(&rsb->res_grantqueue, lkb);
4606 + /* check if the following improves performance */
4607 + /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4610 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4612 + send_bast_queue(&rsb->res_grantqueue, lkb);
4613 + send_bast_queue(&rsb->res_convertqueue, lkb);
4617 + * Called when a lock has been dequeued. Look for any locks to grant that are
4618 + * waiting for conversion or waiting to be granted.
4619 + * The rsb res_lock must be held in write when this function is called.
4622 +int grant_pending_locks(struct dlm_rsb *r)
4624 + struct dlm_lkb *lkb, *s;
4625 + int8_t high = DLM_LOCK_IV;
4627 + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
4628 + if (can_be_granted(r, lkb, FALSE))
4629 + grant_lock(lkb, 1);
4631 + high = MAX(lkb->lkb_rqmode, high);
4634 + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
4635 + if (lkb->lkb_lockqueue_state)
4638 + if (can_be_granted(r, lkb, FALSE))
4639 + grant_lock(lkb, 1);
4641 + high = MAX(lkb->lkb_rqmode, high);
4645 + * If there are locks left on the wait/convert queue then send blocking
4646 + * ASTs to granted locks that are blocking
4648 + * FIXME: This might generate some spurious blocking ASTs for range
4652 + if (high > DLM_LOCK_IV) {
4653 + list_for_each_entry_safe(lkb, s, &r->res_grantqueue,
4655 + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
4656 + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4657 + queue_ast(lkb, AST_BAST, high);
4658 + lkb->lkb_highbast = high;
4667 + * Called to cancel a locking operation that failed due to some internal
4670 + * Waiting locks will be removed, converting locks will be reverted to their
4671 + * granted status, unlocks will be left where they are.
4673 + * A completion AST will be delivered to the caller.
4676 +int cancel_lockop(struct dlm_lkb *lkb, int status)
4678 + int state = lkb->lkb_lockqueue_state;
4679 + uint16_t astflags = AST_COMP;
4681 + lkb->lkb_lockqueue_state = 0;
4684 + case GDLM_LQSTATE_WAIT_RSB:
4685 + astflags |= AST_DEL;
4688 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4689 + res_lkb_dequeue(lkb);
4690 + astflags |= AST_DEL;
4693 + case GDLM_LQSTATE_WAIT_CONVERT:
4694 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4696 + /* Remove from deadlock detection */
4697 + if (lkb->lkb_duetime) {
4698 + remove_from_deadlockqueue(lkb);
4702 + case GDLM_LQSTATE_WAIT_UNLOCK:
4703 + /* We can leave this. I think.... */
4707 + lkb->lkb_retstatus = status;
4708 + queue_ast(lkb, astflags, 0);
4714 + * Check for conversion deadlock. If a deadlock was found
4715 + * return lkb to kill, else return NULL
4718 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb)
4720 + struct dlm_rsb *rsb = lkb->lkb_resource;
4721 + struct list_head *entry;
4723 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4725 + /* Work our way up to the head of the queue looking for locks that
4726 + * conflict with us */
4728 + down_read(&rsb->res_lock);
4730 + entry = lkb->lkb_statequeue.prev;
4731 + while (entry != &rsb->res_convertqueue) {
4732 + struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue);
4734 + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4735 + up_read(&rsb->res_lock);
4738 + entry = entry->prev;
4740 + up_read(&rsb->res_lock);
4746 + * Conversion operation was cancelled by us (not the user).
4747 + * ret contains the return code to pass onto the user
4750 +void cancel_conversion(struct dlm_lkb *lkb, int ret)
4752 + struct dlm_rsb *rsb = lkb->lkb_resource;
4754 + /* Stick it back on the granted queue */
4755 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4756 + lkb->lkb_rqmode = lkb->lkb_grmode;
4758 + remove_from_deadlockqueue(lkb);
4760 + lkb->lkb_retstatus = ret;
4761 + queue_ast(lkb, AST_COMP, 0);
4766 + * As new master of the rsb for this lkb, we need to handle these requests
4767 + * removed from the lockqueue and originating from local processes:
4768 + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4769 + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4772 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state)
4774 + struct dlm_rsb *rsb;
4777 + case GDLM_LQSTATE_WAIT_RSB:
4778 + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4779 + lkb->lkb_lockqueue_flags,
4780 + lkb->lkb_resource->res_name,
4781 + lkb->lkb_resource->res_length);
4784 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4785 + res_lkb_dequeue(lkb);
4786 + dlm_lock_stage3(lkb);
4789 + case GDLM_LQSTATE_WAIT_UNLOCK:
4790 + rsb = find_rsb_to_unlock(ls, lkb);
4791 + dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags);
4794 + case GDLM_LQSTATE_WAIT_CONVERT:
4795 + dlm_convert_stage2(lkb, TRUE);
4803 +static void dump_queue(struct list_head *head, char *qname)
4805 + struct dlm_lkb *lkb;
4807 + list_for_each_entry(lkb, head, lkb_statequeue) {
4808 + printk("%s %08x gr %d rq %d flg %x sts %u node %u remid %x "
4818 + lkb->lkb_lockqueue_state,
4819 + lkb->lkb_lockqueue_flags);
4823 +static void dump_rsb(struct dlm_rsb *rsb)
4825 + printk("name \"%s\" flags %lx nodeid %d ref %u\n",
4826 + rsb->res_name, rsb->res_flags, rsb->res_nodeid,
4827 + atomic_read(&rsb->res_ref));
4829 + if (!list_empty(&rsb->res_grantqueue))
4830 + dump_queue(&rsb->res_grantqueue, "G");
4832 + if (!list_empty(&rsb->res_convertqueue))
4833 + dump_queue(&rsb->res_convertqueue, "C");
4835 + if (!list_empty(&rsb->res_waitqueue))
4836 + dump_queue(&rsb->res_waitqueue, "W");
4839 +void dlm_locks_dump(void)
4841 + struct dlm_ls *ls;
4842 + struct dlm_rsb *rsb;
4843 + struct list_head *head;
4846 + lowcomms_stop_accept();
4848 + list_for_each_entry(ls, &lslist, ls_list) {
4849 + down_write(&ls->ls_in_recovery);
4850 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
4851 + head = &ls->ls_rsbtbl[i].list;
4852 + list_for_each_entry(rsb, head, res_hashchain)
4858 diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4859 --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
4860 +++ linux-patched/cluster/dlm/locking.h 2004-11-03 11:31:56.000000000 +0800
4862 +/******************************************************************************
4863 +*******************************************************************************
4865 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4866 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4868 +** This copyrighted material is made available to anyone wishing to use,
4869 +** modify, copy, or redistribute it subject to the terms and conditions
4870 +** of the GNU General Public License v.2.
4872 +*******************************************************************************
4873 +******************************************************************************/
4875 +#ifndef __LOCKING_DOT_H__
4876 +#define __LOCKING_DOT_H__
4878 +int dlm_modes_compat(int mode1, int mode2);
4879 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state);
4880 +void dlm_lock_stage3(struct dlm_lkb *lkb);
4881 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast);
4882 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4883 +int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4884 +struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen);
4885 +int free_rsb_if_unused(struct dlm_rsb *rsb);
4886 +struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace,
4887 + struct dlm_request *freq);
4888 +int cancel_lockop(struct dlm_lkb *lkb, int status);
4889 +int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags);
4890 +int grant_pending_locks(struct dlm_rsb *rsb);
4891 +void cancel_conversion(struct dlm_lkb *lkb, int ret);
4892 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb);
4894 +#endif /* __LOCKING_DOT_H__ */
4895 diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4896 --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4897 +++ linux-patched/cluster/dlm/lockqueue.c 2004-11-03 11:31:56.000000000 +0800
4899 +/******************************************************************************
4900 +*******************************************************************************
4902 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4903 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4905 +** This copyrighted material is made available to anyone wishing to use,
4906 +** modify, copy, or redistribute it subject to the terms and conditions
4907 +** of the GNU General Public License v.2.
4909 +*******************************************************************************
4910 +******************************************************************************/
4915 + * This controls the lock queue, which is where locks
4916 + * come when they need to wait for a remote operation
4919 + * This could also be thought of as the "high-level" comms
4924 +#include "dlm_internal.h"
4925 +#include "lockqueue.h"
4927 +#include "locking.h"
4929 +#include "lowcomms.h"
4930 +#include "midcomms.h"
4931 +#include "reccomms.h"
4933 +#include "lockspace.h"
4935 +#include "memory.h"
4937 +#include "queries.h"
4940 +static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply);
4941 +static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req);
4944 + * format of an entry on the request queue
4947 + struct list_head rqe_list;
4948 + uint32_t rqe_nodeid;
4949 + char rqe_request[1];
4953 + * Add a new request (if appropriate) to the request queue and send the remote
4954 + * request out. - runs in the context of the locking caller
4956 + * Recovery of a remote_stage request if the remote end fails while the lkb
4957 + * is still on the lockqueue:
4959 + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4960 + * lockqueue_lkb_mark() at the start of recovery.
4962 + * o Some lkb's will be rebuilt on new master rsb's during recovery.
4963 + * (depends on the type of request, see below).
4965 + * o At the end of recovery, resend_cluster_requests() looks at these
4966 + * LQRESEND lkb's and either:
4968 + * i) resends the request to the new master for the rsb where the
4969 + * request is processed as usual. The lkb remains on the lockqueue until
4970 + * the new master replies and we run process_lockqueue_reply().
4972 + * ii) if we've become the rsb master, removes the lkb from the lockqueue
4973 + * and processes the request locally via process_remastered_lkb().
4975 + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4976 + * and the request should be resent if dest node is failed.
4978 + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4979 + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4980 + * makes send_lkb_queue() skip it). Resend this request to the new master.
4982 + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4983 + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4984 + * Resend this request to the new master.
4986 + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4987 + * It will be rebuilt on the new master rsb's granted queue. Resend this
4988 + * request to the new master.
4991 +int remote_stage(struct dlm_lkb *lkb, int state)
4995 + lkb->lkb_lockqueue_state = state;
4996 + add_to_lockqueue(lkb);
4998 + error = send_cluster_request(lkb, state);
5000 + log_error(lkb->lkb_resource->res_ls, "remote_stage error %d %x",
5001 + error, lkb->lkb_id);
5002 + /* Leave on lockqueue, it will be resent to correct node during
5009 + * Requests received while the lockspace is in recovery get added to the
5010 + * request queue and processed when recovery is complete.
5013 +void add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
5015 + struct rq_entry *entry;
5016 + int length = hd->rh_length;
5018 + if (test_bit(LSFL_REQUEST_WARN, &ls->ls_flags))
5019 + log_error(ls, "request during recovery from %u", nodeid);
5021 + if (in_nodes_gone(ls, nodeid))
5024 + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
5026 + // TODO something better
5027 + printk("dlm: add_to_requestqueue: out of memory\n");
5031 + log_debug(ls, "add_to_requestq cmd %d fr %d", hd->rh_cmd, nodeid);
5032 + entry->rqe_nodeid = nodeid;
5033 + memcpy(entry->rqe_request, hd, length);
5035 + down(&ls->ls_requestqueue_lock);
5036 + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
5037 + up(&ls->ls_requestqueue_lock);
5040 +int process_requestqueue(struct dlm_ls *ls)
5042 + int error = 0, count = 0;
5043 + struct rq_entry *entry;
5044 + struct dlm_header *hd;
5046 + log_all(ls, "process held requests");
5048 + down(&ls->ls_requestqueue_lock);
5051 + if (list_empty(&ls->ls_requestqueue)) {
5052 + up(&ls->ls_requestqueue_lock);
5057 + entry = list_entry(ls->ls_requestqueue.next, struct rq_entry,
5059 + up(&ls->ls_requestqueue_lock);
5060 + hd = (struct dlm_header *) entry->rqe_request;
5062 + log_debug(ls, "process_requestq cmd %d fr %u", hd->rh_cmd,
5063 + entry->rqe_nodeid);
5065 + error = process_cluster_request(entry->rqe_nodeid, hd, TRUE);
5066 + if (error == -EINTR) {
5067 + /* entry is left on requestqueue */
5068 + log_debug(ls, "process_requestqueue abort eintr");
5072 + down(&ls->ls_requestqueue_lock);
5073 + list_del(&entry->rqe_list);
5077 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5078 + log_debug(ls, "process_requestqueue abort ls_run");
5079 + up(&ls->ls_requestqueue_lock);
5085 + log_all(ls, "processed %d requests", count);
5089 +void wait_requestqueue(struct dlm_ls *ls)
5092 + down(&ls->ls_requestqueue_lock);
5093 + if (list_empty(&ls->ls_requestqueue))
5095 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
5097 + up(&ls->ls_requestqueue_lock);
5100 + up(&ls->ls_requestqueue_lock);
5104 + * Resdir requests (lookup or remove) and replies from before recovery are
5105 + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
5106 + * gone are also invalid.
5109 +void purge_requestqueue(struct dlm_ls *ls)
5112 + struct rq_entry *entry, *safe;
5113 + struct dlm_header *hd;
5114 + struct dlm_lkb *lkb;
5116 + log_all(ls, "purge requests");
5118 + down(&ls->ls_requestqueue_lock);
5120 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
5121 + hd = (struct dlm_header *) entry->rqe_request;
5123 + if (hd->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
5124 + hd->rh_cmd == GDLM_REMCMD_LOOKUP ||
5125 + in_nodes_gone(ls, entry->rqe_nodeid)) {
5127 + list_del(&entry->rqe_list);
5131 + } else if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
5134 + * Replies to resdir lookups are invalid and must be
5135 + * purged. The lookup requests are marked in
5136 + * lockqueue_lkb_mark and will be resent in
5137 + * resend_cluster_requests. The only way to check if
5138 + * this is a lookup reply is to look at the
5139 + * lockqueue_state of the lkb.
5142 + lkb = find_lock_by_id(ls, hd->rh_lkid);
5144 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
5145 + list_del(&entry->rqe_list);
5151 + up(&ls->ls_requestqueue_lock);
5153 + log_all(ls, "purged %d requests", count);
5157 + * Check if there's a reply for the given lkid in the requestqueue.
5160 +int reply_in_requestqueue(struct dlm_ls *ls, int lkid)
5163 + struct rq_entry *entry;
5164 + struct dlm_header *hd;
5166 + down(&ls->ls_requestqueue_lock);
5168 + list_for_each_entry(entry, &ls->ls_requestqueue, rqe_list) {
5169 + hd = (struct dlm_header *) entry->rqe_request;
5170 + if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY && hd->rh_lkid == lkid){
5171 + log_debug(ls, "reply_in_requestq cmd %d fr %d id %x",
5172 + hd->rh_cmd, entry->rqe_nodeid, lkid);
5177 + up(&ls->ls_requestqueue_lock);
5182 +void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src)
5185 + *lvbptr = allocate_lvb(ls);
5187 + memcpy(*lvbptr, src, DLM_LVB_LEN);
5191 + * Process a lockqueue LKB after it has had its remote processing complete and
5192 + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread
5193 + * on the machine that requested the lock.
5196 +static void process_lockqueue_reply(struct dlm_lkb *lkb,
5197 + struct dlm_reply *reply,
5200 + struct dlm_rsb *rsb = lkb->lkb_resource;
5201 + struct dlm_ls *ls = rsb->res_ls;
5202 + int oldstate, state = lkb->lkb_lockqueue_state;
5205 + remove_from_lockqueue(lkb);
5208 + case GDLM_LQSTATE_WAIT_RSB:
5210 + if (reply->rl_status) {
5211 + DLM_ASSERT(reply->rl_status == -EEXIST,);
5212 + if (rsb->res_nodeid == -1) {
5214 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
5218 + if (reply->rl_nodeid == our_nodeid()) {
5219 + set_bit(RESFL_MASTER, &rsb->res_flags);
5220 + rsb->res_nodeid = 0;
5222 + clear_bit(RESFL_MASTER, &rsb->res_flags);
5223 + rsb->res_nodeid = reply->rl_nodeid;
5227 + log_debug(ls, "(%d) lu rep %x fr %u %u", lkb->lkb_ownpid,
5228 + lkb->lkb_id, nodeid,
5231 + lkb->lkb_nodeid = rsb->res_nodeid;
5232 + dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags);
5235 + case GDLM_LQSTATE_WAIT_CONVERT:
5236 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5239 + * the destination wasn't the master
5240 + * this implies the request was a CONDGRANT
5243 + if (reply->rl_status == -EINVAL) {
5244 + int master_nodeid;
5246 + DLM_ASSERT(state == GDLM_LQSTATE_WAIT_CONDGRANT, );
5248 + log_debug(ls, "(%d) req reply einval %x fr %d r %d %s",
5249 + lkb->lkb_ownpid, lkb->lkb_id, nodeid,
5250 + rsb->res_nodeid, rsb->res_name);
5254 + if (rsb->res_nodeid == lkb->lkb_nodeid || rsb->res_nodeid == -1){
5256 + * We need to re-lookup the master and resend our
5260 + lkb->lkb_nodeid = -1;
5261 + rsb->res_nodeid = -1;
5263 + if (get_directory_nodeid(rsb) != our_nodeid())
5264 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
5266 + int error = dlm_dir_lookup(ls, our_nodeid(),
5270 + if (error == -EEXIST) {
5271 + /* don't expect this will happen */
5272 + log_all(ls, "EEXIST %x", lkb->lkb_id);
5277 + if (master_nodeid == our_nodeid()) {
5278 + set_bit(RESFL_MASTER, &rsb->res_flags);
5279 + master_nodeid = 0;
5281 + clear_bit(RESFL_MASTER,&rsb->res_flags);
5283 + rsb->res_nodeid = master_nodeid;
5284 + lkb->lkb_nodeid = master_nodeid;
5286 + dlm_lock_stage2(ls, lkb, rsb,
5287 + lkb->lkb_lockqueue_flags);
5291 + * Another request on this rsb has since found
5292 + * the master, we'll use that one although it too
5293 + * may be invalid requiring us to retry again.
5296 + lkb->lkb_nodeid = rsb->res_nodeid;
5297 + dlm_lock_stage2(ls, lkb, rsb,
5298 + lkb->lkb_lockqueue_flags);
5306 + * After a remote lock/conversion/grant request we put the lock
5307 + * on the right queue and send an AST if appropriate. Any lock
5308 + * shuffling (eg newly granted locks because this one was
5309 + * converted downwards) will be dealt with in separate messages
5310 + * (which may be in the same network message)
5313 + if (!lkb->lkb_remid)
5314 + lkb->lkb_remid = reply->rl_lkid;
5317 + * The remote request failed (we assume because of NOQUEUE).
5318 + * If this is a new request (non-conv) the lkb was created just
5319 + * for it so the lkb should be freed. If this was a
5320 + * conversion, the lkb already existed so we should put it back
5321 + * on the grant queue.
5324 + if (reply->rl_status != 0) {
5325 + DLM_ASSERT(reply->rl_status == -EAGAIN,);
5327 + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
5328 + res_lkb_dequeue(lkb);
5329 + lkb->lkb_retstatus = reply->rl_status;
5330 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
5332 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5333 + lkb->lkb_retstatus = reply->rl_status;
5334 + queue_ast(lkb, AST_COMP, 0);
5340 + * The remote request was successful in granting the request or
5341 + * queuing it to be granted later. Add the lkb to the
5342 + * appropriate rsb queue.
5345 + switch (reply->rl_lockstate) {
5346 + case GDLM_LKSTS_GRANTED:
5348 + /* Compact version of grant_lock(). */
5350 + down_write(&rsb->res_lock);
5351 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5352 + memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
5355 + lkb->lkb_grmode = lkb->lkb_rqmode;
5356 + lkb->lkb_rqmode = DLM_LOCK_IV;
5357 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5359 + if (lkb->lkb_range) {
5360 + lkb->lkb_range[GR_RANGE_START] =
5361 + lkb->lkb_range[RQ_RANGE_START];
5362 + lkb->lkb_range[GR_RANGE_END] =
5363 + lkb->lkb_range[RQ_RANGE_END];
5365 + up_write(&rsb->res_lock);
5367 + lkb->lkb_retstatus = 0;
5368 + queue_ast(lkb, AST_COMP, 0);
5371 + case GDLM_LKSTS_WAITING:
5373 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5374 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
5376 + log_error(ls, "wait reply for granted %x %u",
5377 + lkb->lkb_id, lkb->lkb_nodeid);
5380 + case GDLM_LKSTS_CONVERT:
5382 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5383 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
5385 + log_error(ls, "convert reply for granted %x %u",
5386 + lkb->lkb_id, lkb->lkb_nodeid);
5390 + log_error(ls, "process_lockqueue_reply state %d",
5391 + reply->rl_lockstate);
5396 + case GDLM_LQSTATE_WAIT_UNLOCK:
5399 + * Unlocks should never fail. Update local lock info. This
5400 + * always sends completion AST with status in lksb
5403 + DLM_ASSERT(reply->rl_status == 0,);
5404 + oldstate = res_lkb_dequeue(lkb);
5406 + /* Differentiate between unlocks and conversion cancellations */
5407 + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL) {
5408 + if (oldstate == GDLM_LKSTS_CONVERT) {
5409 + res_lkb_enqueue(lkb->lkb_resource, lkb,
5410 + GDLM_LKSTS_GRANTED);
5411 + lkb->lkb_retstatus = -DLM_ECANCEL;
5412 + queue_ast(lkb, AST_COMP, 0);
5414 + log_error(ls, "cancel state %d", oldstate);
5416 + DLM_ASSERT(oldstate == GDLM_LKSTS_GRANTED,
5419 + lkb->lkb_retstatus = -DLM_EUNLOCK;
5420 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
5425 + log_error(ls, "process_lockqueue_reply id %x state %d",
5426 + lkb->lkb_id, state);
5431 + * Tell a remote node to grant a lock. This happens when we are the master
5432 + * copy for a lock that is actually held on a remote node. The remote end is
5433 + * also responsible for sending the completion AST.
5436 +void remote_grant(struct dlm_lkb *lkb)
5438 + struct writequeue_entry *e;
5439 + struct dlm_request *req;
5441 + // TODO Error handling
5442 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
5443 + sizeof(struct dlm_request),
5444 + lkb->lkb_resource->res_ls->ls_allocation,
5449 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
5450 + req->rr_header.rh_length = sizeof(struct dlm_request);
5451 + req->rr_header.rh_flags = 0;
5452 + req->rr_header.rh_lkid = lkb->lkb_id;
5453 + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
5454 + req->rr_remlkid = lkb->lkb_remid;
5455 + req->rr_flags = 0;
5457 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
5458 + /* This is a confusing non-standard use of rr_flags which is
5459 + * usually used to pass lockqueue_flags. */
5460 + req->rr_flags |= GDLM_LKFLG_DEMOTED;
5463 + add_request_lvb(lkb, req);
5464 + midcomms_send_buffer(&req->rr_header, e);
5467 +void reply_and_grant(struct dlm_lkb *lkb)
5469 + struct dlm_request *req = lkb->lkb_request;
5470 + struct dlm_reply *reply;
5471 + struct writequeue_entry *e;
5473 + // TODO Error handling
5474 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
5475 + sizeof(struct dlm_reply),
5476 + lkb->lkb_resource->res_ls->ls_allocation,
5477 + (char **) &reply);
5481 + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5482 + reply->rl_header.rh_flags = 0;
5483 + reply->rl_header.rh_length = sizeof(struct dlm_reply);
5484 + reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5485 + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5487 + reply->rl_status = lkb->lkb_retstatus;
5488 + reply->rl_lockstate = lkb->lkb_status;
5489 + reply->rl_lkid = lkb->lkb_id;
5491 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
5493 + lkb->lkb_request = NULL;
5495 + add_reply_lvb(lkb, reply);
5496 + midcomms_send_buffer(&reply->rl_header, e);
5500 + * Request removal of a dead entry in the resource directory
5503 +void remote_remove_direntry(struct dlm_ls *ls, int nodeid, char *name,
5506 + struct writequeue_entry *e;
5507 + struct dlm_request *req;
5509 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5510 + struct dlm_rcom *rc = allocate_rcom_buffer(ls);
5512 + memcpy(rc->rc_buf, name, namelen);
5513 + rc->rc_datalen = namelen;
5515 + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5517 + free_rcom_buffer(rc);
5520 + // TODO Error handling
5521 + e = lowcomms_get_buffer(nodeid,
5522 + sizeof(struct dlm_request) + namelen - 1,
5523 + ls->ls_allocation, (char **) &req);
5527 + memset(req, 0, sizeof(struct dlm_request) + namelen - 1);
5528 + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5529 + req->rr_header.rh_length =
5530 + sizeof(struct dlm_request) + namelen - 1;
5531 + req->rr_header.rh_flags = 0;
5532 + req->rr_header.rh_lkid = 0;
5533 + req->rr_header.rh_lockspace = ls->ls_global_id;
5534 + req->rr_remlkid = 0;
5535 + memcpy(req->rr_name, name, namelen);
5537 + midcomms_send_buffer(&req->rr_header, e);
5541 + * Send remote cluster request to directory or master node before the request
5542 + * is put on the lock queue. Runs in the context of the locking caller.
5545 +int send_cluster_request(struct dlm_lkb *lkb, int state)
5547 + uint32_t target_nodeid;
5548 + struct dlm_rsb *rsb = lkb->lkb_resource;
5549 + struct dlm_ls *ls = rsb->res_ls;
5550 + struct dlm_request *req;
5551 + struct writequeue_entry *e;
5553 + if (state == GDLM_LQSTATE_WAIT_RSB)
5554 + target_nodeid = get_directory_nodeid(rsb);
5556 + target_nodeid = lkb->lkb_nodeid;
5558 + /* during recovery it's valid for target_nodeid to equal our own;
5559 + resend_cluster_requests does this to get requests back on track */
5561 + DLM_ASSERT(target_nodeid && target_nodeid != -1,
5564 + printk("target_nodeid %u\n", target_nodeid););
5566 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5567 + /* this may happen when called by resend_cluster_request */
5568 + log_error(ls, "send_cluster_request to %u state %d recovery",
5569 + target_nodeid, state);
5572 + e = lowcomms_get_buffer(target_nodeid,
5573 + sizeof(struct dlm_request) +
5574 + rsb->res_length - 1, ls->ls_allocation,
5578 + memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1);
5580 + /* Common stuff, some are just defaults */
5582 + if (lkb->lkb_bastaddr)
5583 + req->rr_asts = AST_BAST;
5584 + if (lkb->lkb_astaddr)
5585 + req->rr_asts |= AST_COMP;
5586 + if (lkb->lkb_parent)
5587 + req->rr_remparid = lkb->lkb_parent->lkb_remid;
5589 + req->rr_flags = lkb->lkb_lockqueue_flags;
5590 + req->rr_rqmode = lkb->lkb_rqmode;
5591 + req->rr_remlkid = lkb->lkb_remid;
5592 + req->rr_pid = lkb->lkb_ownpid;
5593 + req->rr_header.rh_length =
5594 + sizeof(struct dlm_request) + rsb->res_length - 1;
5595 + req->rr_header.rh_flags = 0;
5596 + req->rr_header.rh_lkid = lkb->lkb_id;
5597 + req->rr_header.rh_lockspace = ls->ls_global_id;
5601 + case GDLM_LQSTATE_WAIT_RSB:
5603 + DLM_ASSERT(!lkb->lkb_parent,
5607 + log_debug(ls, "(%d) send lu %x to %u",
5608 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5610 + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5611 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5614 + case GDLM_LQSTATE_WAIT_CONVERT:
5616 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5620 + log_debug(ls, "(%d) send cv %x to %u",
5621 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5623 + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5624 + if (lkb->lkb_range) {
5625 + req->rr_flags |= GDLM_LKFLG_RANGE;
5626 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5627 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5631 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5633 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5637 + log_debug(ls, "(%d) send rq %x to %u",
5638 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5640 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5641 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5642 + if (lkb->lkb_range) {
5643 + req->rr_flags |= GDLM_LKFLG_RANGE;
5644 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5645 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5649 + case GDLM_LQSTATE_WAIT_UNLOCK:
5651 + log_debug(ls, "(%d) send un %x to %u",
5652 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5654 + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5658 + DLM_ASSERT(0, printk("Unknown cluster request\n"););
5661 + add_request_lvb(lkb, req);
5662 + midcomms_send_buffer(&req->rr_header, e);
5668 + * We got a request from another cluster node, process it and return an info
5669 + * structure with the lock state/LVB etc as required. Executes in the DLM's
5673 +int process_cluster_request(int nodeid, struct dlm_header *req, int recovery)
5675 + struct dlm_ls *lspace;
5676 + struct dlm_lkb *lkb = NULL;
5677 + struct dlm_rsb *rsb;
5678 + int send_reply = 0, status = 0, namelen;
5679 + struct dlm_request *freq = (struct dlm_request *) req;
5680 + struct dlm_reply *rp = (struct dlm_reply *) req;
5681 + struct dlm_reply reply;
5683 + lspace = find_lockspace_by_global_id(req->rh_lockspace);
5686 + log_print("process_cluster_request invalid lockspace %x "
5687 + "from %d req %u", req->rh_lockspace, nodeid,
5692 + /* wait for recoverd to drain requestqueue */
5694 + wait_requestqueue(lspace);
5697 + * If we're in recovery then queue the request for later. Otherwise,
5698 + * we still need to get the "in_recovery" lock to make sure the
5699 + * recovery itself doesn't start until we are done.
5702 + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5704 + add_to_requestqueue(lspace, nodeid, req);
5708 + if (!down_read_trylock(&lspace->ls_in_recovery)) {
5715 + * Process the request.
5718 + switch (req->rh_cmd) {
5720 + case GDLM_REMCMD_LOOKUP:
5722 + uint32_t dir_nodeid, r_nodeid;
5725 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5727 + dir_nodeid = name_to_directory_nodeid(lspace,
5730 + if (dir_nodeid != our_nodeid())
5731 + log_debug(lspace, "ignoring directory lookup");
5733 + status = dlm_dir_lookup(lspace, nodeid, freq->rr_name,
5734 + namelen, &r_nodeid);
5735 + reply.rl_status = status;
5736 + reply.rl_lockstate = 0;
5737 + reply.rl_nodeid = r_nodeid;
5742 + case GDLM_REMCMD_REM_RESDATA:
5744 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5745 + dlm_dir_remove(lspace, nodeid, freq->rr_name, namelen);
5748 + case GDLM_REMCMD_LOCKREQUEST:
5750 + lkb = remote_stage2(nodeid, lspace, freq);
5752 + lkb->lkb_request = freq;
5753 + lkb->lkb_ownpid = freq->rr_pid;
5754 + if (lkb->lkb_retstatus != -EINVAL)
5755 + dlm_lock_stage3(lkb);
5758 + * If the request was granted in lock_stage3, then a
5759 + * reply message was already sent in combination with
5760 + * the grant message and lkb_request is NULL.
5763 + if (lkb->lkb_request) {
5764 + lkb->lkb_request = NULL;
5766 + reply.rl_status = lkb->lkb_retstatus;
5767 + reply.rl_lockstate = lkb->lkb_status;
5768 + reply.rl_lkid = lkb->lkb_id;
5771 + * If the request could not be granted and the
5772 + * user won't wait, then free up the LKB
5775 + if (lkb->lkb_retstatus == -EAGAIN) {
5776 + rsb = lkb->lkb_resource;
5777 + release_lkb(lspace, lkb);
5781 + else if (lkb->lkb_retstatus == -EINVAL) {
5782 + release_lkb(lspace, lkb);
5787 + reply.rl_status = -ENOMEM;
5792 + case GDLM_REMCMD_CONVREQUEST:
5794 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5798 + print_request(freq);
5799 + printk("nodeid %u\n", nodeid););
5801 + rsb = lkb->lkb_resource;
5805 + print_request(freq);
5806 + printk("nodeid %u\n", nodeid););
5808 + DLM_ASSERT(!rsb->res_nodeid,
5811 + print_request(freq);
5812 + printk("nodeid %u\n", nodeid););
5814 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5817 + print_request(freq);
5818 + printk("nodeid %u\n", nodeid););
5820 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED,
5823 + print_request(freq);
5824 + printk("nodeid %u\n", nodeid););
5826 + /* Update orphan lock status */
5827 + if (freq->rr_flags & DLM_LKF_ORPHAN) {
5828 + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN;
5831 + lkb->lkb_rqmode = freq->rr_rqmode;
5832 + lkb->lkb_lockqueue_flags = freq->rr_flags;
5833 + lkb->lkb_request = freq;
5834 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5836 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK ||
5837 + freq->rr_flags & DLM_LKF_VALBLK) {
5838 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5839 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5843 + if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5844 + if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5845 + freq->rr_range_end)) {
5846 + reply.rl_status = -ENOMEM;
5852 + log_debug(lspace, "(%d) cv %u from %u %x \"%s\"",
5853 + lkb->lkb_ownpid, lkb->lkb_rqmode, nodeid,
5854 + lkb->lkb_id, rsb->res_name);
5856 + dlm_convert_stage2(lkb, FALSE);
5859 + * If the conv request was granted in stage2, then a reply
5860 + * message was already sent in combination with the grant
5864 + if (lkb->lkb_request) {
5865 + lkb->lkb_request = NULL;
5867 + reply.rl_status = lkb->lkb_retstatus;
5868 + reply.rl_lockstate = lkb->lkb_status;
5869 + reply.rl_lkid = lkb->lkb_id;
5873 + case GDLM_REMCMD_LOCKREPLY:
5875 + lkb = find_lock_by_id(lspace, req->rh_lkid);
5879 + printk("nodeid %u\n", nodeid););
5881 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5884 + printk("nodeid %u\n", nodeid););
5886 + process_lockqueue_reply(lkb, rp, nodeid);
5889 + case GDLM_REMCMD_LOCKGRANT:
5892 + * Remote lock has been granted asynchronously. Do a compact
5893 + * version of what grant_lock() does.
5896 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5899 + print_request(freq);
5900 + printk("nodeid %u\n", nodeid););
5902 + rsb = lkb->lkb_resource;
5906 + print_request(freq);
5907 + printk("nodeid %u\n", nodeid););
5909 + DLM_ASSERT(rsb->res_nodeid,
5912 + print_request(freq);
5913 + printk("nodeid %u\n", nodeid););
5915 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5918 + print_request(freq);
5919 + printk("nodeid %u\n", nodeid););
5921 + if (lkb->lkb_lockqueue_state) {
5922 + log_debug(rsb->res_ls, "grant lock on lockqueue %d",
5923 + lkb->lkb_lockqueue_state);
5925 + /* Don't grant locks that are waiting for an unlock */
5926 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK)
5930 + print_request(freq);
5931 + remove_from_lockqueue(lkb);
5932 + if (!lkb->lkb_remid)
5933 + lkb->lkb_remid = req->rh_lkid;
5936 + down_write(&rsb->res_lock);
5938 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5939 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, freq->rr_lvb);
5941 + lkb->lkb_grmode = lkb->lkb_rqmode;
5942 + lkb->lkb_rqmode = DLM_LOCK_IV;
5944 + if (lkb->lkb_range) {
5945 + lkb->lkb_range[GR_RANGE_START] =
5946 + lkb->lkb_range[RQ_RANGE_START];
5947 + lkb->lkb_range[GR_RANGE_END] =
5948 + lkb->lkb_range[RQ_RANGE_END];
5951 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5952 + up_write(&rsb->res_lock);
5954 + if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5955 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5957 + lkb->lkb_retstatus = 0;
5958 + queue_ast(lkb, AST_COMP, 0);
5961 + case GDLM_REMCMD_SENDBAST:
5963 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5966 + print_request(freq);
5967 + printk("nodeid %u\n", nodeid););
5969 + if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5970 + queue_ast(lkb, AST_BAST, freq->rr_rqmode);
5973 + case GDLM_REMCMD_SENDCAST:
5975 + /* This is only used for some error completion ASTs */
5977 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5980 + print_request(freq);
5981 + printk("nodeid %u\n", nodeid););
5983 + /* Return the lock to granted status */
5984 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5985 + lkb->lkb_retstatus = freq->rr_status;
5986 + queue_ast(lkb, AST_COMP, 0);
5989 + case GDLM_REMCMD_UNLOCKREQUEST:
5991 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5994 + print_request(freq);
5995 + printk("nodeid %u\n", nodeid););
5997 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5999 + print_request(freq);
6000 + printk("nodeid %u\n", nodeid););
6002 + DLM_ASSERT(lkb->lkb_nodeid == nodeid,
6004 + print_request(freq);
6005 + printk("nodeid %u\n", nodeid););
6007 + rsb = find_rsb_to_unlock(lspace, lkb);
6009 + log_debug(lspace, "(%d) un from %u %x \"%s\"", lkb->lkb_ownpid,
6010 + nodeid, lkb->lkb_id, rsb->res_name);
6012 + reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags);
6016 + case GDLM_REMCMD_QUERY:
6017 + remote_query(nodeid, lspace, req);
6020 + case GDLM_REMCMD_QUERYREPLY:
6021 + remote_query_reply(nodeid, lspace, req);
6025 + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
6028 + up_read(&lspace->ls_in_recovery);
6032 + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
6033 + reply.rl_header.rh_flags = 0;
6034 + reply.rl_header.rh_length = sizeof(reply);
6035 + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
6036 + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
6038 + status = midcomms_send_message(nodeid, &reply.rl_header,
6043 + put_lockspace(lspace);
6047 +static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply)
6049 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
6050 + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
6053 +static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req)
6055 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
6056 + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
6058 diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
6059 --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
6060 +++ linux-patched/cluster/dlm/lockqueue.h 2004-11-03 11:31:56.000000000 +0800
6062 +/******************************************************************************
6063 +*******************************************************************************
6065 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6066 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6068 +** This copyrighted material is made available to anyone wishing to use,
6069 +** modify, copy, or redistribute it subject to the terms and conditions
6070 +** of the GNU General Public License v.2.
6072 +*******************************************************************************
6073 +******************************************************************************/
6075 +#ifndef __LOCKQUEUE_DOT_H__
6076 +#define __LOCKQUEUE_DOT_H__
6078 +void remote_grant(struct dlm_lkb * lkb);
6079 +void reply_and_grant(struct dlm_lkb * lkb);
6080 +int remote_stage(struct dlm_lkb * lkb, int state);
6081 +int process_cluster_request(int csid, struct dlm_header *req, int recovery);
6082 +int send_cluster_request(struct dlm_lkb * lkb, int state);
6083 +void purge_requestqueue(struct dlm_ls * ls);
6084 +int process_requestqueue(struct dlm_ls * ls);
6085 +int reply_in_requestqueue(struct dlm_ls * ls, int lkid);
6086 +void remote_remove_direntry(struct dlm_ls * ls, int nodeid, char *name,
6088 +void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src);
6090 +#endif /* __LOCKQUEUE_DOT_H__ */
6091 diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
6092 --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
6093 +++ linux-patched/cluster/dlm/lockspace.c 2004-11-03 11:31:56.000000000 +0800
6095 +/******************************************************************************
6096 +*******************************************************************************
6098 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6099 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6101 +** This copyrighted material is made available to anyone wishing to use,
6102 +** modify, copy, or redistribute it subject to the terms and conditions
6103 +** of the GNU General Public License v.2.
6105 +*******************************************************************************
6106 +******************************************************************************/
6108 +#include <linux/module.h>
6110 +#include "dlm_internal.h"
6111 +#include "recoverd.h"
6116 +#include "lowcomms.h"
6117 +#include "config.h"
6118 +#include "memory.h"
6119 +#include "lockspace.h"
6120 +#include "device.h"
6122 +#define GDST_NONE (0)
6123 +#define GDST_RUNNING (1)
6125 +static int dlmstate;
6126 +static int dlmcount;
6127 +static struct semaphore dlmstate_lock;
6128 +struct list_head lslist;
6129 +spinlock_t lslist_lock;
6130 +struct kcl_service_ops ls_ops;
6132 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
6135 +void dlm_lockspace_init(void)
6137 + dlmstate = GDST_NONE;
6139 + init_MUTEX(&dlmstate_lock);
6140 + INIT_LIST_HEAD(&lslist);
6141 + spin_lock_init(&lslist_lock);
6144 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen)
6146 + struct dlm_ls *ls;
6148 + spin_lock(&lslist_lock);
6150 + list_for_each_entry(ls, &lslist, ls_list) {
6151 + if (ls->ls_namelen == namelen &&
6152 + memcmp(ls->ls_name, name, namelen) == 0)
6157 + spin_unlock(&lslist_lock);
6161 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id)
6163 + struct dlm_ls *ls;
6165 + spin_lock(&lslist_lock);
6167 + list_for_each_entry(ls, &lslist, ls_list) {
6168 + if (ls->ls_global_id == id) {
6175 + spin_unlock(&lslist_lock);
6179 +struct dlm_ls *find_lockspace_by_local_id(void *id)
6181 + struct dlm_ls *ls;
6183 + spin_lock(&lslist_lock);
6185 + list_for_each_entry(ls, &lslist, ls_list) {
6186 + if (ls->ls_local_id == (uint32_t)(long)id) {
6193 + spin_unlock(&lslist_lock);
6197 +/* must be called with lslist_lock held */
6198 +void hold_lockspace(struct dlm_ls *ls)
6203 +void put_lockspace(struct dlm_ls *ls)
6205 + spin_lock(&lslist_lock);
6207 + spin_unlock(&lslist_lock);
6210 +static void remove_lockspace(struct dlm_ls *ls)
6213 + spin_lock(&lslist_lock);
6214 + if (ls->ls_count == 0) {
6215 + list_del(&ls->ls_list);
6216 + spin_unlock(&lslist_lock);
6219 + spin_unlock(&lslist_lock);
6220 + set_current_state(TASK_INTERRUPTIBLE);
6221 + schedule_timeout(HZ);
6226 + * Called from dlm_init. These are the general threads which are not
6227 + * lockspace-specific and work for all dlm lockspaces.
6230 +static int threads_start(void)
6234 + /* Thread which process lock requests for all ls's */
6235 + error = astd_start();
6237 + log_print("cannot start ast thread %d", error);
6241 + /* Thread for sending/receiving messages for all ls's */
6242 + error = lowcomms_start();
6244 + log_print("cannot start lowcomms %d", error);
6257 +static void threads_stop(void)
6263 +static int init_internal(void)
6267 + if (dlmstate == GDST_RUNNING)
6270 + error = threads_start();
6274 + dlmstate = GDST_RUNNING;
6283 + * Called after dlm module is loaded and before any lockspaces are created.
6284 + * Starts and initializes global threads and structures. These global entities
6285 + * are shared by and independent of all lockspaces.
6287 + * There should be a dlm-specific user command which a person can run which
6288 + * calls this function. If a user hasn't run that command and something
6289 + * creates a new lockspace, this is called first.
6291 + * This also starts the default lockspace.
6298 + down(&dlmstate_lock);
6299 + error = init_internal();
6300 + up(&dlmstate_lock);
6305 +int dlm_release(void)
6309 + down(&dlmstate_lock);
6311 + if (dlmstate == GDST_NONE)
6320 + spin_lock(&lslist_lock);
6321 + if (!list_empty(&lslist)) {
6322 + spin_unlock(&lslist_lock);
6323 + log_print("cannot stop threads, lockspaces still exist");
6326 + spin_unlock(&lslist_lock);
6329 + dlmstate = GDST_NONE;
6332 + up(&dlmstate_lock);
6337 +struct dlm_ls *allocate_ls(int namelen)
6339 + struct dlm_ls *ls;
6341 + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
6343 + memset(ls, 0, sizeof(struct dlm_ls) + namelen);
6348 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
6350 + struct dlm_ls *ls;
6351 + int i, size, error = -ENOMEM;
6352 + uint32_t local_id = 0;
6354 + if (!try_module_get(THIS_MODULE))
6357 + if (namelen > MAX_SERVICE_NAME_LEN)
6360 + ls = find_lockspace_by_name(name, namelen);
6362 + *lockspace = (void *)(long) ls->ls_local_id;
6367 + * Initialize ls fields
6370 + ls = allocate_ls(namelen);
6374 + memcpy(ls->ls_name, name, namelen);
6375 + ls->ls_namelen = namelen;
6377 + ls->ls_allocation = GFP_KERNEL;
6381 + size = dlm_config.rsbtbl_size;
6382 + ls->ls_rsbtbl_size = size;
6384 + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
6385 + if (!ls->ls_rsbtbl)
6387 + for (i = 0; i < size; i++) {
6388 + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
6389 + rwlock_init(&ls->ls_rsbtbl[i].lock);
6392 + size = dlm_config.lkbtbl_size;
6393 + ls->ls_lkbtbl_size = size;
6395 + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
6396 + if (!ls->ls_lkbtbl)
6398 + for (i = 0; i < size; i++) {
6399 + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
6400 + rwlock_init(&ls->ls_lkbtbl[i].lock);
6401 + ls->ls_lkbtbl[i].counter = 1;
6404 + size = dlm_config.dirtbl_size;
6405 + ls->ls_dirtbl_size = size;
6407 + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
6408 + if (!ls->ls_dirtbl)
6410 + for (i = 0; i < size; i++) {
6411 + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
6412 + rwlock_init(&ls->ls_dirtbl[i].lock);
6415 + INIT_LIST_HEAD(&ls->ls_nodes);
6416 + INIT_LIST_HEAD(&ls->ls_nodes_gone);
6417 + ls->ls_num_nodes = 0;
6418 + ls->ls_node_array = NULL;
6419 + ls->ls_recoverd_task = NULL;
6420 + init_MUTEX(&ls->ls_recoverd_lock);
6421 + INIT_LIST_HEAD(&ls->ls_recover);
6422 + spin_lock_init(&ls->ls_recover_lock);
6423 + INIT_LIST_HEAD(&ls->ls_recover_list);
6424 + ls->ls_recover_list_count = 0;
6425 + spin_lock_init(&ls->ls_recover_list_lock);
6426 + init_waitqueue_head(&ls->ls_wait_general);
6427 + INIT_LIST_HEAD(&ls->ls_rootres);
6428 + INIT_LIST_HEAD(&ls->ls_requestqueue);
6429 + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
6430 + ls->ls_last_stop = 0;
6431 + ls->ls_last_start = 0;
6432 + ls->ls_last_finish = 0;
6433 + ls->ls_rcom_msgid = 0;
6434 + init_MUTEX(&ls->ls_requestqueue_lock);
6435 + init_MUTEX(&ls->ls_rcom_lock);
6436 + init_rwsem(&ls->ls_unlock_sem);
6437 + init_rwsem(&ls->ls_root_lock);
6438 + init_rwsem(&ls->ls_in_recovery);
6440 + down_write(&ls->ls_in_recovery);
6442 + if (flags & DLM_LSF_NOTIMERS)
6443 + set_bit(LSFL_NOTIMERS, &ls->ls_flags);
6447 + * Connect this lockspace with the cluster manager
6450 + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
6451 + &ls_ops, TRUE, (void *) ls, &local_id);
6453 + goto out_recoverd;
6455 + ls->ls_state = LSST_INIT;
6456 + ls->ls_local_id = local_id;
6458 + spin_lock(&lslist_lock);
6459 + list_add(&ls->ls_list, &lslist);
6460 + spin_unlock(&lslist_lock);
6462 + error = kcl_join_service(local_id);
6464 + log_error(ls, "service manager join error %d", error);
6468 + /* The ls isn't actually running until it receives a start() from CMAN.
6469 + Neither does it have a global ls id until started. */
6471 + /* Return the local ID as the lockspace handle. I've left this
6472 + cast to a void* as it allows us to replace it with pretty much
6473 + anything at a future date without breaking clients. But returning
6474 + the address of the lockspace is a bad idea as it could get
6475 + forcibly removed, leaving client with a dangling pointer */
6477 + *lockspace = (void *)(long) local_id;
6481 + kcl_unregister_service(ls->ls_local_id);
6483 + dlm_recoverd_stop(ls);
6484 + kfree(ls->ls_dirtbl);
6486 + kfree(ls->ls_lkbtbl);
6488 + kfree(ls->ls_rsbtbl);
6496 + * Called by a system like GFS which wants independent lock spaces.
6499 +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
6501 + int error = -ENOSYS;
6503 + down(&dlmstate_lock);
6504 + error = init_internal();
6508 + error = new_lockspace(name, namelen, lockspace, flags);
6510 + up(&dlmstate_lock);
6514 +/* Return 1 if the lockspace still has active remote locks,
6515 + * 2 if the lockspace still has active local locks.
6517 +static int lockspace_busy(struct dlm_ls *ls)
6519 + int i, lkb_found = 0;
6520 + struct dlm_lkb *lkb;
6522 + /* NOTE: We check the lockidtbl here rather than the resource table.
6523 + This is because there may be LKBs queued as ASTs that have been
6524 + unlinked from their RSBs and are pending deletion once the AST has
6527 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6528 + read_lock(&ls->ls_lkbtbl[i].lock);
6529 + if (!list_empty(&ls->ls_lkbtbl[i].list)) {
6531 + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
6533 + if (!lkb->lkb_nodeid) {
6534 + read_unlock(&ls->ls_lkbtbl[i].lock);
6539 + read_unlock(&ls->ls_lkbtbl[i].lock);
6544 +static int release_lockspace(struct dlm_ls *ls, int force)
6546 + struct dlm_lkb *lkb;
6547 + struct dlm_rsb *rsb;
6548 + struct dlm_recover *rv;
6549 + struct list_head *head;
6551 + int busy = lockspace_busy(ls);
6553 + /* Don't destroy a busy lockspace */
6558 + kcl_leave_service(ls->ls_local_id);
6559 + kcl_unregister_service(ls->ls_local_id);
6562 + dlm_recoverd_stop(ls);
6564 + remove_lockspace(ls);
6567 + * Free direntry structs.
6570 + dlm_dir_clear(ls);
6571 + kfree(ls->ls_dirtbl);
6574 + * Free all lkb's on lkbtbl[] lists.
6577 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6578 + head = &ls->ls_lkbtbl[i].list;
6579 + while (!list_empty(head)) {
6580 + lkb = list_entry(head->next, struct dlm_lkb,
6582 + list_del(&lkb->lkb_idtbl_list);
6584 + if (lkb->lkb_lockqueue_state)
6585 + remove_from_lockqueue(lkb);
6587 + if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
6588 + list_del(&lkb->lkb_astqueue);
6590 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
6591 + free_lvb(lkb->lkb_lvbptr);
6597 + kfree(ls->ls_lkbtbl);
6600 + * Free all rsb's on rsbtbl[] lists
6603 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
6604 + head = &ls->ls_rsbtbl[i].list;
6605 + while (!list_empty(head)) {
6606 + rsb = list_entry(head->next, struct dlm_rsb,
6608 + list_del(&rsb->res_hashchain);
6610 + if (rsb->res_lvbptr)
6611 + free_lvb(rsb->res_lvbptr);
6617 + kfree(ls->ls_rsbtbl);
6620 + * Free structures on any other lists
6623 + head = &ls->ls_recover;
6624 + while (!list_empty(head)) {
6625 + rv = list_entry(head->next, struct dlm_recover, list);
6626 + list_del(&rv->list);
6630 + clear_free_de(ls);
6632 + ls_nodes_clear(ls);
6633 + ls_nodes_gone_clear(ls);
6634 + if (ls->ls_node_array)
6635 + kfree(ls->ls_node_array);
6639 + module_put(THIS_MODULE);
6645 + * Called when a system has released all its locks and is not going to use the
6646 + * lockspace any longer. We blindly free everything we're managing for this
6647 + * lockspace. Remaining nodes will go through the recovery process as if we'd
6648 + * died. The lockspace must continue to function as usual, participating in
6649 + * recoveries, until kcl_leave_service returns.
6651 + * Force has 4 possible values:
6652 + * 0 - don't destroy lockspace if it has any LKBs
6653 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6654 + * 2 - destroy lockspace regardless of LKBs
6655 + * 3 - destroy lockspace as part of a forced shutdown
6658 +int dlm_release_lockspace(void *lockspace, int force)
6660 + struct dlm_ls *ls;
6662 + ls = find_lockspace_by_local_id(lockspace);
6665 + put_lockspace(ls);
6666 + return release_lockspace(ls, force);
6670 +/* Called when the cluster is being shut down dirtily */
6671 +void dlm_emergency_shutdown()
6673 + struct dlm_ls *ls;
6674 + struct dlm_ls *tmp;
6676 + /* Shut lowcomms down to prevent any socket activity */
6677 + lowcomms_stop_accept();
6679 + /* Delete the devices that belong to the userland
6680 + lockspaces to be deleted. */
6681 + dlm_device_free_devices();
6683 + /* Now try to clean the lockspaces */
6684 + spin_lock(&lslist_lock);
6686 + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6687 + spin_unlock(&lslist_lock);
6688 + release_lockspace(ls, 3);
6689 + spin_lock(&lslist_lock);
6692 + spin_unlock(&lslist_lock);
6695 +struct dlm_recover *allocate_dlm_recover(void)
6697 + struct dlm_recover *rv;
6699 + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
6701 + memset(rv, 0, sizeof(struct dlm_recover));
6706 + * Called by CMAN on a specific ls. "stop" means set flag which while set
6707 + * causes all new requests to ls to be queued and not submitted until flag is
6708 + * cleared. stop on a ls also needs to cancel any prior starts on the ls.
6709 + * The recoverd thread carries out any work called for by this event.
6712 +static int dlm_ls_stop(void *servicedata)
6714 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6717 + spin_lock(&ls->ls_recover_lock);
6718 + ls->ls_last_stop = ls->ls_last_start;
6719 + set_bit(LSFL_LS_STOP, &ls->ls_flags);
6720 + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6721 + spin_unlock(&ls->ls_recover_lock);
6724 + * This in_recovery lock does two things:
6726 + * 1) Keeps this function from returning until all threads are out
6727 + * of locking routines and locking is truly stopped.
6728 + * 2) Keeps any new requests from being processed until it's unlocked
6729 + * when recovery is complete.
6733 + down_write(&ls->ls_in_recovery);
6735 + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6736 + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6737 + clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6738 + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6740 + dlm_recoverd_kick(ls);
6746 + * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6747 + * request processing which first requires that the recovery procedure be
6748 + * stepped through with all nodes sharing the lockspace (nodeids). The first
6749 + * start on the ls after it's created is a special case and requires some extra
6750 + * work like figuring out our own local nodeid. We can't do all this in the
6751 + * calling CMAN context, so we must pass this work off to the recoverd thread
6752 + * which was created in dlm_init(). The recoverd thread carries out any work
6753 + * called for by this event.
6756 +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6757 + int event_id, int type)
6759 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6760 + struct dlm_recover *rv;
6761 + int error = -ENOMEM;
6763 + rv = allocate_dlm_recover();
6767 + rv->nodeids = nodeids;
6768 + rv->node_count = count;
6769 + rv->event_id = event_id;
6771 + spin_lock(&ls->ls_recover_lock);
6772 + if (ls->ls_last_start == event_id)
6773 + log_all(ls, "repeated start %d stop %d finish %d",
6774 + event_id, ls->ls_last_stop, ls->ls_last_finish);
6775 + ls->ls_last_start = event_id;
6776 + list_add_tail(&rv->list, &ls->ls_recover);
6777 + set_bit(LSFL_LS_START, &ls->ls_flags);
6778 + spin_unlock(&ls->ls_recover_lock);
6780 + dlm_recoverd_kick(ls);
6788 + * Called by CMAN on a specific ls. "finish" means that all nodes which
6789 + * received a "start" have completed the start and called kcl_start_done.
6790 + * The recoverd thread carries out any work called for by this event.
6793 +static void dlm_ls_finish(void *servicedata, int event_id)
6795 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6797 + spin_lock(&ls->ls_recover_lock);
6798 + ls->ls_last_finish = event_id;
6799 + set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6800 + spin_unlock(&ls->ls_recover_lock);
6802 + dlm_recoverd_kick(ls);
6805 +struct kcl_service_ops ls_ops = {
6806 + .stop = dlm_ls_stop,
6807 + .start = dlm_ls_start,
6808 + .finish = dlm_ls_finish
6810 diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6811 --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
6812 +++ linux-patched/cluster/dlm/lockspace.h 2004-11-03 11:31:56.000000000 +0800
6814 +/******************************************************************************
6815 +*******************************************************************************
6817 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6818 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6820 +** This copyrighted material is made available to anyone wishing to use,
6821 +** modify, copy, or redistribute it subject to the terms and conditions
6822 +** of the GNU General Public License v.2.
6824 +*******************************************************************************
6825 +******************************************************************************/
6827 +#ifndef __LOCKSPACE_DOT_H__
6828 +#define __LOCKSPACE_DOT_H__
6830 +void dlm_lockspace_init(void);
6831 +int dlm_init(void);
6832 +int dlm_release(void);
6833 +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6834 +int dlm_release_lockspace(void *ls, int force);
6835 +void dlm_emergency_shutdown(void);
6836 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id);
6837 +struct dlm_ls *find_lockspace_by_local_id(void *id);
6838 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen);
6839 +void hold_lockspace(struct dlm_ls *ls);
6840 +void put_lockspace(struct dlm_ls *ls);
6842 +#endif /* __LOCKSPACE_DOT_H__ */
6843 diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6844 --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
6845 +++ linux-patched/cluster/dlm/lowcomms.c 2004-11-03 11:31:56.000000000 +0800
6847 +/******************************************************************************
6848 +*******************************************************************************
6850 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6851 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6853 +** This copyrighted material is made available to anyone wishing to use,
6854 +** modify, copy, or redistribute it subject to the terms and conditions
6855 +** of the GNU General Public License v.2.
6857 +*******************************************************************************
6858 +******************************************************************************/
6863 + * This is the "low-level" comms layer.
6865 + * It is responsible for sending/receiving messages
6866 + * from other nodes in the cluster.
6868 + * Cluster nodes are referred to by their nodeids. nodeids are
6869 + * simply 32 bit numbers to the locking module - if they need to
6870 + * be expanded for the cluster infrastructure then that is its
6871 + * responsibility. It is this layer's
6872 + * responsibility to resolve these into IP address or
6873 + * whatever it needs for inter-node communication.
6875 + * The comms level is two kernel threads that deal mainly with
6876 + * the receiving of messages from other nodes and passing them
6877 + * up to the mid-level comms layer (which understands the
6878 + * message format) for execution by the locking core, and
6879 + * a send thread which does all the setting up of connections
6880 + * to remote nodes and the sending of data. Threads are not allowed
6881 + * to send their own data because it may cause them to wait in times
6882 + * of high load. Also, this way, the sending thread can collect together
6883 + * messages bound for one node and send them in one block.
6885 + * I don't see any problem with the recv thread executing the locking
6886 + * code on behalf of remote processes as the locking code is
6887 + * short, efficient and never waits.
6892 +#include <asm/ioctls.h>
6893 +#include <net/sock.h>
6894 +#include <net/tcp.h>
6895 +#include <linux/pagemap.h>
6896 +#include <cluster/cnxman.h>
6898 +#include "dlm_internal.h"
6899 +#include "lowcomms.h"
6900 +#include "midcomms.h"
6901 +#include "config.h"
6909 +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6910 +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6911 +#define CBUF_EMPTY(cb) ((cb)->len == 0)
6912 +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6913 +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6914 + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6915 +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6917 +struct connection {
6918 + struct socket *sock; /* NULL if not connected */
6919 + uint32_t nodeid; /* So we know who we are in the list */
6920 + struct rw_semaphore sock_sem; /* Stop connect races */
6921 + struct list_head read_list; /* On this list when ready for reading */
6922 + struct list_head write_list; /* On this list when ready for writing */
6923 + struct list_head state_list; /* On this list when ready to connect */
6924 + unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6925 +#define CF_READ_PENDING 1
6926 +#define CF_WRITE_PENDING 2
6927 +#define CF_CONNECT_PENDING 3
6928 +#define CF_IS_OTHERCON 4
6929 + struct list_head writequeue; /* List of outgoing writequeue_entries */
6930 + struct list_head listenlist; /* List of allocated listening sockets */
6931 + spinlock_t writequeue_lock;
6932 + int (*rx_action) (struct connection *); /* What to do when active */
6933 + struct page *rx_page;
6936 + atomic_t waiting_requests;
6937 +#define MAX_CONNECT_RETRIES 3
6938 + struct connection *othercon;
6940 +#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6942 +/* An entry waiting to be sent */
6943 +struct writequeue_entry {
6944 + struct list_head list;
6945 + struct page *page;
6950 + struct connection *con;
6953 +/* "Template" structure for IPv4 and IPv6 used to fill
6954 + * in the missing bits when converting between cman (which knows
6955 + * nothing about sockaddr structs) and real life where we actually
6956 + * have to connect to these addresses. Also one of these structs
6957 + * will hold the cached "us" address.
6959 + * It's an in6 sockaddr just so there's enough space for anything
6960 + * we're likely to see here.
6962 +static struct sockaddr_in6 local_addr;
6964 +/* Manage daemons */
6965 +static struct task_struct *recv_task;
6966 +static struct task_struct *send_task;
6968 +static wait_queue_t lowcomms_send_waitq_head;
6969 +static wait_queue_head_t lowcomms_send_waitq;
6970 +static wait_queue_t lowcomms_recv_waitq_head;
6971 +static wait_queue_head_t lowcomms_recv_waitq;
6973 +/* An array of pointers to connections, indexed by NODEID */
6974 +static struct connection **connections;
6975 +static struct rw_semaphore connections_lock;
6976 +static kmem_cache_t *con_cache;
6977 +static int conn_array_size;
6978 +static atomic_t accepting;
6980 +/* List of sockets that have reads pending */
6981 +static struct list_head read_sockets;
6982 +static spinlock_t read_sockets_lock;
6984 +/* List of sockets which have writes pending */
6985 +static struct list_head write_sockets;
6986 +static spinlock_t write_sockets_lock;
6988 +/* List of sockets which have connects pending */
6989 +static struct list_head state_sockets;
6990 +static spinlock_t state_sockets_lock;
6992 +/* List of allocated listen sockets */
6993 +static struct list_head listen_sockets;
6995 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6996 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6999 +static struct connection *nodeid2con(int nodeid, int allocation)
7001 + struct connection *con = NULL;
7003 + down_read(&connections_lock);
7004 + if (nodeid >= conn_array_size) {
7005 + int new_size = nodeid + dlm_config.conn_increment;
7006 + struct connection **new_conns;
7008 + new_conns = kmalloc(sizeof(struct connection *) *
7009 + new_size, allocation);
7013 + up_read(&connections_lock);
7014 + /* The worst that can happen here (I think), is that
7015 + we get two consecutive reallocations */
7016 + down_write(&connections_lock);
7018 + memset(new_conns, 0, sizeof(struct connection *) * new_size);
7019 + memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size);
7020 + conn_array_size = new_size;
7021 + kfree(connections);
7022 + connections = new_conns;
7024 + up_write(&connections_lock);
7025 + down_read(&connections_lock);
7028 + con = connections[nodeid];
7029 + if (con == NULL && allocation) {
7030 + con = kmem_cache_alloc(con_cache, allocation);
7034 + memset(con, 0, sizeof(*con));
7035 + con->nodeid = nodeid;
7036 + init_rwsem(&con->sock_sem);
7037 + INIT_LIST_HEAD(&con->writequeue);
7038 + spin_lock_init(&con->writequeue_lock);
7040 + connections[nodeid] = con;
7044 + up_read(&connections_lock);
7048 +/* Data available on socket or listen socket received a connect */
7049 +static void lowcomms_data_ready(struct sock *sk, int count_unused)
7051 + struct connection *con = sock2con(sk);
7053 + atomic_inc(&con->waiting_requests);
7054 + if (test_and_set_bit(CF_READ_PENDING, &con->flags))
7057 + spin_lock_bh(&read_sockets_lock);
7058 + list_add_tail(&con->read_list, &read_sockets);
7059 + spin_unlock_bh(&read_sockets_lock);
7061 + wake_up_interruptible(&lowcomms_recv_waitq);
7064 +static void lowcomms_write_space(struct sock *sk)
7066 + struct connection *con = sock2con(sk);
7068 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
7071 + spin_lock_bh(&write_sockets_lock);
7072 + list_add_tail(&con->write_list, &write_sockets);
7073 + spin_unlock_bh(&write_sockets_lock);
7075 + wake_up_interruptible(&lowcomms_send_waitq);
7078 +static inline void lowcomms_connect_sock(struct connection *con)
7080 + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
7082 + if (!atomic_read(&accepting))
7085 + spin_lock_bh(&state_sockets_lock);
7086 + list_add_tail(&con->state_list, &state_sockets);
7087 + spin_unlock_bh(&state_sockets_lock);
7089 + wake_up_interruptible(&lowcomms_send_waitq);
7092 +static void lowcomms_state_change(struct sock *sk)
7094 +/* struct connection *con = sock2con(sk); */
7096 + switch (sk->sk_state) {
7097 + case TCP_ESTABLISHED:
7098 + lowcomms_write_space(sk);
7101 + case TCP_FIN_WAIT1:
7102 + case TCP_FIN_WAIT2:
7103 + case TCP_TIME_WAIT:
7105 + case TCP_CLOSE_WAIT:
7106 + case TCP_LAST_ACK:
7108 + /* FIXME: I think this causes more trouble than it solves.
7109 + lowcomms will reconnect anyway when there is something to
7110 + send. This just attempts reconnection if a node goes down!
7112 + /* lowcomms_connect_sock(con); */
7116 + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
7121 +/* Make a socket active */
7122 +static int add_sock(struct socket *sock, struct connection *con)
7126 + /* Install a data_ready callback */
7127 + con->sock->sk->sk_data_ready = lowcomms_data_ready;
7128 + con->sock->sk->sk_write_space = lowcomms_write_space;
7129 + con->sock->sk->sk_state_change = lowcomms_state_change;
7134 +/* Add the port number to an IP6 or 4 sockaddr and return the address
7136 +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
7139 + saddr->sin6_family = local_addr.sin6_family;
7140 + if (local_addr.sin6_family == AF_INET) {
7141 + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
7142 + in4_addr->sin_port = cpu_to_be16(port);
7143 + *addr_len = sizeof(struct sockaddr_in);
7146 + saddr->sin6_port = cpu_to_be16(port);
7147 + *addr_len = sizeof(struct sockaddr_in6);
7151 +/* Close a remote connection and tidy up */
7152 +static void close_connection(struct connection *con, int and_other)
7154 + down_write(&con->sock_sem);
7157 + sock_release(con->sock);
7159 + if (con->othercon && and_other) {
7160 + /* Argh! recursion in kernel code!
7161 + Actually, this isn't a list so it
7162 + will only re-enter once.
7164 + close_connection(con->othercon, TRUE);
7167 + if (con->rx_page) {
7168 + __free_page(con->rx_page);
7169 + con->rx_page = NULL;
7171 + up_write(&con->sock_sem);
7174 +/* Data received from remote end */
7175 +static int receive_from_sock(struct connection *con)
7178 + struct msghdr msg;
7179 + struct iovec iov[2];
7183 + int call_again_soon = 0;
7185 + down_read(&con->sock_sem);
7187 + if (con->sock == NULL)
7189 + if (con->rx_page == NULL) {
7191 + * This doesn't need to be atomic, but I think it should
7192 + * improve performance if it is.
7194 + con->rx_page = alloc_page(GFP_ATOMIC);
7195 + if (con->rx_page == NULL)
7197 + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
7201 + * To avoid doing too many short reads, we will reschedule for
7202 + * another time if there are less than 20 bytes left in the buffer.
7204 + if (!CBUF_MAY_ADD(&con->cb, 20))
7207 + msg.msg_control = NULL;
7208 + msg.msg_controllen = 0;
7209 + msg.msg_iovlen = 1;
7210 + msg.msg_iov = iov;
7211 + msg.msg_name = NULL;
7212 + msg.msg_namelen = 0;
7213 + msg.msg_flags = 0;
7216 + * iov[0] is the bit of the circular buffer between the current end
7217 + * point (cb.base + cb.len) and the end of the buffer.
7219 + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
7220 + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
7221 + iov[1].iov_len = 0;
7224 + * iov[1] is the bit of the circular buffer between the start of the
7225 + * buffer and the start of the currently used section (cb.base)
7227 + if (CBUF_DATA(&con->cb) >= con->cb.base) {
7228 + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
7229 + iov[1].iov_len = con->cb.base;
7230 + iov[1].iov_base = page_address(con->rx_page);
7231 + msg.msg_iovlen = 2;
7233 + len = iov[0].iov_len + iov[1].iov_len;
7237 + r = ret = sock_recvmsg(con->sock, &msg, len,
7238 + MSG_DONTWAIT | MSG_NOSIGNAL);
7244 + call_again_soon = 1;
7245 + CBUF_ADD(&con->cb, ret);
7246 + ret = midcomms_process_incoming_buffer(con->nodeid,
7247 + page_address(con->rx_page),
7248 + con->cb.base, con->cb.len,
7250 + if (ret == -EBADMSG) {
7251 + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
7252 + "iov_len=%u, iov_base[0]=%p, read=%d\n",
7253 + page_address(con->rx_page), con->cb.base, con->cb.len,
7254 + len, iov[0].iov_base, r);
7258 + CBUF_EAT(&con->cb, ret);
7260 + if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
7261 + __free_page(con->rx_page);
7262 + con->rx_page = NULL;
7266 + if (call_again_soon)
7268 + up_read(&con->sock_sem);
7273 + lowcomms_data_ready(con->sock->sk, 0);
7274 + up_read(&con->sock_sem);
7279 + up_read(&con->sock_sem);
7280 + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
7281 + close_connection(con, FALSE);
7282 + lowcomms_connect_sock(con);
7289 +/* Listening socket is busy, accept a connection */
7290 +static int accept_from_sock(struct connection *con)
7293 + struct sockaddr_in6 peeraddr;
7294 + struct socket *newsock;
7297 + struct connection *newcon;
7299 + memset(&peeraddr, 0, sizeof(peeraddr));
7300 + newsock = sock_alloc();
7304 + down_read(&con->sock_sem);
7306 + result = -ENOTCONN;
7307 + if (con->sock == NULL)
7310 + newsock->type = con->sock->type;
7311 + newsock->ops = con->sock->ops;
7313 + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
7317 + /* Get the connected socket's peer */
7318 + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
7320 + result = -ECONNABORTED;
7324 + /* Get the new node's NODEID */
7325 + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
7326 + if (nodeid == 0) {
7327 + printk("dlm: connect from non cluster node\n");
7328 + sock_release(newsock);
7329 + up_read(&con->sock_sem);
7333 + log_print("got connection from %d", nodeid);
7335 + /* Check to see if we already have a connection to this node. This
7336 + * could happen if the two nodes initiate a connection at roughly
7337 + * the same time and the connections cross on the wire.
7339 + * In this case we store the incoming one in "othercon"
7341 + newcon = nodeid2con(nodeid, GFP_KERNEL);
7346 + down_write(&newcon->sock_sem);
7347 + if (newcon->sock) {
7348 + struct connection *othercon = newcon->othercon;
7351 + othercon = kmem_cache_alloc(con_cache, GFP_KERNEL);
7353 + printk("dlm: failed to allocate incoming socket\n");
7354 + up_write(&newcon->sock_sem);
7358 + memset(othercon, 0, sizeof(*othercon));
7359 + othercon->nodeid = nodeid;
7360 + othercon->rx_action = receive_from_sock;
7361 + init_rwsem(&othercon->sock_sem);
7362 + set_bit(CF_IS_OTHERCON, &othercon->flags);
7363 + newcon->othercon = othercon;
7365 + othercon->sock = newsock;
7366 + newsock->sk->sk_user_data = othercon;
7367 + add_sock(newsock, othercon);
7370 + newsock->sk->sk_user_data = newcon;
7371 + newcon->rx_action = receive_from_sock;
7372 + add_sock(newsock, newcon);
7376 + up_write(&newcon->sock_sem);
7379 + * Add it to the active queue in case we got data
7380 + * between processing the accept and adding the socket
7381 + * to the read_sockets list
7383 + lowcomms_data_ready(newsock->sk, 0);
7384 + up_read(&con->sock_sem);
7389 + up_read(&con->sock_sem);
7390 + sock_release(newsock);
7392 + if (result != -EAGAIN)
7393 + printk("dlm: error accepting connection from node: %d\n", result);
7397 +/* Connect a new socket to its peer */
7398 +static int connect_to_sock(struct connection *con)
7400 + int result = -EHOSTUNREACH;
7401 + struct sockaddr_in6 saddr;
7403 + struct socket *sock;
7405 + if (con->nodeid == 0) {
7406 + log_print("attempt to connect sock 0 foiled");
7410 + down_write(&con->sock_sem);
7411 + if (con->retries++ > MAX_CONNECT_RETRIES)
7414 + // FIXME not sure this should happen, let alone like this.
7416 + sock_release(con->sock);
7420 + /* Create a socket to communicate with */
7421 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
7425 + memset(&saddr, 0, sizeof(saddr));
7426 + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
7429 + sock->sk->sk_user_data = con;
7430 + con->rx_action = receive_from_sock;
7432 + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
7434 + add_sock(sock, con);
7436 + log_print("connecting to %d", con->nodeid);
7438 + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
7440 + if (result == -EINPROGRESS)
7446 + up_write(&con->sock_sem);
7448 + * Returning an error here means we've given up trying to connect to
7449 + * a remote node, otherwise we return 0 and reschedule the connection
7456 + sock_release(con->sock);
7460 + * Some errors are fatal and this list might need adjusting. For other
7461 + * errors we try again until the max number of retries is reached.
7463 + if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
7464 + result != -ENETDOWN && result != -EINVAL
7465 + && result != -EPROTONOSUPPORT) {
7466 + lowcomms_connect_sock(con);
7472 +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
7474 + struct socket *sock = NULL;
7478 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
7480 + /* Create a socket to communicate with */
7481 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
7483 + printk("dlm: Can't create listening comms socket\n");
7489 + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
7492 + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
7494 + sock->sk->sk_user_data = con;
7495 + con->rx_action = accept_from_sock;
7498 + /* Bind to our port */
7499 + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
7500 + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
7502 + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
7503 + sock_release(sock);
7511 + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
7514 + printk("dlm: Set keepalive failed: %d\n", result);
7517 + result = sock->ops->listen(sock, 5);
7519 + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
7520 + sock_release(sock);
7530 +/* Listen on all interfaces */
7531 +static int listen_for_all(void)
7535 + struct socket *sock = NULL;
7536 + struct list_head *addr_list;
7537 + struct connection *con = nodeid2con(0, GFP_KERNEL);
7538 + struct connection *temp;
7539 + struct cluster_node_addr *node_addr;
7540 + char local_addr[sizeof(struct sockaddr_in6)];
7542 + /* This will also fill in local_addr */
7543 + nodeid = lowcomms_our_nodeid();
7545 + addr_list = kcl_get_node_addresses(nodeid);
7547 + printk("dlm: cannot initialise comms layer\n");
7548 + result = -ENOTCONN;
7552 + list_for_each_entry(node_addr, addr_list, list) {
7555 + con = kmem_cache_alloc(con_cache, GFP_KERNEL);
7557 + printk("dlm: failed to allocate listen socket\n");
7561 + memset(con, 0, sizeof(*con));
7562 + init_rwsem(&con->sock_sem);
7563 + spin_lock_init(&con->writequeue_lock);
7564 + INIT_LIST_HEAD(&con->writequeue);
7565 + set_bit(CF_IS_OTHERCON, &con->flags);
7568 + memcpy(local_addr, node_addr->addr, node_addr->addr_len);
7569 + sock = create_listen_sock(con, local_addr,
7570 + node_addr->addr_len);
7572 + add_sock(sock, con);
7574 + /* Keep a list of dynamically allocated listening sockets
7575 + so we can free them at shutdown */
7576 + if (test_bit(CF_IS_OTHERCON, &con->flags)) {
7577 + list_add_tail(&con->listenlist, &listen_sockets);
7581 + result = -EADDRINUSE;
7582 + kmem_cache_free(con_cache, con);
7593 + /* Free up any dynamically allocated listening sockets */
7594 + list_for_each_entry_safe(con, temp, &listen_sockets, listenlist) {
7595 + sock_release(con->sock);
7596 + kmem_cache_free(con_cache, con);
7603 +static struct writequeue_entry *new_writequeue_entry(struct connection *con,
7606 + struct writequeue_entry *entry;
7608 + entry = kmalloc(sizeof(struct writequeue_entry), allocation);
7612 + entry->page = alloc_page(allocation);
7613 + if (!entry->page) {
7618 + entry->offset = 0;
7627 +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7628 + int allocation, char **ppc)
7630 + struct connection *con = nodeid2con(nodeid, allocation);
7631 + struct writequeue_entry *e;
7638 + if (!atomic_read(&accepting))
7641 + spin_lock(&con->writequeue_lock);
7642 + e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
7643 + if (((struct list_head *) e == &con->writequeue) ||
7644 + (PAGE_CACHE_SIZE - e->end < len)) {
7649 + users = e->users++;
7651 + spin_unlock(&con->writequeue_lock);
7657 + *ppc = page_address(e->page) + offset;
7661 + e = new_writequeue_entry(con, allocation);
7663 + spin_lock(&con->writequeue_lock);
7666 + users = e->users++;
7667 + list_add_tail(&e->list, &con->writequeue);
7668 + spin_unlock(&con->writequeue_lock);
7674 +void lowcomms_commit_buffer(struct writequeue_entry *e)
7676 + struct connection *con = e->con;
7679 + if (!atomic_read(&accepting))
7682 + spin_lock(&con->writequeue_lock);
7683 + users = --e->users;
7686 + e->len = e->end - e->offset;
7688 + spin_unlock(&con->writequeue_lock);
7690 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7691 + spin_lock_bh(&write_sockets_lock);
7692 + list_add_tail(&con->write_list, &write_sockets);
7693 + spin_unlock_bh(&write_sockets_lock);
7695 + wake_up_interruptible(&lowcomms_send_waitq);
7700 + spin_unlock(&con->writequeue_lock);
7704 +static void free_entry(struct writequeue_entry *e)
7706 + __free_page(e->page);
7710 +/* Send a message */
7711 +static int send_to_sock(struct connection *con)
7714 + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7715 + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7716 + struct writequeue_entry *e;
7719 + down_read(&con->sock_sem);
7720 + if (con->sock == NULL)
7723 + sendpage = con->sock->ops->sendpage;
7725 + spin_lock(&con->writequeue_lock);
7727 + e = list_entry(con->writequeue.next, struct writequeue_entry,
7729 + if ((struct list_head *) e == &con->writequeue)
7733 + offset = e->offset;
7734 + BUG_ON(len == 0 && e->users == 0);
7735 + spin_unlock(&con->writequeue_lock);
7739 + ret = sendpage(con->sock, e->page, offset, len,
7741 + if (ret == -EAGAIN || ret == 0)
7747 + spin_lock(&con->writequeue_lock);
7751 + if (e->len == 0 && e->users == 0) {
7752 + list_del(&e->list);
7757 + spin_unlock(&con->writequeue_lock);
7759 + up_read(&con->sock_sem);
7763 + up_read(&con->sock_sem);
7764 + close_connection(con, FALSE);
7765 + lowcomms_connect_sock(con);
7769 + up_read(&con->sock_sem);
7770 + lowcomms_connect_sock(con);
7774 +static void clean_one_writequeue(struct connection *con)
7776 + struct list_head *list;
7777 + struct list_head *temp;
7779 + spin_lock(&con->writequeue_lock);
7780 + list_for_each_safe(list, temp, &con->writequeue) {
7781 + struct writequeue_entry *e =
7782 + list_entry(list, struct writequeue_entry, list);
7783 + list_del(&e->list);
7786 + spin_unlock(&con->writequeue_lock);
7789 +/* Called from recovery when it knows that a node has
7790 + left the cluster */
7791 +int lowcomms_close(int nodeid)
7793 + struct connection *con;
7798 + log_print("closing connection to node %d", nodeid);
7799 + con = nodeid2con(nodeid, 0);
7801 + close_connection(con, TRUE);
7802 + clean_one_writequeue(con);
7803 + atomic_set(&con->waiting_requests, 0);
7811 +/* API send message call, may queue the request */
7812 +/* N.B. This is the old interface - use the new one for new calls */
7813 +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7815 + struct writequeue_entry *e;
7818 + e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7820 + memcpy(b, buf, len);
7821 + lowcomms_commit_buffer(e);
7827 +/* Look for activity on active sockets */
7828 +static void process_sockets(void)
7830 + struct list_head *list;
7831 + struct list_head *temp;
7833 + spin_lock_bh(&read_sockets_lock);
7834 + list_for_each_safe(list, temp, &read_sockets) {
7835 + struct connection *con =
7836 + list_entry(list, struct connection, read_list);
7837 + list_del(&con->read_list);
7838 + clear_bit(CF_READ_PENDING, &con->flags);
7840 + spin_unlock_bh(&read_sockets_lock);
7842 + /* This can reach zero if we are processing requests
7843 + * as they come in.
7845 + if (atomic_read(&con->waiting_requests) == 0) {
7846 + spin_lock_bh(&read_sockets_lock);
7851 + con->rx_action(con);
7852 + } while (!atomic_dec_and_test(&con->waiting_requests) &&
7853 + !kthread_should_stop());
7855 + /* Don't starve out everyone else */
7857 + spin_lock_bh(&read_sockets_lock);
7859 + spin_unlock_bh(&read_sockets_lock);
7862 +/* Try to send any messages that are pending
7864 +static void process_output_queue(void)
7866 + struct list_head *list;
7867 + struct list_head *temp;
7870 + spin_lock_bh(&write_sockets_lock);
7871 + list_for_each_safe(list, temp, &write_sockets) {
7872 + struct connection *con =
7873 + list_entry(list, struct connection, write_list);
7874 + list_del(&con->write_list);
7875 + clear_bit(CF_WRITE_PENDING, &con->flags);
7877 + spin_unlock_bh(&write_sockets_lock);
7879 + ret = send_to_sock(con);
7882 + spin_lock_bh(&write_sockets_lock);
7884 + spin_unlock_bh(&write_sockets_lock);
7887 +static void process_state_queue(void)
7889 + struct list_head *list;
7890 + struct list_head *temp;
7893 + spin_lock_bh(&state_sockets_lock);
7894 + list_for_each_safe(list, temp, &state_sockets) {
7895 + struct connection *con =
7896 + list_entry(list, struct connection, state_list);
7897 + list_del(&con->state_list);
7898 + clear_bit(CF_CONNECT_PENDING, &con->flags);
7899 + spin_unlock_bh(&state_sockets_lock);
7901 + ret = connect_to_sock(con);
7904 + spin_lock_bh(&state_sockets_lock);
7906 + spin_unlock_bh(&state_sockets_lock);
7910 +/* Discard all entries on the write queues */
7911 +static void clean_writequeues(void)
7915 + for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
7916 + struct connection *con = nodeid2con(nodeid, 0);
7919 + clean_one_writequeue(con);
7923 +static int read_list_empty(void)
7927 + spin_lock_bh(&read_sockets_lock);
7928 + status = list_empty(&read_sockets);
7929 + spin_unlock_bh(&read_sockets_lock);
7934 +/* DLM Transport comms receive daemon */
7935 +static int dlm_recvd(void *data)
7937 + init_waitqueue_head(&lowcomms_recv_waitq);
7938 + init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7939 + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7941 + while (!kthread_should_stop()) {
7942 + set_current_state(TASK_INTERRUPTIBLE);
7943 + if (read_list_empty())
7945 + set_current_state(TASK_RUNNING);
7947 + process_sockets();
7953 +static int write_and_state_lists_empty(void)
7957 + spin_lock_bh(&write_sockets_lock);
7958 + status = list_empty(&write_sockets);
7959 + spin_unlock_bh(&write_sockets_lock);
7961 + spin_lock_bh(&state_sockets_lock);
7962 + if (list_empty(&state_sockets) == 0)
7964 + spin_unlock_bh(&state_sockets_lock);
7969 +/* DLM Transport send daemon */
7970 +static int dlm_sendd(void *data)
7972 + init_waitqueue_head(&lowcomms_send_waitq);
7973 + init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7974 + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7976 + while (!kthread_should_stop()) {
7977 + set_current_state(TASK_INTERRUPTIBLE);
7978 + if (write_and_state_lists_empty())
7980 + set_current_state(TASK_RUNNING);
7982 + process_state_queue();
7983 + process_output_queue();
7989 +static void daemons_stop(void)
7991 + kthread_stop(recv_task);
7992 + kthread_stop(send_task);
7995 +static int daemons_start(void)
7997 + struct task_struct *p;
8000 + p = kthread_run(dlm_recvd, NULL, 0, "dlm_recvd");
8001 + error = IS_ERR(p);
8003 + log_print("can't start dlm_recvd %d", error);
8008 + p = kthread_run(dlm_sendd, NULL, 0, "dlm_sendd");
8009 + error = IS_ERR(p);
8011 + log_print("can't start dlm_sendd %d", error);
8012 + kthread_stop(recv_task);
8021 + * Return the largest buffer size we can cope with.
8023 +int lowcomms_max_buffer_size(void)
8025 + return PAGE_CACHE_SIZE;
8028 +void lowcomms_stop(void)
8031 + struct connection *temp;
8032 + struct connection *lcon;
8034 + atomic_set(&accepting, 0);
8036 + /* Set all the activity flags to prevent any
8039 + for (i = 0; i < conn_array_size; i++) {
8040 + if (connections[i])
8041 + connections[i]->flags = 0x7;
8044 + clean_writequeues();
8046 + for (i = 0; i < conn_array_size; i++) {
8047 + if (connections[i]) {
8048 + close_connection(connections[i], TRUE);
8049 + if (connections[i]->othercon)
8050 + kmem_cache_free(con_cache, connections[i]->othercon);
8051 + kmem_cache_free(con_cache, connections[i]);
8055 + kfree(connections);
8056 + connections = NULL;
8058 + /* Free up any dynamically allocated listening sockets */
8059 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
8060 + sock_release(lcon->sock);
8061 + kmem_cache_free(con_cache, lcon);
8064 + kmem_cache_destroy(con_cache);
8065 + kcl_releaseref_cluster();
8068 +/* This is quite likely to sleep... */
8069 +int lowcomms_start(void)
8072 + struct connection *temp;
8073 + struct connection *lcon;
8075 + INIT_LIST_HEAD(&read_sockets);
8076 + INIT_LIST_HEAD(&write_sockets);
8077 + INIT_LIST_HEAD(&state_sockets);
8078 + INIT_LIST_HEAD(&listen_sockets);
8080 + spin_lock_init(&read_sockets_lock);
8081 + spin_lock_init(&write_sockets_lock);
8082 + spin_lock_init(&state_sockets_lock);
8083 + init_rwsem(&connections_lock);
8085 + error = -ENOTCONN;
8086 + if (kcl_addref_cluster())
8090 + * Temporarily initialise the waitq head so that lowcomms_send_message
8091 + * doesn't crash if it gets called before the thread is fully
8094 + init_waitqueue_head(&lowcomms_send_waitq);
8097 + connections = kmalloc(sizeof(struct connection *) *
8098 + dlm_config.conn_increment, GFP_KERNEL);
8102 + memset(connections, 0,
8103 + sizeof(struct connection *) * dlm_config.conn_increment);
8105 + conn_array_size = dlm_config.conn_increment;
8107 + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
8108 + __alignof__(struct connection), 0, NULL, NULL);
8110 + goto fail_free_conn;
8113 + /* Start listening */
8114 + error = listen_for_all();
8116 + goto fail_unlisten;
8118 + error = daemons_start();
8120 + goto fail_unlisten;
8122 + atomic_set(&accepting, 1);
8127 + close_connection(connections[0], 0);
8128 + kmem_cache_free(con_cache, connections[0]);
8129 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
8130 + sock_release(lcon->sock);
8131 + kmem_cache_free(con_cache, lcon);
8134 + kmem_cache_destroy(con_cache);
8137 + kcl_releaseref_cluster();
8138 + kfree(connections);
8144 +/* Don't accept any more outgoing work */
8145 +void lowcomms_stop_accept()
8147 + atomic_set(&accepting, 0);
8150 +/* Cluster Manager interface functions for looking up
8151 + nodeids and IP addresses by each other
8154 +/* Return the IP address of a node given its NODEID */
8155 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
8157 + struct list_head *addrs;
8158 + struct cluster_node_addr *node_addr;
8159 + struct cluster_node_addr *current_addr = NULL;
8160 + struct sockaddr_in6 *saddr;
8164 + addrs = kcl_get_node_addresses(nodeid);
8168 + interface = kcl_get_current_interface();
8170 + /* Look for address number <interface> */
8171 + i=0; /* i/f numbers start at 1 */
8172 + list_for_each_entry(node_addr, addrs, list) {
8173 + if (interface == ++i) {
8174 + current_addr = node_addr;
8179 + /* If that failed then just use the first one */
8180 + if (!current_addr)
8181 + current_addr = (struct cluster_node_addr *)addrs->next;
8183 + saddr = (struct sockaddr_in6 *)current_addr->addr;
8185 + /* Extract the IP address */
8186 + if (local_addr.sin6_family == AF_INET) {
8187 + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
8188 + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
8189 + ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
8192 + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
8193 + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
8199 +/* Return the NODEID for a node given its sockaddr */
8200 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
8202 + struct kcl_cluster_node node;
8203 + struct sockaddr_in6 ipv6_addr;
8204 + struct sockaddr_in ipv4_addr;
8206 + if (local_addr.sin6_family == AF_INET) {
8207 + struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
8208 + memcpy(&ipv4_addr, &local_addr, addr_len);
8209 + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
8211 + addr = (struct sockaddr *)&ipv4_addr;
8214 + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
8215 + memcpy(&ipv6_addr, &local_addr, addr_len);
8216 + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
8218 + addr = (struct sockaddr *)&ipv6_addr;
8221 + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
8222 + return node.node_id;
8227 +int lowcomms_our_nodeid(void)
8229 + struct kcl_cluster_node node;
8230 + struct list_head *addrs;
8231 + struct cluster_node_addr *first_addr;
8232 + static int our_nodeid = 0;
8235 + return our_nodeid;
8237 + if (kcl_get_node_by_nodeid(0, &node) == -1)
8240 + our_nodeid = node.node_id;
8242 + /* Fill in the "template" structure */
8243 + addrs = kcl_get_node_addresses(our_nodeid);
8247 + first_addr = (struct cluster_node_addr *) addrs->next;
8248 + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
8250 + return node.node_id;
8253 + * Overrides for Emacs so that we follow Linus's tabbing style.
8254 + * Emacs will notice this stuff at the end of the file and automatically
8255 + * adjust the settings for this buffer only. This must remain at the end
8257 + * ---------------------------------------------------------------------------
8258 + * Local variables:
8259 + * c-file-style: "linux"
8262 diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
8263 --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
8264 +++ linux-patched/cluster/dlm/lowcomms.h 2004-11-03 11:31:56.000000000 +0800
8266 +/******************************************************************************
8267 +*******************************************************************************
8269 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8270 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8272 +** This copyrighted material is made available to anyone wishing to use,
8273 +** modify, copy, or redistribute it subject to the terms and conditions
8274 +** of the GNU General Public License v.2.
8276 +*******************************************************************************
8277 +******************************************************************************/
8279 +#ifndef __LOWCOMMS_DOT_H__
8280 +#define __LOWCOMMS_DOT_H__
8282 +/* The old interface */
8283 +int lowcomms_send_message(int csid, char *buf, int len, int allocation);
8285 +/* The new interface */
8286 +struct writequeue_entry;
8287 +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
8288 + int allocation, char **ppc);
8289 +extern void lowcomms_commit_buffer(struct writequeue_entry *e);
8291 +int lowcomms_start(void);
8292 +void lowcomms_stop(void);
8293 +void lowcomms_stop_accept(void);
8294 +int lowcomms_close(int nodeid);
8295 +int lowcomms_max_buffer_size(void);
8297 +int lowcomms_our_nodeid(void);
8299 +#endif /* __LOWCOMMS_DOT_H__ */
8300 diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
8301 --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
8302 +++ linux-patched/cluster/dlm/main.c 2004-11-03 11:31:56.000000000 +0800
8304 +/******************************************************************************
8305 +*******************************************************************************
8307 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8308 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8310 +** This copyrighted material is made available to anyone wishing to use,
8311 +** modify, copy, or redistribute it subject to the terms and conditions
8312 +** of the GNU General Public License v.2.
8314 +*******************************************************************************
8315 +******************************************************************************/
8317 +#define EXPORT_SYMTAB
8319 +#include <linux/init.h>
8320 +#include <linux/proc_fs.h>
8321 +#include <linux/ctype.h>
8322 +#include <linux/module.h>
8323 +#include <net/sock.h>
8325 +#include <cluster/cnxman.h>
8327 +#include "dlm_internal.h"
8328 +#include "lockspace.h"
8332 +#include "locking.h"
8333 +#include "config.h"
8334 +#include "memory.h"
8335 +#include "recover.h"
8336 +#include "lowcomms.h"
8338 +int dlm_device_init(void);
8339 +void dlm_device_exit(void);
8340 +void dlm_proc_init(void);
8341 +void dlm_proc_exit(void);
8344 +/* Cluster manager callbacks, we want to know if a node dies
8345 + N.B. this is independent of lockspace-specific event callbacks from SM */
8347 +static void cman_callback(kcl_callback_reason reason, long arg)
8349 + /* This is unconditional, so do what we can to tidy up */
8350 + if (reason == LEAVING) {
8351 + dlm_emergency_shutdown();
8355 +int __init init_dlm(void)
8358 + dlm_lockspace_init();
8360 + dlm_device_init();
8361 + dlm_memory_init();
8362 + dlm_config_init();
8364 + kcl_add_callback(cman_callback);
8366 + printk("DLM %s (built %s %s) installed\n",
8367 + DLM_RELEASE_NAME, __DATE__, __TIME__);
8372 +void __exit exit_dlm(void)
8374 + kcl_remove_callback(cman_callback);
8376 + dlm_device_exit();
8377 + dlm_memory_exit();
8378 + dlm_config_exit();
8382 +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
8383 +MODULE_AUTHOR("Red Hat, Inc.");
8384 +MODULE_LICENSE("GPL");
8386 +module_init(init_dlm);
8387 +module_exit(exit_dlm);
8389 +EXPORT_SYMBOL(dlm_init);
8390 +EXPORT_SYMBOL(dlm_release);
8391 +EXPORT_SYMBOL(dlm_new_lockspace);
8392 +EXPORT_SYMBOL(dlm_release_lockspace);
8393 +EXPORT_SYMBOL(dlm_lock);
8394 +EXPORT_SYMBOL(dlm_unlock);
8395 +EXPORT_SYMBOL(dlm_debug_dump);
8396 +EXPORT_SYMBOL(dlm_locks_dump);
8397 diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
8398 --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
8399 +++ linux-patched/cluster/dlm/memory.c 2004-11-03 11:31:56.000000000 +0800
8401 +/******************************************************************************
8402 +*******************************************************************************
8404 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8405 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8407 +** This copyrighted material is made available to anyone wishing to use,
8408 +** modify, copy, or redistribute it subject to the terms and conditions
8409 +** of the GNU General Public License v.2.
8411 +*******************************************************************************
8412 +******************************************************************************/
8416 + * memory allocation routines
8420 +#include "dlm_internal.h"
8421 +#include "memory.h"
8422 +#include "config.h"
8424 +/* as the man says...Shouldn't this be in a header file somewhere? */
8425 +#define BYTES_PER_WORD sizeof(void *)
8427 +static kmem_cache_t *rsb_cache_small;
8428 +static kmem_cache_t *rsb_cache_large;
8429 +static kmem_cache_t *lkb_cache;
8430 +static kmem_cache_t *lvb_cache;
8431 +static kmem_cache_t *resdir_cache_large;
8432 +static kmem_cache_t *resdir_cache_small;
8434 +/* The thresholds above which we allocate large RSBs/direntry rather than small
8435 + * ones. This must make the resultant structure end on a word boundary */
8436 +#define LARGE_RSB_NAME 28
8437 +#define LARGE_RES_NAME 28
8439 +int dlm_memory_init()
8441 + int ret = -ENOMEM;
8445 + kmem_cache_create("dlm_rsb(small)",
8446 + (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
8447 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
8448 + if (!rsb_cache_small)
8452 + kmem_cache_create("dlm_rsb(large)",
8453 + sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN,
8454 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
8455 + if (!rsb_cache_large)
8456 + goto out_free_rsbs;
8458 + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
8459 + __alignof__(struct dlm_lkb), 0, NULL, NULL);
8461 + goto out_free_rsbl;
8463 + resdir_cache_large =
8464 + kmem_cache_create("dlm_resdir(l)",
8465 + sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN,
8466 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
8467 + if (!resdir_cache_large)
8468 + goto out_free_lkb;
8470 + resdir_cache_small =
8471 + kmem_cache_create("dlm_resdir(s)",
8472 + (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
8473 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
8474 + if (!resdir_cache_small)
8475 + goto out_free_resl;
8477 + /* LVB cache also holds ranges, so should be 64bit aligned */
8478 + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
8479 + __alignof__(uint64_t), 0, NULL, NULL);
8481 + goto out_free_ress;
8487 + kmem_cache_destroy(resdir_cache_small);
8490 + kmem_cache_destroy(resdir_cache_large);
8493 + kmem_cache_destroy(lkb_cache);
8496 + kmem_cache_destroy(rsb_cache_large);
8499 + kmem_cache_destroy(rsb_cache_small);
8505 +void dlm_memory_exit()
8507 + kmem_cache_destroy(rsb_cache_large);
8508 + kmem_cache_destroy(rsb_cache_small);
8509 + kmem_cache_destroy(lkb_cache);
8510 + kmem_cache_destroy(resdir_cache_small);
8511 + kmem_cache_destroy(resdir_cache_large);
8512 + kmem_cache_destroy(lvb_cache);
8515 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
8517 + struct dlm_rsb *r;
8519 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
8521 + if (namelen >= LARGE_RSB_NAME)
8522 + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
8524 + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
8527 + memset(r, 0, sizeof(struct dlm_rsb) + namelen);
8532 +void free_rsb(struct dlm_rsb *r)
8534 + int length = r->res_length;
8537 + memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length);
8540 + if (length >= LARGE_RSB_NAME)
8541 + kmem_cache_free(rsb_cache_large, r);
8543 + kmem_cache_free(rsb_cache_small, r);
8546 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
8548 + struct dlm_lkb *l;
8550 + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
8552 + memset(l, 0, sizeof(struct dlm_lkb));
8557 +void free_lkb(struct dlm_lkb *l)
8560 + memset(l, 0xAA, sizeof(struct dlm_lkb));
8562 + kmem_cache_free(lkb_cache, l);
8565 +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
8567 + struct dlm_direntry *rd;
8569 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
8571 + if (namelen >= LARGE_RES_NAME)
8572 + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
8574 + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
8577 + memset(rd, 0, sizeof(struct dlm_direntry));
8582 +void free_direntry(struct dlm_direntry *de)
8584 + if (de->length >= LARGE_RES_NAME)
8585 + kmem_cache_free(resdir_cache_large, de);
8587 + kmem_cache_free(resdir_cache_small, de);
8590 +char *allocate_lvb(struct dlm_ls *ls)
8594 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8596 + memset(l, 0, DLM_LVB_LEN);
8601 +void free_lvb(char *l)
8603 + kmem_cache_free(lvb_cache, l);
8606 +/* Ranges are allocated from the LVB cache as they are the same size (4x64
8608 +uint64_t *allocate_range(struct dlm_ls * ls)
8612 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8614 + memset(l, 0, DLM_LVB_LEN);
8619 +void free_range(uint64_t *l)
8621 + kmem_cache_free(lvb_cache, l);
8624 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls)
8626 + struct dlm_rcom *rc;
8628 + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
8630 + memset(rc, 0, dlm_config.buffer_size);
8635 +void free_rcom_buffer(struct dlm_rcom *rc)
8639 diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
8640 --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
8641 +++ linux-patched/cluster/dlm/memory.h 2004-11-03 11:31:56.000000000 +0800
8643 +/******************************************************************************
8644 +*******************************************************************************
8646 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8647 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8649 +** This copyrighted material is made available to anyone wishing to use,
8650 +** modify, copy, or redistribute it subject to the terms and conditions
8651 +** of the GNU General Public License v.2.
8653 +*******************************************************************************
8654 +******************************************************************************/
8656 +#ifndef __MEMORY_DOT_H__
8657 +#define __MEMORY_DOT_H__
8659 +int dlm_memory_init(void);
8660 +void dlm_memory_exit(void);
8661 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
8662 +void free_rsb(struct dlm_rsb *r);
8663 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
8664 +void free_lkb(struct dlm_lkb *l);
8665 +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
8666 +void free_direntry(struct dlm_direntry *de);
8667 +char *allocate_lvb(struct dlm_ls *ls);
8668 +void free_lvb(char *l);
8669 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls);
8670 +void free_rcom_buffer(struct dlm_rcom *rc);
8671 +uint64_t *allocate_range(struct dlm_ls *ls);
8672 +void free_range(uint64_t *l);
8674 +#endif /* __MEMORY_DOT_H__ */
8675 diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8676 --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
8677 +++ linux-patched/cluster/dlm/midcomms.c 2004-11-03 11:31:56.000000000 +0800
8679 +/******************************************************************************
8680 +*******************************************************************************
8682 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8683 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8685 +** This copyrighted material is made available to anyone wishing to use,
8686 +** modify, copy, or redistribute it subject to the terms and conditions
8687 +** of the GNU General Public License v.2.
8689 +*******************************************************************************
8690 +******************************************************************************/
8695 + * This is the appallingly named "mid-level" comms layer.
8697 + * Its purpose is to take packets from the "real" comms layer,
8698 + * split them up into packets and pass them to the interested
8699 + * part of the locking mechanism.
8701 + * It also takes messages from the locking layer, formats them
8702 + * into packets and sends them to the comms layer.
8704 + * It knows the format of the mid-level messages used and nodeids
8705 + * but it does not know how to resolve a nodeid into an IP address
8706 + * or any of the comms channel details
8710 +#include "dlm_internal.h"
8711 +#include "lowcomms.h"
8712 +#include "midcomms.h"
8713 +#include "lockqueue.h"
8715 +#include "reccomms.h"
8716 +#include "config.h"
8718 +/* Byteorder routines */
8720 +static void host_to_network(void *msg)
8722 + struct dlm_header *head = msg;
8723 + struct dlm_request *req = msg;
8724 + struct dlm_reply *rep = msg;
8725 + struct dlm_query_request *qreq = msg;
8726 + struct dlm_query_reply *qrep= msg;
8727 + struct dlm_rcom *rc = msg;
8729 + /* Force into network byte order */
8732 + * Do the common header first
8735 + head->rh_length = cpu_to_le16(head->rh_length);
8736 + head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8737 + /* Leave the lkid alone as it is transparent at the remote end */
8740 + * Do the fields in the remlockrequest or remlockreply structs
8743 + switch (req->rr_header.rh_cmd) {
8745 + case GDLM_REMCMD_LOCKREQUEST:
8746 + case GDLM_REMCMD_CONVREQUEST:
8747 + req->rr_range_start = cpu_to_le64(req->rr_range_start);
8748 + req->rr_range_end = cpu_to_le64(req->rr_range_end);
8749 + /* Deliberate fall through */
8750 + case GDLM_REMCMD_UNLOCKREQUEST:
8751 + case GDLM_REMCMD_LOOKUP:
8752 + case GDLM_REMCMD_LOCKGRANT:
8753 + case GDLM_REMCMD_SENDBAST:
8754 + case GDLM_REMCMD_SENDCAST:
8755 + case GDLM_REMCMD_REM_RESDATA:
8756 + req->rr_flags = cpu_to_le32(req->rr_flags);
8757 + req->rr_status = cpu_to_le32(req->rr_status);
8760 + case GDLM_REMCMD_LOCKREPLY:
8761 + rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate);
8762 + rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid);
8763 + rep->rl_status = cpu_to_le32(rep->rl_status);
8766 + case GDLM_REMCMD_RECOVERMESSAGE:
8767 + case GDLM_REMCMD_RECOVERREPLY:
8768 + rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8769 + rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8772 + case GDLM_REMCMD_QUERY:
8773 + qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid);
8774 + qreq->rq_query = cpu_to_le32(qreq->rq_query);
8775 + qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks);
8778 + case GDLM_REMCMD_QUERYREPLY:
8779 + qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks);
8780 + qrep->rq_status = cpu_to_le32(qrep->rq_status);
8781 + qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount);
8782 + qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount);
8783 + qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount);
8787 + printk("dlm: warning, unknown REMCMD type %u\n",
8788 + req->rr_header.rh_cmd);
8792 +static void network_to_host(void *msg)
8794 + struct dlm_header *head = msg;
8795 + struct dlm_request *req = msg;
8796 + struct dlm_reply *rep = msg;
8797 + struct dlm_query_request *qreq = msg;
8798 + struct dlm_query_reply *qrep = msg;
8799 + struct dlm_rcom *rc = msg;
8801 + /* Force into host byte order */
8804 + * Do the common header first
8807 + head->rh_length = le16_to_cpu(head->rh_length);
8808 + head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8809 + /* Leave the lkid alone as it is transparent at the remote end */
8812 + * Do the fields in the remlockrequest or remlockreply structs
8815 + switch (req->rr_header.rh_cmd) {
8817 + case GDLM_REMCMD_LOCKREQUEST:
8818 + case GDLM_REMCMD_CONVREQUEST:
8819 + req->rr_range_start = le64_to_cpu(req->rr_range_start);
8820 + req->rr_range_end = le64_to_cpu(req->rr_range_end);
8821 + case GDLM_REMCMD_LOOKUP:
8822 + case GDLM_REMCMD_UNLOCKREQUEST:
8823 + case GDLM_REMCMD_LOCKGRANT:
8824 + case GDLM_REMCMD_SENDBAST:
8825 + case GDLM_REMCMD_SENDCAST:
8826 + case GDLM_REMCMD_REM_RESDATA:
8827 + /* Actually, not much to do here as the remote lock IDs are
8828 + * transparent too */
8829 + req->rr_flags = le32_to_cpu(req->rr_flags);
8830 + req->rr_status = le32_to_cpu(req->rr_status);
8833 + case GDLM_REMCMD_LOCKREPLY:
8834 + rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate);
8835 + rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid);
8836 + rep->rl_status = le32_to_cpu(rep->rl_status);
8839 + case GDLM_REMCMD_RECOVERMESSAGE:
8840 + case GDLM_REMCMD_RECOVERREPLY:
8841 + rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8842 + rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8846 + case GDLM_REMCMD_QUERY:
8847 + qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid);
8848 + qreq->rq_query = le32_to_cpu(qreq->rq_query);
8849 + qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks);
8852 + case GDLM_REMCMD_QUERYREPLY:
8853 + qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks);
8854 + qrep->rq_status = le32_to_cpu(qrep->rq_status);
8855 + qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount);
8856 + qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount);
8857 + qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount);
8861 + printk("dlm: warning, unknown REMCMD type %u\n",
8862 + req->rr_header.rh_cmd);
8866 +static void copy_from_cb(void *dst, const void *base, unsigned offset,
8867 + unsigned len, unsigned limit)
8869 + unsigned copy = len;
8871 + if ((copy + offset) > limit)
8872 + copy = limit - offset;
8873 + memcpy(dst, base + offset, copy);
8876 + memcpy(dst + copy, base, len);
8879 +static void khexdump(const unsigned char *c, int len)
8881 + while (len > 16) {
8883 + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8884 + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8885 + c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8890 + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8896 + printk(KERN_INFO "%02x\n", c[0]);
8903 + * Called from the low-level comms layer to process a buffer of
8906 + * Only complete messages are processed here, any "spare" bytes from
8907 + * the end of a buffer are saved and tacked onto the front of the next
8908 + * message that comes in. I doubt this will happen very often but we
8909 + * need to be able to cope with it and I don't want the task to be waiting
8910 + * for packets to come in when there is useful work to be done.
8913 +int midcomms_process_incoming_buffer(int nodeid, const void *base,
8914 + unsigned offset, unsigned len,
8917 + unsigned char __tmp[sizeof(struct dlm_header) + 64];
8918 + struct dlm_header *msg = (struct dlm_header *) __tmp;
8924 + while (len > sizeof(struct dlm_header)) {
8925 + /* Get message header and check it over */
8926 + copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
8928 + msglen = le16_to_cpu(msg->rh_length);
8929 + id = msg->rh_lkid;
8930 + space = msg->rh_lockspace;
8932 + /* Check message size */
8934 + if (msglen < sizeof(struct dlm_header))
8937 + if (msglen > dlm_config.buffer_size) {
8938 + printk("dlm: message size from %d too big %d(pkt len=%d)\n", nodeid, msglen, len);
8939 + khexdump((const unsigned char *) msg, len);
8944 + /* Not enough in buffer yet? wait for some more */
8948 + /* Make sure our temp buffer is large enough */
8949 + if (msglen > sizeof(__tmp) &&
8950 + msg == (struct dlm_header *) __tmp) {
8951 + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8956 + copy_from_cb(msg, base, offset, msglen, limit);
8957 + BUG_ON(id != msg->rh_lkid);
8958 + BUG_ON(space != msg->rh_lockspace);
8961 + offset &= (limit - 1);
8963 + network_to_host(msg);
8965 + if ((msg->rh_cmd > 32) ||
8966 + (msg->rh_cmd == 0) ||
8967 + (msg->rh_length < sizeof(struct dlm_header)) ||
8968 + (msg->rh_length > dlm_config.buffer_size)) {
8970 + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8971 + "lkid=%u, lockspace=%u\n",
8972 + msg->rh_cmd, msg->rh_flags, msg->rh_length,
8973 + msg->rh_lkid, msg->rh_lockspace);
8975 + printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8976 + "ret=%u, limit=%08x newbuf=%d\n",
8977 + base, offset, len, ret, limit,
8978 + ((struct dlm_header *) __tmp == msg));
8980 + khexdump((const unsigned char *) msg, msg->rh_length);
8985 + switch (msg->rh_cmd) {
8986 + case GDLM_REMCMD_RECOVERMESSAGE:
8987 + case GDLM_REMCMD_RECOVERREPLY:
8988 + process_recovery_comm(nodeid, msg);
8991 + process_cluster_request(nodeid, msg, FALSE);
8995 + if (msg != (struct dlm_header *) __tmp)
8998 + return err ? err : ret;
9002 + * Send a lowcomms buffer
9005 +void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e)
9007 + host_to_network(msg);
9008 + lowcomms_commit_buffer(e);
9012 + * Make the message into network byte order and send it
9015 +int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg,
9018 + int len = msg->rh_length;
9020 + host_to_network(msg);
9023 + * Loopback. In fact, the locking code pretty much prevents this from
9024 + * being needed but it can happen when the directory node is also the
9028 + if (nodeid == our_nodeid())
9029 + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
9032 + return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
9034 diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
9035 --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
9036 +++ linux-patched/cluster/dlm/midcomms.h 2004-11-03 11:31:56.000000000 +0800
9038 +/******************************************************************************
9039 +*******************************************************************************
9041 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9042 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9044 +** This copyrighted material is made available to anyone wishing to use,
9045 +** modify, copy, or redistribute it subject to the terms and conditions
9046 +** of the GNU General Public License v.2.
9048 +*******************************************************************************
9049 +******************************************************************************/
9051 +#ifndef __MIDCOMMS_DOT_H__
9052 +#define __MIDCOMMS_DOT_H__
9054 +int midcomms_send_message(uint32_t csid, struct dlm_header *msg,
9056 +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
9057 + unsigned len, unsigned limit);
9058 +void midcomms_send_buffer(struct dlm_header *msg,
9059 + struct writequeue_entry *e);
9061 +#endif /* __MIDCOMMS_DOT_H__ */
9062 diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
9063 --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
9064 +++ linux-patched/cluster/dlm/nodes.c 2004-11-03 11:31:56.000000000 +0800
9066 +/******************************************************************************
9067 +*******************************************************************************
9069 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9070 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9072 +** This copyrighted material is made available to anyone wishing to use,
9073 +** modify, copy, or redistribute it subject to the terms and conditions
9074 +** of the GNU General Public License v.2.
9076 +*******************************************************************************
9077 +******************************************************************************/
9079 +#include <net/sock.h>
9080 +#include <cluster/cnxman.h>
9082 +#include "dlm_internal.h"
9083 +#include "lowcomms.h"
9085 +#include "recover.h"
9086 +#include "reccomms.h"
9089 +static struct list_head cluster_nodes;
9090 +static spinlock_t node_lock;
9093 +void dlm_nodes_init(void)
9095 + INIT_LIST_HEAD(&cluster_nodes);
9096 + spin_lock_init(&node_lock);
9099 +static struct dlm_node *search_node(uint32_t nodeid)
9101 + struct dlm_node *node;
9103 + list_for_each_entry(node, &cluster_nodes, list) {
9104 + if (node->nodeid == nodeid)
9112 +static void put_node(struct dlm_node *node)
9114 + spin_lock(&node_lock);
9115 + if (atomic_dec_and_test(&node->refcount)) {
9116 + lowcomms_close(node->nodeid);
9117 + list_del(&node->list);
9118 + spin_unlock(&node_lock);
9122 + spin_unlock(&node_lock);
9125 +static int get_node(uint32_t nodeid, struct dlm_node **ndp)
9127 + struct dlm_node *node, *node2;
9128 + int error = -ENOMEM;
9130 + spin_lock(&node_lock);
9131 + node = search_node(nodeid);
9133 + atomic_inc(&node->refcount);
9134 + spin_unlock(&node_lock);
9139 + node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL);
9143 + memset(node, 0, sizeof(struct dlm_node));
9144 + node->nodeid = nodeid;
9146 + spin_lock(&node_lock);
9147 + node2 = search_node(nodeid);
9149 + atomic_inc(&node2->refcount);
9150 + spin_unlock(&node_lock);
9156 + atomic_set(&node->refcount, 1);
9157 + list_add_tail(&node->list, &cluster_nodes);
9158 + spin_unlock(&node_lock);
9167 +int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb)
9169 + struct dlm_csb *csb;
9170 + struct dlm_node *node;
9171 + int error = -ENOMEM;
9173 + csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL);
9177 + memset(csb, 0, sizeof(struct dlm_csb));
9179 + error = get_node(nodeid, &node);
9193 +void release_csb(struct dlm_csb *csb)
9195 + put_node(csb->node);
9199 +uint32_t our_nodeid(void)
9201 + return lowcomms_our_nodeid();
9204 +static void make_node_array(struct dlm_ls *ls)
9206 + struct dlm_csb *csb;
9210 + if (ls->ls_node_array) {
9211 + kfree(ls->ls_node_array);
9212 + ls->ls_node_array = NULL;
9215 + array = kmalloc(sizeof(uint32_t) * ls->ls_num_nodes, GFP_KERNEL);
9219 + list_for_each_entry(csb, &ls->ls_nodes, list)
9220 + array[i++] = csb->node->nodeid;
9222 + ls->ls_node_array = array;
9225 +int nodes_reconfig_wait(struct dlm_ls *ls)
9229 + if (ls->ls_low_nodeid == our_nodeid()) {
9230 + error = dlm_wait_status_all(ls, NODES_VALID);
9232 + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
9234 + /* Experimental: this delay should allow any final messages
9235 + * from the previous node to be received before beginning
9238 + if (ls->ls_num_nodes == 1) {
9239 + current->state = TASK_UNINTERRUPTIBLE;
9240 + schedule_timeout((2) * HZ);
9244 + error = dlm_wait_status_low(ls, NODES_ALL_VALID);
9249 +static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new)
9251 + struct dlm_csb *csb = NULL;
9252 + struct list_head *tmp;
9253 + struct list_head *newlist = &new->list;
9254 + struct list_head *head = &ls->ls_nodes;
9256 + list_for_each(tmp, head) {
9257 + csb = list_entry(tmp, struct dlm_csb, list);
9259 + if (new->node->nodeid < csb->node->nodeid)
9264 + list_add_tail(newlist, head);
9266 + /* FIXME: can use list macro here */
9267 + newlist->prev = tmp->prev;
9268 + newlist->next = tmp;
9269 + tmp->prev->next = newlist;
9270 + tmp->prev = newlist;
9274 +int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
9276 + struct dlm_csb *csb, *safe;
9277 + int error, i, found, pos = 0, neg = 0;
9278 + uint32_t low = (uint32_t) (-1);
9281 + * Remove (and save) departed nodes from lockspace's nodes list
9284 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) {
9286 + for (i = 0; i < rv->node_count; i++) {
9287 + if (csb->node->nodeid == rv->nodeids[i]) {
9295 + csb->gone_event = rv->event_id;
9296 + list_del(&csb->list);
9297 + list_add_tail(&csb->list, &ls->ls_nodes_gone);
9298 + ls->ls_num_nodes--;
9299 + log_all(ls, "remove node %u", csb->node->nodeid);
9304 + * Add new nodes to lockspace's nodes list
9307 + for (i = 0; i < rv->node_count; i++) {
9309 + list_for_each_entry(csb, &ls->ls_nodes, list) {
9310 + if (csb->node->nodeid == rv->nodeids[i]) {
9319 + error = init_new_csb(rv->nodeids[i], &csb);
9320 + DLM_ASSERT(!error,);
9322 + add_ordered_node(ls, csb);
9323 + ls->ls_num_nodes++;
9324 + log_all(ls, "add node %u", csb->node->nodeid);
9328 + list_for_each_entry(csb, &ls->ls_nodes, list) {
9329 + if (csb->node->nodeid < low)
9330 + low = csb->node->nodeid;
9333 + ls->ls_low_nodeid = low;
9334 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
9336 + make_node_array(ls);
9338 + error = nodes_reconfig_wait(ls);
9340 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
9345 +static void nodes_clear(struct list_head *head)
9347 + struct dlm_csb *csb;
9349 + while (!list_empty(head)) {
9350 + csb = list_entry(head->next, struct dlm_csb, list);
9351 + list_del(&csb->list);
9356 +void ls_nodes_clear(struct dlm_ls *ls)
9358 + nodes_clear(&ls->ls_nodes);
9359 + ls->ls_num_nodes = 0;
9362 +void ls_nodes_gone_clear(struct dlm_ls *ls)
9364 + nodes_clear(&ls->ls_nodes_gone);
9367 +int ls_nodes_init(struct dlm_ls *ls, struct dlm_recover *rv)
9369 + struct dlm_csb *csb;
9371 + uint32_t low = (uint32_t) (-1);
9373 + /* nodes may be left from a previous failed start */
9374 + ls_nodes_clear(ls);
9376 + log_all(ls, "add nodes");
9378 + for (i = 0; i < rv->node_count; i++) {
9379 + error = init_new_csb(rv->nodeids[i], &csb);
9383 + add_ordered_node(ls, csb);
9384 + ls->ls_num_nodes++;
9386 + if (csb->node->nodeid < low)
9387 + low = csb->node->nodeid;
9390 + ls->ls_low_nodeid = low;
9391 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
9392 + make_node_array(ls);
9394 + error = nodes_reconfig_wait(ls);
9396 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
9399 + ls_nodes_clear(ls);
9403 +int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid)
9405 + struct dlm_csb *csb;
9407 + list_for_each_entry(csb, &ls->ls_nodes_gone, list) {
9408 + if (csb->node->nodeid == nodeid)
9413 diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
9414 --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
9415 +++ linux-patched/cluster/dlm/nodes.h 2004-11-03 11:31:56.000000000 +0800
9417 +/******************************************************************************
9418 +*******************************************************************************
9420 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9421 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9423 +** This copyrighted material is made available to anyone wishing to use,
9424 +** modify, copy, or redistribute it subject to the terms and conditions
9425 +** of the GNU General Public License v.2.
9427 +*******************************************************************************
9428 +******************************************************************************/
9430 +#ifndef __NODES_DOT_H__
9431 +#define __NODES_DOT_H__
9433 +void dlm_nodes_init(void);
9434 +int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb);
9435 +void release_csb(struct dlm_csb * csb);
9436 +uint32_t our_nodeid(void);
9437 +int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg);
9438 +int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr);
9439 +int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid);
9440 +void ls_nodes_clear(struct dlm_ls *ls);
9441 +void ls_nodes_gone_clear(struct dlm_ls *ls);
9443 +#endif /* __NODES_DOT_H__ */
9444 diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
9445 --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
9446 +++ linux-patched/cluster/dlm/proc.c 2004-11-03 11:31:56.000000000 +0800
9448 +/******************************************************************************
9449 +*******************************************************************************
9451 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9452 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9454 +** This copyrighted material is made available to anyone wishing to use,
9455 +** modify, copy, or redistribute it subject to the terms and conditions
9456 +** of the GNU General Public License v.2.
9458 +*******************************************************************************
9459 +******************************************************************************/
9461 +#include <linux/init.h>
9462 +#include <linux/proc_fs.h>
9463 +#include <linux/ctype.h>
9464 +#include <linux/seq_file.h>
9465 +#include <linux/module.h>
9467 +#include "dlm_internal.h"
9468 +#include "lockspace.h"
9470 +#if defined(DLM_DEBUG)
9471 +#define DLM_DEBUG_SIZE (1024)
9472 +#define MAX_DEBUG_MSG_LEN (64)
9474 +#define DLM_DEBUG_SIZE (0)
9475 +#define MAX_DEBUG_MSG_LEN (0)
9478 +static char * debug_buf;
9479 +static unsigned int debug_size;
9480 +static unsigned int debug_point;
9481 +static int debug_wrap;
9482 +static spinlock_t debug_lock;
9483 +static struct proc_dir_entry * debug_proc_entry = NULL;
9484 +static char proc_ls_name[255] = "";
9486 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9487 +static struct proc_dir_entry * locks_proc_entry = NULL;
9488 +static struct seq_operations locks_info_op;
9489 +static struct proc_dir_entry * dir_proc_entry = NULL;
9490 +static struct seq_operations dir_info_op;
9494 + * /proc/cluster/dlm_locks - dump resources and locks
9497 +static int locks_open(struct inode *inode, struct file *file)
9499 + return seq_open(file, &locks_info_op);
9502 +/* Write simply sets the lockspace to use */
9503 +static ssize_t locks_write(struct file *file, const char *buf,
9504 + size_t count, loff_t * ppos)
9506 + if (count < sizeof(proc_ls_name)) {
9507 + copy_from_user(proc_ls_name, buf, count);
9508 + proc_ls_name[count] = '\0';
9510 + /* Remove any trailing LF so that lazy users
9511 + can just echo "lsname" > /proc/cluster/dlm_locks */
9512 + if (proc_ls_name[count - 1] == '\n')
9513 + proc_ls_name[count - 1] = '\0';
9520 +static struct file_operations locks_fops = {
9522 + write:locks_write,
9525 + release:seq_release,
9528 +struct ls_dumpinfo {
9530 + struct list_head *next;
9531 + struct dlm_ls *ls;
9532 + struct dlm_rsb *rsb;
9533 + struct dlm_direntry *de;
9536 +static int print_resource(struct dlm_rsb * res, struct seq_file *s);
9538 +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
9543 + /* Find the next non-empty hash bucket */
9544 + for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) {
9545 + read_lock(&di->ls->ls_rsbtbl[i].lock);
9546 + if (!list_empty(&di->ls->ls_rsbtbl[i].list)) {
9547 + di->next = di->ls->ls_rsbtbl[i].list.next;
9548 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9551 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9555 + if (di->entry >= di->ls->ls_rsbtbl_size)
9556 + return NULL; /* End of hash list */
9557 + } else { /* Find the next entry in the list */
9559 + read_lock(&di->ls->ls_rsbtbl[i].lock);
9560 + di->next = di->next->next;
9561 + if (di->next->next == di->ls->ls_rsbtbl[i].list.next) {
9562 + /* End of list - move to next bucket */
9565 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9566 + return next_rsb(di); /* do the top half of this conditional */
9568 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9570 + di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain);
9575 +static void *s_start(struct seq_file *m, loff_t *pos)
9577 + struct ls_dumpinfo *di;
9578 + struct dlm_ls *ls;
9581 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9585 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9590 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9597 + for (i = 0; i < *pos; i++)
9598 + if (next_rsb(di) == NULL)
9601 + return next_rsb(di);
9604 +static void *s_next(struct seq_file *m, void *p, loff_t *pos)
9606 + struct ls_dumpinfo *di = p;
9610 + return next_rsb(di);
9613 +static int s_show(struct seq_file *m, void *p)
9615 + struct ls_dumpinfo *di = p;
9616 + return print_resource(di->rsb, m);
9619 +static void s_stop(struct seq_file *m, void *p)
9624 +static struct seq_operations locks_info_op = {
9631 +static char *print_lockmode(int mode)
9653 +static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
9654 + struct dlm_rsb *res)
9657 + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
9659 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9660 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9661 + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
9663 + if (lkb->lkb_range) {
9664 + /* This warns on Alpha. Tough. Only I see it */
9665 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9666 + || lkb->lkb_status == GDLM_LKSTS_GRANTED)
9667 + seq_printf(s, " %" PRIx64 "-%" PRIx64,
9668 + lkb->lkb_range[GR_RANGE_START],
9669 + lkb->lkb_range[GR_RANGE_END]);
9670 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9671 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9672 + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
9673 + lkb->lkb_range[RQ_RANGE_START],
9674 + lkb->lkb_range[RQ_RANGE_END]);
9677 + if (lkb->lkb_nodeid) {
9678 + if (lkb->lkb_nodeid != res->res_nodeid)
9679 + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
9682 + seq_printf(s, " Master: %08x", lkb->lkb_remid);
9685 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9686 + seq_printf(s, " LQ: %d,0x%x", lkb->lkb_lockqueue_state,
9687 + lkb->lkb_lockqueue_flags);
9689 + seq_printf(s, "\n");
9692 +static int print_resource(struct dlm_rsb *res, struct seq_file *s)
9695 + struct list_head *locklist;
9697 + seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9698 + res->res_parent, res->res_length);
9699 + for (i = 0; i < res->res_length; i++) {
9700 + if (isprint(res->res_name[i]))
9701 + seq_printf(s, "%c", res->res_name[i]);
9703 + seq_printf(s, "%c", '.');
9705 + if (res->res_nodeid)
9706 + seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
9709 + seq_printf(s, "\" \nMaster Copy\n");
9711 + /* Print the LVB: */
9712 + if (res->res_lvbptr) {
9713 + seq_printf(s, "LVB: ");
9714 + for (i = 0; i < DLM_LVB_LEN; i++) {
9715 + if (i == DLM_LVB_LEN / 2)
9716 + seq_printf(s, "\n ");
9717 + seq_printf(s, "%02x ",
9718 + (unsigned char) res->res_lvbptr[i]);
9720 + seq_printf(s, "\n");
9723 + /* Print the locks attached to this resource */
9724 + seq_printf(s, "Granted Queue\n");
9725 + list_for_each(locklist, &res->res_grantqueue) {
9726 + struct dlm_lkb *this_lkb =
9727 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9728 + print_lock(s, this_lkb, res);
9731 + seq_printf(s, "Conversion Queue\n");
9732 + list_for_each(locklist, &res->res_convertqueue) {
9733 + struct dlm_lkb *this_lkb =
9734 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9735 + print_lock(s, this_lkb, res);
9738 + seq_printf(s, "Waiting Queue\n");
9739 + list_for_each(locklist, &res->res_waitqueue) {
9740 + struct dlm_lkb *this_lkb =
9741 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9742 + print_lock(s, this_lkb, res);
9750 + * /proc/cluster/dlm_dir - dump resource directory
9753 +static int print_de(struct dlm_direntry *de, struct seq_file *s)
9755 + char strname[DLM_RESNAME_MAXLEN+1];
9757 + memset(strname, 0, DLM_RESNAME_MAXLEN+1);
9758 + memcpy(strname, de->name, de->length);
9760 + seq_printf(s, "%s %u\n", strname, de->master_nodeid);
9764 +static int dir_open(struct inode *inode, struct file *file)
9766 + return seq_open(file, &dir_info_op);
9769 +static ssize_t dir_write(struct file *file, const char *buf,
9770 + size_t count, loff_t *ppos)
9772 + return locks_write(file, buf, count, ppos);
9775 +static struct file_operations dir_fops = {
9777 + .write = dir_write,
9779 + .llseek = seq_lseek,
9780 + .release = seq_release,
9781 + .owner = THIS_MODULE,
9784 +static struct ls_dumpinfo *next_de(struct ls_dumpinfo *di)
9789 + /* Find the next non-empty hash bucket */
9790 + for (i = di->entry; i < di->ls->ls_dirtbl_size; i++) {
9791 + read_lock(&di->ls->ls_dirtbl[i].lock);
9792 + if (!list_empty(&di->ls->ls_dirtbl[i].list)) {
9793 + di->next = di->ls->ls_dirtbl[i].list.next;
9794 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9797 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9801 + if (di->entry >= di->ls->ls_dirtbl_size)
9802 + return NULL; /* End of hash list */
9803 + } else { /* Find the next entry in the list */
9805 + read_lock(&di->ls->ls_dirtbl[i].lock);
9806 + di->next = di->next->next;
9807 + if (di->next->next == di->ls->ls_dirtbl[i].list.next) {
9808 + /* End of list - move to next bucket */
9811 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9812 + return next_de(di); /* do the top half of this conditional */
9814 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9816 + di->de = list_entry(di->next, struct dlm_direntry, list);
9821 +static void *dir_start(struct seq_file *m, loff_t *pos)
9823 + struct ls_dumpinfo *di;
9824 + struct dlm_ls *ls;
9827 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9831 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9836 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9842 + for (i = 0; i < *pos; i++)
9843 + if (next_de(di) == NULL)
9846 + return next_de(di);
9849 +static void *dir_next(struct seq_file *m, void *p, loff_t *pos)
9851 + struct ls_dumpinfo *di = p;
9855 + return next_de(di);
9858 +static int dir_show(struct seq_file *m, void *p)
9860 + struct ls_dumpinfo *di = p;
9861 + return print_de(di->de, m);
9864 +static void dir_stop(struct seq_file *m, void *p)
9869 +static struct seq_operations dir_info_op = {
9870 + .start = dir_start,
9875 +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9877 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...)
9880 + int i, n, size, len;
9881 + char buf[MAX_DEBUG_MSG_LEN+1];
9883 + spin_lock(&debug_lock);
9888 + size = MAX_DEBUG_MSG_LEN;
9889 + memset(buf, 0, size+1);
9891 + n = snprintf(buf, size, "%s ", ls->ls_name);
9894 + va_start(va, fmt);
9895 + vsnprintf(buf+n, size, fmt, va);
9898 + len = strlen(buf);
9899 + if (len > MAX_DEBUG_MSG_LEN-1)
9900 + len = MAX_DEBUG_MSG_LEN-1;
9902 + buf[len+1] = '\0';
9904 + for (i = 0; i < strlen(buf); i++) {
9905 + debug_buf[debug_point++] = buf[i];
9907 + if (debug_point == debug_size) {
9913 + spin_unlock(&debug_lock);
9916 +void dlm_debug_dump(void)
9920 + spin_lock(&debug_lock);
9922 + for (i = debug_point; i < debug_size; i++)
9923 + printk("%c", debug_buf[i]);
9925 + for (i = 0; i < debug_point; i++)
9926 + printk("%c", debug_buf[i]);
9927 + spin_unlock(&debug_lock);
9930 +void dlm_debug_setup(int size)
9934 + if (size > PAGE_SIZE)
9937 + b = kmalloc(size, GFP_KERNEL);
9939 + spin_lock(&debug_lock);
9944 + debug_size = size;
9948 + memset(debug_buf, 0, debug_size);
9950 + spin_unlock(&debug_lock);
9953 +static void dlm_debug_init(void)
9959 + spin_lock_init(&debug_lock);
9961 + dlm_debug_setup(DLM_DEBUG_SIZE);
9964 +#ifdef CONFIG_PROC_FS
9965 +int dlm_debug_info(char *b, char **start, off_t offset, int length)
9969 + spin_lock(&debug_lock);
9972 + for (i = debug_point; i < debug_size; i++)
9973 + n += sprintf(b + n, "%c", debug_buf[i]);
9975 + for (i = 0; i < debug_point; i++)
9976 + n += sprintf(b + n, "%c", debug_buf[i]);
9978 + spin_unlock(&debug_lock);
9984 +#ifdef CONFIG_DLM_STATS
9985 +struct dlm_statinfo dlm_stats;
9986 +static struct proc_dir_entry *stats_proc_entry = NULL;
9987 +static int dlm_stats_info(char *b, char **start, off_t offset, int length)
9991 + long lq_locks = 0;
9992 + unsigned long lq_time = 0;
9994 + n += sprintf(b+n, "DLM stats (HZ=%d)\n\n", HZ);
9995 + n += sprintf(b+n, "Lock operations: %7d\n", dlm_stats.lockops);
9996 + n += sprintf(b+n, "Unlock operations: %7d\n", dlm_stats.unlockops);
9997 + n += sprintf(b+n, "Convert operations: %7d\n", dlm_stats.convertops);
9998 + n += sprintf(b+n, "Completion ASTs: %7d\n", dlm_stats.cast);
9999 + n += sprintf(b+n, "Blocking ASTs: %7d\n", dlm_stats.bast);
10000 + n += sprintf(b+n, "\n");
10001 + n += sprintf(b+n, "Lockqueue num waittime ave\n");
10002 + for (i=1; i<=4 ; i++) {
10003 + char *lq_reason="???";
10005 + case 1: lq_reason = "WAIT_RSB ";
10007 + case 2: lq_reason = "WAIT_CONV ";
10009 + case 3: lq_reason = "WAIT_GRANT ";
10011 + case 4: lq_reason = "WAIT_UNLOCK";
10014 + if (dlm_stats.lockqueue_locks[i])
10015 + n += sprintf(b+n, "%s %6lu %7lu %3lu\n",
10017 + dlm_stats.lockqueue_locks[i],
10018 + dlm_stats.lockqueue_time[i],
10019 + dlm_stats.lockqueue_time[i]/
10020 + dlm_stats.lockqueue_locks[i]);
10022 + lq_locks += dlm_stats.lockqueue_locks[i];
10023 + lq_time += dlm_stats.lockqueue_time[i];
10026 + n += sprintf(b+n, "Total %6lu %7lu %3lu\n",
10027 + lq_locks, lq_time, lq_time/lq_locks);
10031 +static int dlm_stats_clear(struct file *file, const char __user *buffer,
10032 + unsigned long count, void *data)
10034 + memset(&dlm_stats, 0, sizeof(dlm_stats));
10037 +#endif /* CONFIG_DLM_STATS */
10039 +void dlm_proc_init(void)
10041 +#ifdef CONFIG_PROC_FS
10042 + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
10044 + if (!debug_proc_entry)
10047 + debug_proc_entry->get_info = &dlm_debug_info;
10050 +#ifdef CONFIG_DLM_STATS
10051 + stats_proc_entry = create_proc_entry("cluster/dlm_stats",
10052 + S_IRUSR | S_IWUSR, NULL);
10053 + if (!stats_proc_entry)
10056 + stats_proc_entry->get_info = &dlm_stats_info;
10057 + stats_proc_entry->write_proc = &dlm_stats_clear;
10060 + dlm_debug_init();
10062 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
10063 + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
10065 + NULL, NULL, NULL);
10066 + if (!locks_proc_entry)
10068 + locks_proc_entry->proc_fops = &locks_fops;
10070 + dir_proc_entry = create_proc_read_entry("cluster/dlm_dir",
10072 + NULL, NULL, NULL);
10073 + if (!dir_proc_entry)
10075 + dir_proc_entry->proc_fops = &dir_fops;
10079 +void dlm_proc_exit(void)
10081 +#ifdef CONFIG_PROC_FS
10082 + if (debug_proc_entry) {
10083 + remove_proc_entry("cluster/dlm_debug", NULL);
10084 + dlm_debug_setup(0);
10088 +#ifdef CONFIG_DLM_STATS
10089 + if (stats_proc_entry)
10090 + remove_proc_entry("cluster/dlm_stats", NULL);
10093 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
10094 + if (locks_proc_entry)
10095 + remove_proc_entry("cluster/dlm_locks", NULL);
10096 + if (dir_proc_entry)
10097 + remove_proc_entry("cluster/dlm_dir", NULL);
10100 diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
10101 --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
10102 +++ linux-patched/cluster/dlm/queries.c 2004-11-03 11:31:56.000000000 +0800
10104 +/******************************************************************************
10105 +*******************************************************************************
10107 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10108 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10110 +** This copyrighted material is made available to anyone wishing to use,
10111 +** modify, copy, or redistribute it subject to the terms and conditions
10112 +** of the GNU General Public License v.2.
10114 +*******************************************************************************
10115 +******************************************************************************/
10120 + * This file provides the kernel query interface to the DLM.
10124 +#define EXPORT_SYMTAB
10125 +#include <linux/module.h>
10127 +#include "dlm_internal.h"
10128 +#include "lockspace.h"
10129 +#include "lockqueue.h"
10130 +#include "locking.h"
10132 +#include "nodes.h"
10135 +#include "memory.h"
10136 +#include "lowcomms.h"
10137 +#include "midcomms.h"
10140 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo);
10141 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo);
10144 + * API entry point.
10146 +int dlm_query(void *lockspace,
10147 + struct dlm_lksb *lksb,
10149 + struct dlm_queryinfo *qinfo,
10150 + void (ast_routine(void *)),
10153 + int status = -EINVAL;
10154 + struct dlm_lkb *target_lkb;
10155 + struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */
10156 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
10162 + if (!ast_routine)
10167 + if (!qinfo->gqi_lockinfo)
10168 + qinfo->gqi_locksize = 0;
10170 + /* Find the lkid */
10171 + target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
10175 + /* If the user wants a list of locks that are blocking or
10176 + not blocking this lock, then it must be waiting
10179 + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
10180 + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
10181 + target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
10184 + /* We now allocate an LKB for our own use (so we can hang
10185 + * things like the AST routine and the lksb from it) */
10186 + lksb->sb_status = -EBUSY;
10187 + query_lkb = create_lkb(ls);
10188 + if (!query_lkb) {
10189 + status = -ENOMEM;
10192 + query_lkb->lkb_astaddr = ast_routine;
10193 + query_lkb->lkb_astparam = (long)astarg;
10194 + query_lkb->lkb_resource = target_lkb->lkb_resource;
10195 + query_lkb->lkb_lksb = lksb;
10197 + /* Don't free the resource while we are querying it. This ref
10198 + * will be dropped when the LKB is freed */
10199 + hold_rsb(query_lkb->lkb_resource);
10201 + /* Fill in the stuff that's always local */
10202 + if (qinfo->gqi_resinfo) {
10203 + if (target_lkb->lkb_resource->res_nodeid)
10204 + qinfo->gqi_resinfo->rsi_masternode =
10205 + target_lkb->lkb_resource->res_nodeid;
10207 + qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
10208 + qinfo->gqi_resinfo->rsi_length =
10209 + target_lkb->lkb_resource->res_length;
10210 + memcpy(qinfo->gqi_resinfo->rsi_name,
10211 + target_lkb->lkb_resource->res_name,
10212 + qinfo->gqi_resinfo->rsi_length);
10215 + /* If the master is local (or the user doesn't want the overhead of a
10216 + * remote call) - fill in the details here */
10217 + if (target_lkb->lkb_resource->res_nodeid == 0 ||
10218 + (query & DLM_QUERY_LOCAL)) {
10221 + /* Resource info */
10222 + if (qinfo->gqi_resinfo) {
10223 + query_resource(target_lkb->lkb_resource,
10224 + qinfo->gqi_resinfo);
10228 + if (qinfo->gqi_lockinfo) {
10229 + status = query_locks(query, target_lkb, qinfo);
10232 + query_lkb->lkb_retstatus = status;
10233 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
10236 + /* An AST will be delivered so we must return success here */
10241 + /* Remote master */
10242 + if (target_lkb->lkb_resource->res_nodeid != 0)
10244 + struct dlm_query_request *remquery;
10245 + struct writequeue_entry *e;
10247 + /* Clear this cos the receiving end adds to it with
10248 + each incoming packet */
10249 + qinfo->gqi_lockcount = 0;
10251 + /* Squirrel a pointer to the query info struct
10252 + somewhere illegal */
10253 + query_lkb->lkb_request = (struct dlm_request *) qinfo;
10255 + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
10256 + sizeof(struct dlm_query_request),
10257 + ls->ls_allocation,
10258 + (char **) &remquery);
10260 + status = -ENOBUFS;
10264 + /* Build remote packet */
10265 + memset(remquery, 0, sizeof(struct dlm_query_request));
10267 + remquery->rq_maxlocks = qinfo->gqi_locksize;
10268 + remquery->rq_query = query;
10269 + remquery->rq_mstlkid = target_lkb->lkb_remid;
10270 + if (qinfo->gqi_lockinfo)
10271 + remquery->rq_maxlocks = qinfo->gqi_locksize;
10273 + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
10274 + remquery->rq_header.rh_flags = 0;
10275 + remquery->rq_header.rh_length = sizeof(struct dlm_query_request);
10276 + remquery->rq_header.rh_lkid = query_lkb->lkb_id;
10277 + remquery->rq_header.rh_lockspace = ls->ls_global_id;
10279 + midcomms_send_buffer(&remquery->rq_header, e);
10284 + put_lockspace(ls);
10288 +static inline int valid_range(struct dlm_range *r)
10290 + if (r->ra_start != 0ULL ||
10291 + r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
10297 +static void put_int(int x, char *buf, int *offp)
10299 + x = cpu_to_le32(x);
10300 + memcpy(buf + *offp, &x, sizeof(int));
10301 + *offp += sizeof(int);
10304 +static void put_int64(uint64_t x, char *buf, int *offp)
10306 + x = cpu_to_le64(x);
10307 + memcpy(buf + *offp, &x, sizeof(uint64_t));
10308 + *offp += sizeof(uint64_t);
10311 +static int get_int(char *buf, int *offp)
10314 + memcpy(&value, buf + *offp, sizeof(int));
10315 + *offp += sizeof(int);
10316 + return le32_to_cpu(value);
10319 +static uint64_t get_int64(char *buf, int *offp)
10323 + memcpy(&value, buf + *offp, sizeof(uint64_t));
10324 + *offp += sizeof(uint64_t);
10325 + return le64_to_cpu(value);
10328 +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
10330 +/* Called from recvd to get lock info for a remote node */
10331 +int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
10333 + struct dlm_query_request *query = (struct dlm_query_request *) msg;
10334 + struct dlm_query_reply *reply;
10335 + struct dlm_resinfo resinfo;
10336 + struct dlm_queryinfo qinfo;
10337 + struct writequeue_entry *e;
10339 + struct dlm_lkb *lkb;
10342 + int finished = 0;
10343 + int cur_lock = 0;
10344 + int start_lock = 0;
10346 + lkb = find_lock_by_id(ls, query->rq_mstlkid);
10348 + status = -EINVAL;
10352 + qinfo.gqi_resinfo = &resinfo;
10353 + qinfo.gqi_locksize = query->rq_maxlocks;
10355 + /* Get the resource bits */
10356 + query_resource(lkb->lkb_resource, &resinfo);
10358 + /* Now get the locks if wanted */
10359 + if (query->rq_maxlocks) {
10360 + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
10362 + if (!qinfo.gqi_lockinfo) {
10363 + status = -ENOMEM;
10367 + status = query_locks(query->rq_query, lkb, &qinfo);
10368 + if (status && status != -E2BIG) {
10369 + kfree(qinfo.gqi_lockinfo);
10374 + qinfo.gqi_lockinfo = NULL;
10375 + qinfo.gqi_lockcount = 0;
10378 + /* Send as many blocks as needed for all the locks */
10381 + int msg_len = sizeof(struct dlm_query_reply);
10382 + int last_msg_len = msg_len; /* keeps compiler quiet */
10385 + /* First work out how many locks we can fit into a block */
10386 + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
10388 + last_msg_len = msg_len;
10390 + msg_len += LOCK_LEN;
10391 + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
10392 + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
10394 + msg_len += sizeof(uint64_t) * 4;
10398 + /* There must be a neater way of doing this... */
10399 + if (msg_len > PAGE_SIZE) {
10401 + msg_len = last_msg_len;
10407 + e = lowcomms_get_buffer(nodeid,
10409 + ls->ls_allocation,
10410 + (char **) &reply);
10412 + kfree(qinfo.gqi_lockinfo);
10413 + status = -ENOBUFS;
10417 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
10418 + reply->rq_header.rh_length = msg_len;
10419 + reply->rq_header.rh_lkid = msg->rh_lkid;
10420 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
10422 + reply->rq_status = status;
10423 + reply->rq_startlock = cur_lock;
10424 + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
10425 + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
10426 + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
10427 + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
10429 + buf = (char *)reply;
10430 + bufidx = sizeof(struct dlm_query_reply);
10432 + for (; cur_lock < last_lock; cur_lock++) {
10434 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
10435 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
10436 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
10437 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
10438 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
10439 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
10440 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
10441 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_ownpid, buf, &bufidx);
10443 + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
10444 + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
10446 + buf[bufidx++] = 1;
10447 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
10448 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
10449 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
10450 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
10453 + buf[bufidx++] = 0;
10457 + if (cur_lock == qinfo.gqi_lockcount) {
10458 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
10462 + reply->rq_header.rh_flags = 0;
10465 + reply->rq_numlocks = cur_lock - start_lock;
10466 + start_lock = cur_lock;
10468 + midcomms_send_buffer(&reply->rq_header, e);
10469 + } while (!finished);
10471 + kfree(qinfo.gqi_lockinfo);
10476 + e = lowcomms_get_buffer(nodeid,
10477 + sizeof(struct dlm_query_reply),
10478 + ls->ls_allocation,
10479 + (char **) &reply);
10481 + status = -ENOBUFS;
10484 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
10485 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
10486 + reply->rq_header.rh_length = sizeof(struct dlm_query_reply);
10487 + reply->rq_header.rh_lkid = msg->rh_lkid;
10488 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
10489 + reply->rq_status = status;
10490 + reply->rq_numlocks = 0;
10491 + reply->rq_startlock = 0;
10492 + reply->rq_grantcount = 0;
10493 + reply->rq_convcount = 0;
10494 + reply->rq_waitcount = 0;
10496 + midcomms_send_buffer(&reply->rq_header, e);
10501 +/* Reply to a remote query */
10502 +int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
10504 + struct dlm_lkb *query_lkb;
10505 + struct dlm_queryinfo *qinfo;
10506 + struct dlm_query_reply *reply;
10511 + query_lkb = find_lock_by_id(ls, msg->rh_lkid);
10515 + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
10516 + reply = (struct dlm_query_reply *) msg;
10518 + /* Copy the easy bits first */
10519 + qinfo->gqi_lockcount += reply->rq_numlocks;
10520 + if (qinfo->gqi_resinfo) {
10521 + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
10522 + qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
10523 + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
10524 + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
10528 + /* Now unpack the locks */
10529 + bufidx = sizeof(struct dlm_query_reply);
10530 + buf = (char *) msg;
10532 + DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
10533 + printk("start = %d, num + %d. Max= %d\n",
10534 + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
10536 + for (i = reply->rq_startlock;
10537 + i < reply->rq_startlock + reply->rq_numlocks; i++) {
10538 + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
10539 + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
10540 + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
10541 + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
10542 + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
10543 + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
10544 + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
10545 + qinfo->gqi_lockinfo[i].lki_ownpid = get_int(buf, &bufidx);
10546 + if (buf[bufidx++]) {
10547 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
10548 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
10549 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
10550 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
10553 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
10554 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
10555 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
10556 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
10560 + /* If this was the last block then now tell the user */
10561 + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
10562 + query_lkb->lkb_retstatus = reply->rq_status;
10563 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
10570 +/* Aggregate resource information */
10571 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo)
10573 + struct list_head *tmp;
10575 + if (rsb->res_lvbptr)
10576 + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
10578 + down_read(&rsb->res_lock);
10579 + resinfo->rsi_grantcount = 0;
10580 + list_for_each(tmp, &rsb->res_grantqueue) {
10581 + resinfo->rsi_grantcount++;
10584 + resinfo->rsi_waitcount = 0;
10585 + list_for_each(tmp, &rsb->res_waitqueue) {
10586 + resinfo->rsi_waitcount++;
10589 + resinfo->rsi_convcount = 0;
10590 + list_for_each(tmp, &rsb->res_convertqueue) {
10591 + resinfo->rsi_convcount++;
10593 + up_read(&rsb->res_lock);
10598 +static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10602 +	/* Don't fill it in if the buffer is full */
10603 +	if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
10606 +	/* gqi_lockcount contains the number of locks we have returned */
10607 +	entry = qinfo->gqi_lockcount++;
10609 +	/* Fun with master copies */
10610 +	if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
10611 +		qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
10612 +		qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
10615 +		qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
10616 +		qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
10619 +	/* Also make sure we always have a valid nodeid in there, the
10620 +	   calling end may not know which node "0" is */
10621 +	if (lkb->lkb_nodeid)
10622 +		qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
10624 +		qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
10626 +	if (lkb->lkb_parent)
10627 +		qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
10629 +		qinfo->gqi_lockinfo[entry].lki_parent = 0;
10631 +	qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
10632 +	qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
10633 +	qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
10634 +	qinfo->gqi_lockinfo[entry].lki_ownpid = lkb->lkb_ownpid;
10636 +	if (lkb->lkb_range) {
10637 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
10638 +			lkb->lkb_range[GR_RANGE_START];
10639 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
10640 +			lkb->lkb_range[GR_RANGE_END];
10641 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
10642 +			lkb->lkb_range[RQ_RANGE_START];
10643 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
10644 +			lkb->lkb_range[RQ_RANGE_END];
10646 +		/* No range on this lock: use the full-coverage sentinels
10646 +		   (0 .. 0xffffffffffffffff) that valid_range() and the
10646 +		   remote_query_reply() default branch expect.  The original
10646 +		   code assigned 0xffffffffffffffff to ra_start (a copy-paste
10646 +		   slip), clobbering the 0 and leaving ra_end uninitialized. */
10646 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
10647 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
10648 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
10649 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
10654 +static int query_lkb_queue(struct dlm_rsb *rsb,
10655 + struct list_head *queue, int query,
10656 + struct dlm_queryinfo *qinfo)
10658 + struct list_head *tmp;
10660 + int mode = query & DLM_QUERY_MODE_MASK;
10662 + down_read(&rsb->res_lock);
10663 + list_for_each(tmp, queue) {
10664 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10667 + if (query & DLM_QUERY_RQMODE)
10668 + lkmode = lkb->lkb_rqmode;
10670 + lkmode = lkb->lkb_grmode;
10672 + /* Add the LKB info to the list if it matches the criteria in
10673 + * the query bitmap */
10674 + switch (query & DLM_QUERY_MASK) {
10675 + case DLM_QUERY_LOCKS_ALL:
10676 + status = add_lock(lkb, qinfo);
10679 + case DLM_QUERY_LOCKS_HIGHER:
10680 + if (lkmode > mode)
10681 + status = add_lock(lkb, qinfo);
10684 + case DLM_QUERY_LOCKS_EQUAL:
10685 + if (lkmode == mode)
10686 + status = add_lock(lkb, qinfo);
10689 + case DLM_QUERY_LOCKS_LOWER:
10690 + if (lkmode < mode)
10691 + status = add_lock(lkb, qinfo);
10693 + case DLM_QUERY_LOCKS_ORPHAN:
10694 + if (lkb->lkb_flags & GDLM_LKFLG_ORPHAN)
10695 + status = add_lock(lkb, qinfo);
10699 + up_read(&rsb->res_lock);
10704 + * Return 1 if the locks' ranges overlap
10705 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
10707 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
10709 + if (!lkb1->lkb_range || !lkb2->lkb_range)
10712 + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
10713 + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
10718 +extern const int __dlm_compat_matrix[8][8];
10721 +static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
10723 + struct list_head *tmp;
10726 + down_read(&qlkb->lkb_resource->res_lock);
10727 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
10728 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10730 + if (ranges_overlap(lkb, qlkb) &&
10731 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
10732 + status = add_lock(lkb, qinfo);
10734 + up_read(&qlkb->lkb_resource->res_lock);
10739 +static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
10741 + struct list_head *tmp;
10744 + down_read(&qlkb->lkb_resource->res_lock);
10745 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
10746 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10748 + if (!(ranges_overlap(lkb, qlkb) &&
10749 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
10750 + status = add_lock(lkb, qinfo);
10752 + up_read(&qlkb->lkb_resource->res_lock);
10757 +/* Gather a list of appropriate locks */
10758 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10763 + /* Mask in the actual granted/requsted mode of the lock if LOCK_THIS
10764 + * was requested as the mode
10766 + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
10767 + query &= ~DLM_QUERY_MODE_MASK;
10768 + if (query & DLM_QUERY_RQMODE)
10769 + query |= lkb->lkb_rqmode;
10771 + query |= lkb->lkb_grmode;
10774 + qinfo->gqi_lockcount = 0;
10776 + /* BLOCKING/NOTBLOCK only look at the granted queue */
10777 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
10778 + return get_blocking_locks(lkb, qinfo);
10780 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
10781 + return get_nonblocking_locks(lkb, qinfo);
10783 + /* Do the lock queues that were requested */
10784 + if (query & DLM_QUERY_QUEUE_GRANT) {
10785 + status = query_lkb_queue(lkb->lkb_resource,
10786 + &lkb->lkb_resource->res_grantqueue,
10790 + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
10791 + status = query_lkb_queue(lkb->lkb_resource,
10792 + &lkb->lkb_resource->res_convertqueue,
10796 + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
10797 + status = query_lkb_queue(lkb->lkb_resource,
10798 + &lkb->lkb_resource->res_waitqueue,
10806 +EXPORT_SYMBOL(dlm_query);
10808 + * Overrides for Emacs so that we follow Linus's tabbing style.
10809 + * Emacs will notice this stuff at the end of the file and automatically
10810 + * adjust the settings for this buffer only. This must remain at the end
10812 + * ---------------------------------------------------------------------------
10813 + * Local variables:
10814 + * c-file-style: "linux"
10817 diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
10818 --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
10819 +++ linux-patched/cluster/dlm/queries.h 2004-11-03 11:31:56.000000000 +0800
10821 +/******************************************************************************
10822 +*******************************************************************************
10824 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10825 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10827 +** This copyrighted material is made available to anyone wishing to use,
10828 +** modify, copy, or redistribute it subject to the terms and conditions
10829 +** of the GNU General Public License v.2.
10831 +*******************************************************************************
10832 +******************************************************************************/
10834 +#ifndef __QUERIES_DOT_H__
10835 +#define __QUERIES_DOT_H__
10837 +extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10838 +extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10840 +#endif /* __QUERIES_DOT_H__ */
10841 diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
10842 --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
10843 +++ linux-patched/cluster/dlm/rebuild.c 2004-11-03 11:31:56.000000000 +0800
10845 +/******************************************************************************
10846 +*******************************************************************************
10848 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10849 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10851 +** This copyrighted material is made available to anyone wishing to use,
10852 +** modify, copy, or redistribute it subject to the terms and conditions
10853 +** of the GNU General Public License v.2.
10855 +*******************************************************************************
10856 +******************************************************************************/
10859 + * Rebuild RSB's on new masters. Functions for transferring locks and
10860 + * subresources to new RSB masters during recovery.
10863 +#include "dlm_internal.h"
10864 +#include "reccomms.h"
10867 +#include "nodes.h"
10868 +#include "config.h"
10869 +#include "memory.h"
10870 +#include "recover.h"
10873 +/* Types of entity serialised in remastering messages */
10874 +#define REMASTER_ROOTRSB 1
10875 +#define REMASTER_RSB 2
10876 +#define REMASTER_LKB 3
10878 +struct rcom_fill {
10879 + char * outbuf; /* Beginning of data */
10880 + int offset; /* Current offset into outbuf */
10881 + int maxlen; /* Max value of offset */
10884 + struct dlm_rsb * rsb;
10885 + struct dlm_rsb * subrsb;
10886 + struct dlm_lkb * lkb;
10887 + struct list_head * lkbqueue;
10890 +typedef struct rcom_fill rcom_fill_t;
10893 +struct rebuild_node {
10894 + struct list_head list;
10896 + struct dlm_rsb * rootrsb;
10898 +typedef struct rebuild_node rebuild_node_t;
10902 + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10903 + * master. The rsb will be "done" with recovery when the new master has
10904 + * replied with all the new remote lockid's for this rsb's lkb's.
10907 +void expect_new_lkids(struct dlm_rsb *rsb)
10909 + rsb->res_newlkid_expect = 0;
10910 + recover_list_add(rsb);
10914 + * This function is called on root rsb or subrsb when another lkb is being sent
10915 + * to the new master for which we expect to receive a corresponding remote lkid
10918 +void need_new_lkid(struct dlm_rsb *rsb)
10920 + struct dlm_rsb *root = rsb;
10922 + if (rsb->res_parent)
10923 + root = rsb->res_root;
10925 + if (!root->res_newlkid_expect)
10926 + recover_list_add(root);
10928 + DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10930 + root->res_newlkid_expect++;
10934 + * This function is called for each lkb for which a new remote lkid is
10935 + * received. Decrement the expected number of remote lkids expected for the
10939 +void have_new_lkid(struct dlm_lkb *lkb)
10941 + struct dlm_rsb *root = lkb->lkb_resource;
10943 + if (root->res_parent)
10944 + root = root->res_root;
10946 + down_write(&root->res_lock);
10948 + DLM_ASSERT(root->res_newlkid_expect,
10949 + printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10951 + root->res_newlkid_expect--;
10953 + if (!root->res_newlkid_expect) {
10954 + clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10955 + recover_list_del(root);
10957 + up_write(&root->res_lock);
10961 + * Return the rebuild struct for a node - will create an entry on the rootrsb
10962 + * list if necessary.
10964 + * Currently no locking is needed here as it all happens in the dlm_recvd
10968 +static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid)
10970 + rebuild_node_t *node = NULL;
10972 + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10973 + if (node->nodeid == nodeid)
10977 + /* Not found, add one */
10978 + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10982 + node->nodeid = nodeid;
10983 + node->rootrsb = NULL;
10984 + list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10990 + * Tidy up after a rebuild run. Called when all recovery has finished
10993 +void rebuild_freemem(struct dlm_ls *ls)
10995 + rebuild_node_t *node = NULL, *s;
10997 + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10998 + list_del(&node->list);
11003 +static void put_int(int x, char *buf, int *offp)
11005 + x = cpu_to_le32(x);
11006 + memcpy(buf + *offp, &x, sizeof(int));
11007 + *offp += sizeof(int);
11010 +static void put_int64(uint64_t x, char *buf, int *offp)
11012 + x = cpu_to_le64(x);
11013 + memcpy(buf + *offp, &x, sizeof(uint64_t));
11014 + *offp += sizeof(uint64_t);
11017 +static void put_bytes(char *x, int len, char *buf, int *offp)
11019 + put_int(len, buf, offp);
11020 + memcpy(buf + *offp, x, len);
11024 +static void put_char(char x, char *buf, int *offp)
11030 +static int get_int(char *buf, int *offp)
11033 + memcpy(&value, buf + *offp, sizeof(int));
11034 + *offp += sizeof(int);
11035 + return le32_to_cpu(value);
11038 +static uint64_t get_int64(char *buf, int *offp)
11042 + memcpy(&value, buf + *offp, sizeof(uint64_t));
11043 + *offp += sizeof(uint64_t);
11044 + return le64_to_cpu(value);
11047 +static char get_char(char *buf, int *offp)
11049 + char x = buf[*offp];
11055 +static void get_bytes(char *bytes, int *len, char *buf, int *offp)
11057 + *len = get_int(buf, offp);
11058 + memcpy(bytes, buf + *offp, *len);
11062 +static int lkb_length(struct dlm_lkb *lkb)
11066 + len += sizeof(int); /* lkb_id */
11067 + len += sizeof(int); /* lkb_resource->res_reamasterid */
11068 + len += sizeof(int); /* lkb_flags */
11069 + len += sizeof(int); /* lkb_status */
11070 + len += sizeof(char); /* lkb_rqmode */
11071 + len += sizeof(char); /* lkb_grmode */
11072 + len += sizeof(int); /* lkb_childcnt */
11073 + len += sizeof(int); /* lkb_parent->lkb_id */
11074 + len += sizeof(int); /* lkb_bastaddr */
11075 + len += sizeof(int); /* lkb_ownpid */
11077 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11078 + len += sizeof(int); /* number of lvb bytes */
11079 + len += DLM_LVB_LEN;
11082 + if (lkb->lkb_range) {
11083 + len += sizeof(uint64_t);
11084 + len += sizeof(uint64_t);
11085 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
11086 + len += sizeof(uint64_t);
11087 + len += sizeof(uint64_t);
11095 + * It's up to the caller to be sure there's enough space in the buffer.
11098 +static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp)
11102 + /* Need to tell the remote end if we have a range */
11103 + flags = lkb->lkb_flags;
11104 + if (lkb->lkb_range)
11105 + flags |= GDLM_LKFLG_RANGE;
11108 + * See lkb_length()
11109 + * Total: 30 (no lvb) or 66 (with lvb) bytes
11112 + put_int(lkb->lkb_id, buf, offp);
11113 + put_int(lkb->lkb_resource->res_remasterid, buf, offp);
11114 + put_int(flags, buf, offp);
11115 + put_int(lkb->lkb_status, buf, offp);
11116 + put_char(lkb->lkb_rqmode, buf, offp);
11117 + put_char(lkb->lkb_grmode, buf, offp);
11118 + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
11120 + if (lkb->lkb_parent)
11121 + put_int(lkb->lkb_parent->lkb_id, buf, offp);
11123 + put_int(0, buf, offp);
11125 + if (lkb->lkb_bastaddr)
11126 + put_int(1, buf, offp);
11128 + put_int(0, buf, offp);
11129 + put_int(lkb->lkb_ownpid, buf, offp);
11131 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11132 + DLM_ASSERT(lkb->lkb_lvbptr,);
11133 + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
11136 + /* Only send the range we actually need */
11137 + if (lkb->lkb_range) {
11138 + switch (lkb->lkb_status) {
11139 + case GDLM_LKSTS_CONVERT:
11140 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
11141 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
11142 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
11143 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
11145 + case GDLM_LKSTS_WAITING:
11146 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
11147 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
11149 + case GDLM_LKSTS_GRANTED:
11150 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
11151 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
11159 +static int rsb_length(struct dlm_rsb *rsb)
11163 + len += sizeof(int); /* number of res_name bytes */
11164 + len += rsb->res_length; /* res_name */
11165 + len += sizeof(int); /* res_remasterid */
11166 + len += sizeof(int); /* res_parent->res_remasterid */
11171 +static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb)
11173 + struct list_head *tmp;
11174 + struct dlm_rsb *r;
11176 + tmp = subrsb->res_subreslist.next;
11177 + r = list_entry(tmp, struct dlm_rsb, res_subreslist);
11182 +static inline int last_in_list(struct dlm_rsb *r, struct list_head *head)
11184 + struct dlm_rsb *last;
11185 + last = list_entry(head->prev, struct dlm_rsb, res_subreslist);
11191 +static int lkbs_to_remaster_list(struct list_head *head)
11193 + struct dlm_lkb *lkb;
11195 + list_for_each_entry(lkb, head, lkb_statequeue) {
11196 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
11204 + * Used to decide if an rsb should be rebuilt on a new master. An rsb only
11205 + * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's are not
11209 +static int lkbs_to_remaster(struct dlm_rsb *r)
11211 + struct dlm_rsb *sub;
11213 + if (lkbs_to_remaster_list(&r->res_grantqueue))
11215 + if (lkbs_to_remaster_list(&r->res_convertqueue))
11217 + if (lkbs_to_remaster_list(&r->res_waitqueue))
11220 + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
11221 + if (lkbs_to_remaster_list(&sub->res_grantqueue))
11223 + if (lkbs_to_remaster_list(&sub->res_convertqueue))
11225 + if (lkbs_to_remaster_list(&sub->res_waitqueue))
11232 +static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp)
11235 + * See rsb_length()
11236 + * Total: 36 bytes (4 + 24 + 4 + 4)
11239 + put_bytes(rsb->res_name, rsb->res_length, buf, offp);
11240 + put_int(rsb->res_remasterid, buf, offp);
11242 + if (rsb->res_parent)
11243 + put_int(rsb->res_parent->res_remasterid, buf, offp);
11245 + put_int(0, buf, offp);
11247 + DLM_ASSERT(!rsb->res_lvbptr,);
11251 + * Flatten an LKB into a buffer for sending to the new RSB master. As a
11252 + * side-effect the nodeid of the lock is set to the nodeid of the new RSB
11256 +static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb,
11257 + rcom_fill_t *fill)
11259 + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
11262 + lkb->lkb_nodeid = r->res_nodeid;
11264 + put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
11265 + serialise_lkb(lkb, fill->outbuf, &fill->offset);
11268 + need_new_lkid(r);
11276 + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
11279 +static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue,
11280 + rcom_fill_t *fill)
11282 + struct dlm_lkb *lkb;
11285 + list_for_each_entry(lkb, queue, lkb_statequeue) {
11286 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
11289 + error = pack_one_lkb(r, lkb, fill);
11298 + fill->lkbqueue = queue;
11303 +static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill)
11307 + error = pack_lkb_queue(r, &r->res_grantqueue, fill);
11311 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
11315 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
11322 + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
11323 + * queue and full lkb queues.
11326 +static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill)
11328 + struct list_head *tmp, *start, *end;
11329 + struct dlm_lkb *lkb;
11333 + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
11336 + error = pack_one_lkb(r, fill->lkb, fill);
11340 + start = fill->lkb->lkb_statequeue.next;
11341 + end = fill->lkbqueue;
11343 + for (tmp = start; tmp != end; tmp = tmp->next) {
11344 + lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
11346 + error = pack_one_lkb(r, lkb, fill);
11354 + * Pack all lkb's on r's queues following fill->lkbqueue.
11357 + if (fill->lkbqueue == &r->res_waitqueue)
11359 + if (fill->lkbqueue == &r->res_convertqueue)
11362 + DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
11364 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
11368 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
11374 +static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb,
11375 + rcom_fill_t *fill)
11379 + down_write(&subrsb->res_lock);
11381 + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
11384 + subrsb->res_nodeid = rsb->res_nodeid;
11385 + subrsb->res_remasterid = ++fill->remasterid;
11387 + put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
11388 + serialise_rsb(subrsb, fill->outbuf, &fill->offset);
11390 + error = pack_lkb_queues(subrsb, fill);
11394 + up_write(&subrsb->res_lock);
11399 + up_write(&subrsb->res_lock);
11400 + fill->subrsb = subrsb;
11405 +static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb,
11406 + rcom_fill_t *fill)
11408 + struct dlm_rsb *subrsb;
11412 + * When an initial subrsb is given, we know it needs to be packed.
11413 + * When no initial subrsb is given, begin with the first (if any exist).
11416 + if (!in_subrsb) {
11417 + if (list_empty(&rsb->res_subreslist))
11420 + subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb,
11423 + subrsb = in_subrsb;
11426 + error = pack_one_subrsb(rsb, subrsb, fill);
11430 + if (last_in_list(subrsb, &rsb->res_subreslist))
11433 + subrsb = next_subrsb(subrsb);
11441 + * Finish packing whatever is left in an rsb tree. If space runs out while
11442 + * finishing, save subrsb/lkb and this will be called again for the same rsb.
11444 + * !subrsb && lkb, we left off part way through root rsb's lkbs.
11445 + * subrsb && !lkb, we left off just before starting a new subrsb.
11446 + * subrsb && lkb, we left off part way through a subrsb's lkbs.
11447 + * !subrsb && !lkb, we shouldn't be in this function, but starting
11448 + * a new rsb in pack_rsb_tree().
11451 +static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb,
11452 + rcom_fill_t *fill)
11454 + struct dlm_rsb *subrsb = NULL;
11457 + if (!fill->subrsb && fill->lkb) {
11458 + error = pack_lkb_remaining(rsb, fill);
11462 + error = pack_subrsbs(rsb, NULL, fill);
11467 + else if (fill->subrsb && !fill->lkb) {
11468 + error = pack_subrsbs(rsb, fill->subrsb, fill);
11473 + else if (fill->subrsb && fill->lkb) {
11474 + error = pack_lkb_remaining(fill->subrsb, fill);
11478 + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
11481 + subrsb = next_subrsb(fill->subrsb);
11483 + error = pack_subrsbs(rsb, subrsb, fill);
11488 + fill->subrsb = NULL;
11489 + fill->lkb = NULL;
11496 + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
11497 + * buffer. When the buffer runs out of space, save the place to restart (the
11498 + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
11501 +static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb,
11502 + rcom_fill_t *fill)
11504 + int error = -ENOSPC;
11506 + fill->remasterid = 0;
11509 + * Pack the root rsb itself. A 1 byte type precedes the serialised
11510 + * rsb. Then pack the lkb's for the root rsb.
11513 + down_write(&rsb->res_lock);
11515 + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
11518 + rsb->res_remasterid = ++fill->remasterid;
11519 + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
11520 + serialise_rsb(rsb, fill->outbuf, &fill->offset);
11522 + error = pack_lkb_queues(rsb, fill);
11526 + up_write(&rsb->res_lock);
11529 + * Pack subrsb/lkb's under the root rsb.
11532 + error = pack_subrsbs(rsb, NULL, fill);
11537 + up_write(&rsb->res_lock);
11542 + * Given an RSB, return the next RSB that should be sent to a new master.
11545 +static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls,
11546 + struct dlm_rsb *rsb)
11548 + struct list_head *tmp, *start, *end;
11549 + struct dlm_rsb *r;
11552 + start = ls->ls_rootres.next;
11554 + start = rsb->res_rootlist.next;
11556 + end = &ls->ls_rootres;
11558 + for (tmp = start; tmp != end; tmp = tmp->next) {
11559 + r = list_entry(tmp, struct dlm_rsb, res_rootlist);
11561 + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
11562 + if (r->res_nodeid && lkbs_to_remaster(r)) {
11563 + expect_new_lkids(r);
11566 + clear_bit(RESFL_NEW_MASTER, &r->res_flags);
11574 + * Given an rcom buffer, fill it with RSB's that need to be sent to a single
11575 + * new master node. In the case where all the data to send to one node
11576 + * requires multiple messages, this function needs to resume filling each
11577 + * successive buffer from the point where it left off when the previous buffer
11581 +static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill,
11582 + uint32_t *nodeid)
11584 + struct dlm_rsb *rsb, *prev_rsb = fill->rsb;
11587 + fill->offset = 0;
11592 + * The first time this function is called.
11595 + rsb = next_remastered_rsb(ls, NULL);
11599 + } else if (fill->subrsb || fill->lkb) {
11602 + * Continue packing an rsb tree that was partially packed last
11603 + * time (fill->subrsb/lkb indicates where packing of last block
11608 + *nodeid = rsb->res_nodeid;
11610 + error = pack_rsb_tree_remaining(ls, rsb, fill);
11611 + if (error == -ENOSPC)
11614 + rsb = next_remastered_rsb(ls, prev_rsb);
11618 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
11625 + * Pack rsb trees into the buffer until we run out of space, run out of
11626 + * new rsb's or hit a new nodeid.
11629 + *nodeid = rsb->res_nodeid;
11632 + error = pack_rsb_tree(ls, rsb, fill);
11633 + if (error == -ENOSPC)
11638 + rsb = next_remastered_rsb(ls, prev_rsb);
11642 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
11656 + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
11659 +int rebuild_rsbs_send(struct dlm_ls *ls)
11661 + struct dlm_rcom *rc;
11662 + rcom_fill_t fill;
11666 + DLM_ASSERT(recover_list_empty(ls),);
11668 + log_all(ls, "rebuild locks");
11671 + rc = allocate_rcom_buffer(ls);
11675 + down_read(&ls->ls_root_lock);
11678 + memset(&fill, 0, sizeof(rcom_fill_t));
11679 + fill.outbuf = rc->rc_buf;
11680 + fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
11683 + fill_rcom_buffer(ls, &fill, &nodeid);
11684 + if (!fill.offset)
11687 + rc->rc_datalen = fill.offset;
11688 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
11690 + up_read(&ls->ls_root_lock);
11695 + error = dlm_recovery_stopped(ls);
11697 + up_read(&ls->ls_root_lock);
11701 + while (fill.more);
11703 + up_read(&ls->ls_root_lock);
11705 + error = dlm_wait_function(ls, &recover_list_empty);
11707 + log_all(ls, "rebuilt %d locks", fill.count);
11710 + free_rcom_buffer(rc);
11716 +static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid,
11717 + struct dlm_rsb *rootrsb)
11719 + struct dlm_rsb *rsb;
11721 + DLM_ASSERT(rootrsb,);
11723 + if (rootrsb->res_remasterid == remasterid) {
11728 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11729 + if (rsb->res_remasterid == remasterid)
11739 + * Search a queue for the given remote lock id (remlkid).
11742 +static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid,
11745 + struct dlm_lkb *lkb;
11747 + list_for_each_entry(lkb, statequeue, lkb_statequeue) {
11748 + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
11757 + * Given a remote lock ID (and a parent resource), return the local LKB for it
11758 + * Hopefully we don't need to do this too often on deep lock trees. This is
11759 + * VERY suboptimal for anything but the smallest lock trees. It searches the
11760 + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
11761 + * returns the LKB address. OPTIMISATION: we should keep a list of these while
11762 + * we are building up the remastered LKBs
11765 +static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid,
11768 + struct dlm_lkb *lkb;
11769 + struct dlm_rsb *rsb;
11771 + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
11775 + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
11779 + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
11783 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11784 + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
11788 + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
11792 + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
11803 + * Unpack an LKB from a remaster operation
11806 +static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid,
11807 + struct dlm_rsb *rootrsb, char *buf, int *ptr,
11808 + char *outbuf, int *outoffp)
11810 + struct dlm_lkb *lkb, *exist_lkb = NULL;
11811 + struct dlm_rsb *rsb;
11812 + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
11814 + remote_lkid = get_int(buf, ptr);
11816 + rsb_rmid = get_int(buf, ptr);
11817 + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
11818 + DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
11821 + * We could have received this lkb already from a previous recovery
11822 + * that was interrupted. We still need to advance ptr so read in
11823 + * lkb and then release it. FIXME: verify this is valid.
11825 + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
11827 + log_all(ls, "lkb %x exists %s", remote_lkid, rsb->res_name);
11831 + lkb = create_lkb(ls);
11835 + lkb->lkb_remid = remote_lkid;
11836 + lkb->lkb_flags = get_int(buf, ptr);
11837 + status = get_int(buf, ptr);
11838 + lkb->lkb_rqmode = get_char(buf, ptr);
11839 + lkb->lkb_grmode = get_char(buf, ptr);
11840 + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
11842 + parentid = get_int(buf, ptr);
11843 + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
11844 + lkb->lkb_ownpid = get_int(buf, ptr);
11846 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11847 + lkb->lkb_lvbptr = allocate_lvb(ls);
11848 + if (!lkb->lkb_lvbptr)
11850 + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
11853 + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
11854 + uint64_t start, end;
11856 + /* Don't need to keep the range flag, for comms use only */
11857 + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
11858 + start = get_int64(buf, ptr);
11859 + end = get_int64(buf, ptr);
11861 + lkb->lkb_range = allocate_range(ls);
11862 + if (!lkb->lkb_range)
11865 + switch (status) {
11866 + case GDLM_LKSTS_CONVERT:
11867 + lkb->lkb_range[RQ_RANGE_START] = start;
11868 + lkb->lkb_range[RQ_RANGE_END] = end;
11869 + start = get_int64(buf, ptr);
11870 + end = get_int64(buf, ptr);
11871 + lkb->lkb_range[GR_RANGE_START] = start;
11872 + lkb->lkb_range[GR_RANGE_END] = end;
11874 + case GDLM_LKSTS_WAITING:
11875 + lkb->lkb_range[RQ_RANGE_START] = start;
11876 + lkb->lkb_range[RQ_RANGE_END] = end;
11879 + case GDLM_LKSTS_GRANTED:
11880 + lkb->lkb_range[GR_RANGE_START] = start;
11881 + lkb->lkb_range[GR_RANGE_END] = end;
11889 + /* verify lkb and exist_lkb values match? */
11890 + release_lkb(ls, lkb);
11895 + /* Resolve local lock LKB address from parent ID */
11897 + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
11900 + atomic_inc(&rsb->res_ref);
11901 + lkb->lkb_resource = rsb;
11903 + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11904 + lkb->lkb_nodeid = rem_nodeid;
11907 + * Put the lkb on an RSB queue. An lkb that's in the midst of a
11908 + * conversion request (on the requesting node's lockqueue and has
11909 + * LQCONVERT set) should be put on the granted queue. The convert
11910 + * request will be resent by the requesting node.
11913 + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11914 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
11915 + DLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11916 + printk("status=%d\n", status););
11917 + lkb->lkb_rqmode = DLM_LOCK_IV;
11918 + status = GDLM_LKSTS_GRANTED;
11921 + lkb_enqueue(rsb, lkb, status);
11924 + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11927 + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11928 + && lkb->lkb_grmode > DLM_LOCK_NL) {
11929 + if (!rsb->res_lvbptr)
11930 + rsb->res_lvbptr = allocate_lvb(ls);
11931 + if (!rsb->res_lvbptr)
11933 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11937 + * Clear flags that may have been sent over that are only relevant in
11938 + * the context of the sender.
11941 + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11942 + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
11945 + /* Return the new LKID to the caller's buffer */
11946 + put_int(lkb->lkb_id, outbuf, outoffp);
11947 + put_int(lkb->lkb_remid, outbuf, outoffp);
11954 +static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid,
11955 + struct dlm_rsb *rootrsb, char *buf,
11960 + int parent_remasterid;
11961 + char name[DLM_RESNAME_MAXLEN];
11963 + struct dlm_rsb *parent = NULL;
11964 + struct dlm_rsb *rsb;
11966 + get_bytes(name, &length, buf, ptr);
11967 + remasterid = get_int(buf, ptr);
11968 + parent_remasterid = get_int(buf, ptr);
11970 + if (parent_remasterid)
11971 + parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11974 + * The rsb reference from this find_or_create_rsb() will keep the rsb
11975 + * around while we add new lkb's to it from deserialise_lkb. Each of
11976 + * the lkb's will add an rsb reference. The reference added here is
11977 + * removed by release_rsb() after all lkb's are added.
11980 + error = find_rsb(ls, parent, name, length, CREATE, &rsb);
11981 + DLM_ASSERT(!error,);
11983 + set_bit(RESFL_MASTER, &rsb->res_flags);
11985 + /* There is a case where the above needs to create the RSB. */
11986 + if (rsb->res_nodeid == -1)
11987 + rsb->res_nodeid = our_nodeid();
11989 + rsb->res_remasterid = remasterid;
11995 + * Processing at the receiving end of a NEWLOCKS message from a node in
11996 + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11997 + * node whose locks we are now mastering. For a reply we need to send back the
11998 + * new lockids of the remastered locks so that remote ops can find them.
12001 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
12003 + struct dlm_rcom *rc;
12004 + struct dlm_rsb *rsb = NULL;
12005 + rebuild_node_t *rnode;
12007 + int outptr, ptr = 0, error = -ENOMEM;
12009 + rnode = find_rebuild_root(ls, nodeid);
12014 + * Allocate a buffer for the reply message which is a list of remote
12015 + * lock IDs and their (new) local lock ids. It will always be big
12016 + * enough to fit <n> ID pairs if it already fit <n> LKBs.
12019 + rc = allocate_rcom_buffer(ls);
12022 + outbuf = rc->rc_buf;
12026 + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
12027 + * created. Each deserialise_rsb adds an rsb reference that must be
12028 + * removed with release_rsb once all new lkb's for an rsb have been
12032 + while (ptr < len) {
12035 + type = get_char(buf, &ptr);
12038 + case REMASTER_ROOTRSB:
12040 + release_rsb(rsb);
12041 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
12043 + rnode->rootrsb = rsb;
12046 + case REMASTER_RSB:
12048 + release_rsb(rsb);
12049 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
12053 + case REMASTER_LKB:
12054 + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
12055 + outbuf, &outptr);
12059 + DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
12060 + "len=%d\n", type, nodeid, ptr,
12066 + release_rsb(rsb);
12069 + * Reply with the new lock IDs.
12072 + rc->rc_datalen = outptr;
12073 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
12075 + free_rcom_buffer(rc);
12082 + * Processing for a NEWLOCKIDS message. Called when we get the reply from the
12083 + * new master telling us what the new remote lock IDs are for the remastered
12087 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
12094 + while (offset < len) {
12097 + struct dlm_lkb *lkb;
12099 + if (offset + 8 > len) {
12100 + log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
12101 + "length nodeid=%d offset=%d len=%d",
12102 + nodeid, offset, len);
12106 + remote_id = get_int(buf, &offset);
12107 + local_id = get_int(buf, &offset);
12109 + lkb = find_lock_by_id(ls, local_id);
12111 + lkb->lkb_remid = remote_id;
12112 + have_new_lkid(lkb);
12114 + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
12115 + "nodeid=%d id=%x remid=%x offset=%d len=%d",
12116 + nodeid, local_id, remote_id, offset, len);
12120 + if (recover_list_empty(ls))
12121 + wake_up(&ls->ls_wait_general);
12125 diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
12126 --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
12127 +++ linux-patched/cluster/dlm/rebuild.h 2004-11-03 11:31:56.000000000 +0800
12129 +/******************************************************************************
12130 +*******************************************************************************
12132 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12133 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12135 +** This copyrighted material is made available to anyone wishing to use,
12136 +** modify, copy, or redistribute it subject to the terms and conditions
12137 +** of the GNU General Public License v.2.
12139 +*******************************************************************************
12140 +******************************************************************************/
12142 +#ifndef __REBUILD_DOT_H__
12143 +#define __REBUILD_DOT_H__
12145 +int rebuild_rsbs_send(struct dlm_ls *ls);
12146 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
12147 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
12148 +int rebuild_freemem(struct dlm_ls *ls);
12150 +#endif /* __REBUILD_DOT_H__ */
12151 diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
12152 --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
12153 +++ linux-patched/cluster/dlm/reccomms.c 2004-11-03 11:31:56.000000000 +0800
12155 +/******************************************************************************
12156 +*******************************************************************************
12158 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12159 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12161 +** This copyrighted material is made available to anyone wishing to use,
12162 +** modify, copy, or redistribute it subject to the terms and conditions
12163 +** of the GNU General Public License v.2.
12165 +*******************************************************************************
12166 +******************************************************************************/
12168 +#include "dlm_internal.h"
12169 +#include "lowcomms.h"
12170 +#include "midcomms.h"
12171 +#include "reccomms.h"
12172 +#include "nodes.h"
12173 +#include "lockspace.h"
12174 +#include "recover.h"
12176 +#include "config.h"
12177 +#include "rebuild.h"
12178 +#include "memory.h"
12180 +/* Running on the basis that only a single recovery communication will be done
12181 + * at a time per lockspace */
12183 +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc);
12185 +static int rcom_response(struct dlm_ls *ls)
12187 + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12191 + * rcom_send_message - send or request recovery data
12192 + * @ls: the lockspace
12193 + * @nodeid: node to which the message is sent
12194 + * @type: type of recovery message
12195 + * @rc: the rc buffer to send
12196 + * @need_reply: wait for reply if this is set
12198 + * Using this interface
12199 + * i) Allocate an rc buffer:
12200 + * rc = allocate_rcom_buffer(ls);
12201 + * ii) Copy data to send beginning at rc->rc_buf:
12202 + * memcpy(rc->rc_buf, mybuf, mylen);
12203 + * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
12204 + * rc->rc_datalen = mylen
12205 + * iv) Submit the rc to this function:
12206 + * rcom_send_message(rc);
12208 + * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct
12209 + * dlm_rcom). If more data must be passed in one send, use
12210 + * rcom_expand_buffer() which incrementally increases the size of the rc buffer
12211 + * by dlm_config.buffer_size bytes.
12213 + * Any data returned for the message (when need_reply is set) will be saved in
12214 + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
12215 + * number of bytes copied into rc->rc_buf.
12217 + * Returns: 0 on success, -EXXX on failure
12220 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
12221 + struct dlm_rcom *rc, int need_reply)
12225 + if (!rc->rc_datalen)
12226 + rc->rc_datalen = 1;
12229 + * Fill in the header.
12232 + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
12233 + rc->rc_header.rh_lockspace = ls->ls_global_id;
12234 + rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1;
12235 + rc->rc_subcmd = type;
12236 + rc->rc_msgid = ++ls->ls_rcom_msgid;
12239 + * When a reply is received, the reply data goes back into this buffer.
12240 + * Synchronous rcom requests (need_reply=1) are serialised because of
12241 + * the single ls_rcom.
12244 + if (need_reply) {
12245 + down(&ls->ls_rcom_lock);
12246 + ls->ls_rcom = rc;
12250 + * After sending the message we'll wait at the end of this function to
12251 + * get a reply. The READY flag will be set when the reply has been
12252 + * received and requested data has been copied into
12253 + * ls->ls_rcom->rc_buf;
12256 + DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
12259 + * The WAIT bit indicates that we're waiting for and willing to accept a
12260 + * reply. Any replies are ignored unless this bit is set.
12263 + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
12266 + * Process the message locally.
12269 + if (nodeid == our_nodeid()) {
12270 + rcom_process_message(ls, nodeid, rc);
12275 + * Send the message.
12278 + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
12280 + error = midcomms_send_message(nodeid, (struct dlm_header *) rc,
12282 + DLM_ASSERT(error >= 0, printk("error = %d\n", error););
12286 + * Wait for a reply. Once a reply is processed from midcomms, the
12287 + * READY bit will be set and we'll be awoken (dlm_wait_function will
12291 + if (need_reply) {
12292 + error = dlm_wait_function(ls, &rcom_response);
12294 + log_debug(ls, "rcom wait error %d", error);
12298 + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
12299 + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12302 + up(&ls->ls_rcom_lock);
12308 + * Runs in same context as midcomms.
12311 +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc)
12313 + struct dlm_rcom rc_stack;
12314 + struct dlm_rcom *reply = NULL;
12315 + int status, datalen, maxlen;
12316 + uint32_t r_nodeid, be_nodeid;
12321 + if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
12322 + log_error(ls, "ignoring recovery message %x from %u",
12323 + rc->rc_subcmd, nodeid);
12327 + switch (rc->rc_subcmd) {
12329 + case RECCOMM_STATUS:
12331 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12332 + reply = &rc_stack;
12334 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12335 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12336 + reply->rc_subcmd = rc->rc_subcmd;
12337 + reply->rc_msgid = rc->rc_msgid;
12338 + reply->rc_buf[0] = 0;
12340 + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
12341 + reply->rc_buf[0] |= RESDIR_VALID;
12343 + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
12344 + reply->rc_buf[0] |= RESDIR_ALL_VALID;
12346 + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
12347 + reply->rc_buf[0] |= NODES_VALID;
12349 + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
12350 + reply->rc_buf[0] |= NODES_ALL_VALID;
12352 + reply->rc_datalen = 1;
12353 + reply->rc_header.rh_length =
12354 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12356 + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
12359 + case RECCOMM_RECOVERNAMES:
12361 + reply = allocate_rcom_buffer(ls);
12362 + DLM_ASSERT(reply,);
12363 + maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
12365 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12366 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12367 + reply->rc_subcmd = rc->rc_subcmd;
12368 + reply->rc_msgid = rc->rc_msgid;
12371 + * The other node wants a bunch of resource names. The name of
12372 + * the resource to begin with is in rc->rc_buf.
12375 + datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
12376 + reply->rc_buf, maxlen, nodeid);
12378 + reply->rc_datalen = datalen;
12379 + reply->rc_header.rh_length =
12380 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12382 + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
12383 + reply->rc_msgid);
12386 + case RECCOMM_GETMASTER:
12388 + reply = allocate_rcom_buffer(ls);
12389 + DLM_ASSERT(reply,);
12391 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12392 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12393 + reply->rc_subcmd = rc->rc_subcmd;
12394 + reply->rc_msgid = rc->rc_msgid;
12397 + * The other node wants to know the master of a named resource.
12400 + status = dlm_dir_lookup(ls, nodeid, rc->rc_buf, rc->rc_datalen,
12402 + if (status != 0) {
12403 + log_all(ls, "rcom lookup error %d", status);
12404 + free_rcom_buffer(reply);
12408 + be_nodeid = cpu_to_be32(r_nodeid);
12409 + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
12410 + reply->rc_datalen = sizeof(uint32_t);
12411 + reply->rc_header.rh_length =
12412 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12415 + case RECCOMM_BULKLOOKUP:
12417 + reply = allocate_rcom_buffer(ls);
12418 + DLM_ASSERT(reply,);
12420 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12421 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12422 + reply->rc_subcmd = rc->rc_subcmd;
12423 + reply->rc_msgid = rc->rc_msgid;
12426 + * This is a bulk version of the above and just returns a
12427 + * buffer full of node ids to match the resources
12430 + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
12431 + rc->rc_datalen, reply->rc_buf);
12432 + if (datalen < 0) {
12433 + free_rcom_buffer(reply);
12438 + reply->rc_datalen = datalen;
12439 + reply->rc_header.rh_length =
12440 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12444 + * These RECCOMM messages don't need replies.
12447 + case RECCOMM_NEWLOCKS:
12448 + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12451 + case RECCOMM_NEWLOCKIDS:
12452 + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12455 + case RECCOMM_REMRESDATA:
12456 + dlm_dir_remove(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12460 + DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
12464 + if (nodeid == our_nodeid()) {
12465 + DLM_ASSERT(rc == ls->ls_rcom,);
12466 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
12467 + rc->rc_datalen = reply->rc_datalen;
12469 + midcomms_send_message(nodeid,
12470 + (struct dlm_header *) reply,
12474 + if (reply != &rc_stack)
12475 + free_rcom_buffer(reply);
12479 +static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid,
12480 + struct dlm_rcom *reply)
12482 + struct dlm_rcom *rc = ls->ls_rcom;
12484 + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
12485 + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
12489 + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
12490 + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
12491 + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
12495 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
12496 + rc->rc_datalen = reply->rc_datalen;
12499 + * Tell the thread waiting in rcom_send_message() that it can go ahead.
12502 + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12503 + wake_up(&ls->ls_wait_general);
12506 +static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid,
12507 + struct dlm_rcom *reply)
12509 + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
12510 + reply->rc_msgid);
12514 + * Runs in same context as midcomms.
12517 +static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid,
12518 + struct dlm_rcom *reply)
12520 + if (dlm_recovery_stopped(ls)) {
12521 + log_error(ls, "ignoring recovery reply %x from %u",
12522 + reply->rc_subcmd, nodeid);
12526 + switch (reply->rc_subcmd) {
12527 + case RECCOMM_GETMASTER:
12528 + process_reply_async(ls, nodeid, reply);
12530 + case RECCOMM_STATUS:
12531 + case RECCOMM_NEWLOCKS:
12532 + case RECCOMM_NEWLOCKIDS:
12533 + case RECCOMM_RECOVERNAMES:
12534 + process_reply_sync(ls, nodeid, reply);
12537 + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
12538 + reply->rc_subcmd, nodeid);
12543 +static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header)
12545 + struct writequeue_entry *wq;
12546 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
12547 + struct dlm_rcom *reply;
12549 + wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL,
12550 + (char **)&reply);
12554 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12555 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12556 + reply->rc_subcmd = rc->rc_subcmd;
12557 + reply->rc_msgid = rc->rc_msgid;
12558 + reply->rc_buf[0] = 0;
12560 + reply->rc_datalen = 1;
12561 + reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12563 + midcomms_send_buffer((struct dlm_header *)reply, wq);
12569 + * Runs in same context as midcomms. Both recovery requests and recovery
12570 + * replies come through this function.
12573 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header)
12575 + struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace);
12576 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
12578 + /* If the lockspace doesn't exist then still send a status message
12579 + back; it's possible that it just doesn't have its global_id yet. */
12582 + send_ls_not_ready(nodeid, header);
12586 + switch (header->rh_cmd) {
12587 + case GDLM_REMCMD_RECOVERMESSAGE:
12588 + rcom_process_message(ls, nodeid, rc);
12591 + case GDLM_REMCMD_RECOVERREPLY:
12592 + rcom_process_reply(ls, nodeid, rc);
12596 + DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
12599 + put_lockspace(ls);
12602 diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
12603 --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
12604 +++ linux-patched/cluster/dlm/reccomms.h 2004-11-03 11:31:56.000000000 +0800
12606 +/******************************************************************************
12607 +*******************************************************************************
12609 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12610 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12612 +** This copyrighted material is made available to anyone wishing to use,
12613 +** modify, copy, or redistribute it subject to the terms and conditions
12614 +** of the GNU General Public License v.2.
12616 +*******************************************************************************
12617 +******************************************************************************/
12619 +#ifndef __RECCOMMS_DOT_H__
12620 +#define __RECCOMMS_DOT_H__
12624 +#define RESDIR_VALID (1)
12625 +#define RESDIR_ALL_VALID (2)
12626 +#define NODES_VALID (4)
12627 +#define NODES_ALL_VALID (8)
12629 +#define RECCOMM_STATUS (1)
12630 +#define RECCOMM_RECOVERNAMES (2)
12631 +#define RECCOMM_GETMASTER (3)
12632 +#define RECCOMM_BULKLOOKUP (4)
12633 +#define RECCOMM_NEWLOCKS (5)
12634 +#define RECCOMM_NEWLOCKIDS (6)
12635 +#define RECCOMM_REMRESDATA (7)
12637 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
12638 + struct dlm_rcom *rc, int need_reply);
12639 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header);
12642 diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
12643 --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
12644 +++ linux-patched/cluster/dlm/recover.c 2004-11-03 11:31:56.000000000 +0800
12646 +/******************************************************************************
12647 +*******************************************************************************
12649 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12650 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12652 +** This copyrighted material is made available to anyone wishing to use,
12653 +** modify, copy, or redistribute it subject to the terms and conditions
12654 +** of the GNU General Public License v.2.
12656 +*******************************************************************************
12657 +******************************************************************************/
12659 +#include "dlm_internal.h"
12660 +#include "reccomms.h"
12662 +#include "locking.h"
12664 +#include "lockspace.h"
12666 +#include "nodes.h"
12667 +#include "config.h"
12669 +#include "memory.h"
12672 + * Called in recovery routines to check whether the recovery process has been
12673 + * interrupted/stopped by another transition. A recovery in-process will abort
12674 + * if the lockspace is "stopped" so that a new recovery process can start from
12675 + * the beginning when the lockspace is "started" again.
12678 +int dlm_recovery_stopped(struct dlm_ls *ls)
12680 + return test_bit(LSFL_LS_STOP, &ls->ls_flags);
12683 +static void dlm_wait_timer_fn(unsigned long data)
12685 + struct dlm_ls *ls = (struct dlm_ls *) data;
12687 + wake_up(&ls->ls_wait_general);
12691 + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
12692 + * set due to failure of a node in ls_nodes). When another function thinks it
12693 + * could have completed the waited-on task, they should wake up ls_wait_general
12694 + * to get an immediate response rather than waiting for the timer to detect the
12695 + * result. A timer wakes us up periodically while waiting to see if we should
12696 + * abort due to a node failure.
12699 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
12701 + struct timer_list timer;
12704 + init_timer(&timer);
12705 + timer.function = dlm_wait_timer_fn;
12706 + timer.data = (long) ls;
12709 + mod_timer(&timer, jiffies + (dlm_config.recover_timer * HZ));
12711 + wchan_cond_sleep_intr(ls->ls_wait_general,
12713 + !test_bit(LSFL_LS_STOP, &ls->ls_flags));
12715 + if (timer_pending(&timer))
12716 + del_timer(&timer);
12721 + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
12730 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status)
12732 + struct dlm_rcom rc_stack, *rc;
12733 + struct dlm_csb *csb;
12737 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12739 + rc->rc_datalen = 0;
12741 + list_for_each_entry(csb, &ls->ls_nodes, list) {
12743 + error = dlm_recovery_stopped(ls);
12747 + error = rcom_send_message(ls, csb->node->nodeid,
12748 + RECCOMM_STATUS, rc, 1);
12752 + status = rc->rc_buf[0];
12753 + if (status & wait_status)
12756 + set_current_state(TASK_INTERRUPTIBLE);
12757 + schedule_timeout(HZ >> 1);
12766 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status)
12768 + struct dlm_rcom rc_stack, *rc;
12769 + uint32_t nodeid = ls->ls_low_nodeid;
12773 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12775 + rc->rc_datalen = 0;
12778 + error = dlm_recovery_stopped(ls);
12782 + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
12786 + status = rc->rc_buf[0];
12787 + if (status & wait_status)
12790 + set_current_state(TASK_INTERRUPTIBLE);
12791 + schedule_timeout(HZ >> 1);
12799 +static int purge_queue(struct dlm_ls *ls, struct list_head *queue)
12801 + struct dlm_lkb *lkb, *safe;
12802 + struct dlm_rsb *rsb;
12805 + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
12806 + if (!lkb->lkb_nodeid)
12809 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
12811 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
12812 + list_del(&lkb->lkb_statequeue);
12814 + rsb = lkb->lkb_resource;
12815 + lkb->lkb_status = 0;
12817 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
12818 + && &lkb->lkb_duetime)
12819 + remove_from_deadlockqueue(lkb);
12821 + release_lkb(ls, lkb);
12822 + release_rsb_locked(rsb);
12831 + * Go through local restbl and for each rsb we're master of, clear out any
12832 + * lkb's held by departed nodes.
12835 +int restbl_lkb_purge(struct dlm_ls *ls)
12837 + struct list_head *tmp2, *safe2;
12839 + struct dlm_rsb *rootrsb, *safe, *rsb;
12841 + log_all(ls, "purge locks of departed nodes");
12842 + down_write(&ls->ls_root_lock);
12844 + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
12846 + if (rootrsb->res_nodeid)
12849 + hold_rsb(rootrsb);
12850 + down_write(&rootrsb->res_lock);
12852 + /* This traverses the subreslist in reverse order so we purge
12853 + * the children before their parents. */
12855 + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12856 + tmp2 != &rootrsb->res_subreslist;
12857 + tmp2 = safe2, safe2 = safe2->prev) {
12858 + rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist);
12861 + purge_queue(ls, &rsb->res_grantqueue);
12862 + purge_queue(ls, &rsb->res_convertqueue);
12863 + purge_queue(ls, &rsb->res_waitqueue);
12864 + release_rsb_locked(rsb);
12866 + count += purge_queue(ls, &rootrsb->res_grantqueue);
12867 + count += purge_queue(ls, &rootrsb->res_convertqueue);
12868 + count += purge_queue(ls, &rootrsb->res_waitqueue);
12870 + up_write(&rootrsb->res_lock);
12871 + release_rsb_locked(rootrsb);
12874 + up_write(&ls->ls_root_lock);
12875 + log_all(ls, "purged %d locks", count);
12881 + * Grant any locks that have become grantable after a purge
12884 +int restbl_grant_after_purge(struct dlm_ls *ls)
12886 + struct dlm_rsb *root, *rsb, *safe;
12889 + down_read(&ls->ls_root_lock);
12891 + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12892 + /* only the rsb master grants locks */
12893 + if (root->res_nodeid)
12896 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12897 + log_debug(ls, "restbl_grant_after_purge aborted");
12899 + up_read(&ls->ls_root_lock);
12903 + down_write(&root->res_lock);
12904 + grant_pending_locks(root);
12905 + up_write(&root->res_lock);
12907 + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12908 + down_write(&rsb->res_lock);
12909 + grant_pending_locks(rsb);
12910 + up_write(&rsb->res_lock);
12913 + up_read(&ls->ls_root_lock);
12920 + * Set the lock master for all LKBs in a lock queue
12923 +static void set_lock_master(struct list_head *queue, int nodeid)
12925 + struct dlm_lkb *lkb;
12927 + list_for_each_entry(lkb, queue, lkb_statequeue) {
12928 + /* Don't muck around with pre-existing sublocks */
12929 + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12930 + lkb->lkb_nodeid = nodeid;
12934 +static void set_master_lkbs(struct dlm_rsb *rsb)
12936 + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12937 + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12938 + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12942 + * This rsb struct is now the master so it is responsible for keeping the
12943 + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to
12944 + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
12945 + * this rsb in deserialise_lkb.
12948 +static void set_rsb_lvb(struct dlm_rsb *rsb)
12950 + struct dlm_lkb *lkb;
12952 + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12954 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12955 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12956 + (lkb->lkb_grmode > DLM_LOCK_NL))
12958 + if (!rsb->res_lvbptr)
12959 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12961 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12966 + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12968 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12969 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12970 + (lkb->lkb_grmode > DLM_LOCK_NL))
12972 + if (!rsb->res_lvbptr)
12973 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12975 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12982 + * Propagate the new master nodeid to locks, subrsbs, sublocks.
12983 + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12986 +static void set_new_master(struct dlm_rsb *rsb, uint32_t nodeid)
12988 + struct dlm_rsb *subrsb;
12990 + down_write(&rsb->res_lock);
12992 + if (nodeid == our_nodeid()) {
12993 + set_bit(RESFL_MASTER, &rsb->res_flags);
12994 + rsb->res_nodeid = 0;
12995 + set_rsb_lvb(rsb);
12997 + rsb->res_nodeid = nodeid;
12999 + set_master_lkbs(rsb);
13001 + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
13002 + subrsb->res_nodeid = rsb->res_nodeid;
13003 + set_master_lkbs(subrsb);
13006 + up_write(&rsb->res_lock);
13008 + set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
13012 + * The recover_list contains all the rsb's for which we've requested the new
13013 + * master nodeid. As replies are returned from the resource directories the
13014 + * rsb's are removed from the list. When the list is empty we're done.
13016 + * The recover_list is later similarly used for all rsb's for which we've sent
13017 + * new lkb's and need to receive new corresponding lkid's.
13020 +int recover_list_empty(struct dlm_ls *ls)
13024 + spin_lock(&ls->ls_recover_list_lock);
13025 + empty = list_empty(&ls->ls_recover_list);
13026 + spin_unlock(&ls->ls_recover_list_lock);
13031 +int recover_list_count(struct dlm_ls *ls)
13035 + spin_lock(&ls->ls_recover_list_lock);
13036 + count = ls->ls_recover_list_count;
13037 + spin_unlock(&ls->ls_recover_list_lock);
13042 +void recover_list_add(struct dlm_rsb *rsb)
13044 + struct dlm_ls *ls = rsb->res_ls;
13046 + spin_lock(&ls->ls_recover_list_lock);
13047 + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
13048 + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
13049 + ls->ls_recover_list_count++;
13052 + spin_unlock(&ls->ls_recover_list_lock);
13055 +void recover_list_del(struct dlm_rsb *rsb)
13057 + struct dlm_ls *ls = rsb->res_ls;
13059 + spin_lock(&ls->ls_recover_list_lock);
13060 + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
13061 + list_del(&rsb->res_recover_list);
13062 + ls->ls_recover_list_count--;
13063 + spin_unlock(&ls->ls_recover_list_lock);
13065 + release_rsb(rsb);
13068 +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid)
13070 + struct dlm_rsb *rsb = NULL;
13072 + spin_lock(&ls->ls_recover_list_lock);
13074 + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
13075 + if (rsb->res_recover_msgid == msgid)
13081 + spin_unlock(&ls->ls_recover_list_lock);
13085 +static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc)
13087 + struct dlm_ls *ls = rsb->res_ls;
13088 + uint32_t dir_nodeid, r_nodeid;
13091 + dir_nodeid = get_directory_nodeid(rsb);
13093 + if (dir_nodeid == our_nodeid()) {
13094 + error = dlm_dir_lookup(ls, dir_nodeid, rsb->res_name,
13095 + rsb->res_length, &r_nodeid);
13096 + if (error == -EEXIST) {
13097 + log_all(ls, "rsb_master_lookup %u EEXIST %s",
13098 + r_nodeid, rsb->res_name);
13099 + } else if (error)
13102 + set_new_master(rsb, r_nodeid);
13104 + /* As we are the only thread doing recovery this
13105 + should be safe. If not then we need to use a different
13106 + ID somehow. We must set it in the RSB before rcom_send_msg
13107 + completes because we may get a reply quite quickly.
13109 + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
13111 + recover_list_add(rsb);
13113 + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
13114 + rc->rc_datalen = rsb->res_length;
13116 + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
13126 +static int needs_update(struct dlm_ls *ls, struct dlm_rsb *r)
13128 + if (!r->res_nodeid)
13131 + if (r->res_nodeid == -1)
13134 + if (in_nodes_gone(ls, r->res_nodeid))
13141 + * Go through local root resources and for each rsb which has a master which
13142 + * has departed, get the new master nodeid from the resdir. The resdir will
13143 + * assign mastery to the first node to look up the new master. That means
13144 + * we'll discover in this lookup if we're the new master of any rsb's.
13146 + * We fire off all the resdir requests individually and asynchronously to the
13147 + * correct resdir node. The replies are processed in rsb_master_recv().
13150 +int restbl_rsb_update(struct dlm_ls *ls)
13152 + struct dlm_rsb *rsb, *safe;
13153 + struct dlm_rcom *rc;
13154 + int error = -ENOMEM;
13157 + log_all(ls, "update remastered resources");
13159 + rc = allocate_rcom_buffer(ls);
13163 + down_read(&ls->ls_root_lock);
13165 + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
13166 + error = dlm_recovery_stopped(ls);
13168 + up_read(&ls->ls_root_lock);
13172 + if (needs_update(ls, rsb)) {
13173 + error = rsb_master_lookup(rsb, rc);
13175 + up_read(&ls->ls_root_lock);
13181 + up_read(&ls->ls_root_lock);
13183 + error = dlm_wait_function(ls, &recover_list_empty);
13185 + log_all(ls, "updated %d resources", count);
13187 + free_rcom_buffer(rc);
13192 +int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf,
13193 + int length, int msgid)
13195 + struct dlm_rsb *rsb;
13196 + uint32_t be_nodeid;
13198 + rsb = recover_list_find(ls, msgid);
13200 + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
13204 + memcpy(&be_nodeid, buf, sizeof(uint32_t));
13205 + set_new_master(rsb, be32_to_cpu(be_nodeid));
13206 + recover_list_del(rsb);
13208 + if (recover_list_empty(ls))
13209 + wake_up(&ls->ls_wait_general);
13216 + * This function is no longer used.
13219 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
13222 + char *inbufptr, *outbufptr;
13225 + * The other node wants nodeids matching the resource names in inbuf.
13226 + * The resource names are packed into inbuf as
13227 + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
13228 + * lenX bytes. Matching nodeids are packed into outbuf in order
13229 + * [nodeid1][nodeid2]...
13232 + inbufptr = inbuf;
13233 + outbufptr = outbuf;
13235 + while (inbufptr < inbuf + inlen) {
13236 + uint32_t r_nodeid, be_nodeid;
13239 + status = dlm_dir_lookup(ls, nodeid, inbufptr + 1, *inbufptr,
13244 + inbufptr += *inbufptr + 1;
13246 + be_nodeid = cpu_to_be32(r_nodeid);
13247 + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
13248 + outbufptr += sizeof(uint32_t);
13250 + /* add assertion that outbufptr - outbuf is not > than ... */
13253 + return (outbufptr - outbuf);
13257 diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
13258 --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
13259 +++ linux-patched/cluster/dlm/recover.h 2004-11-03 11:31:56.000000000 +0800
13261 +/******************************************************************************
13262 +*******************************************************************************
13264 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13265 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13267 +** This copyrighted material is made available to anyone wishing to use,
13268 +** modify, copy, or redistribute it subject to the terms and conditions
13269 +** of the GNU General Public License v.2.
13271 +*******************************************************************************
13272 +******************************************************************************/
13274 +#ifndef __RECOVER_DOT_H__
13275 +#define __RECOVER_DOT_H__
13277 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls));
13278 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status);
13279 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status);
13280 +int dlm_recovery_stopped(struct dlm_ls *ls);
13281 +int recover_list_empty(struct dlm_ls *ls);
13282 +int recover_list_count(struct dlm_ls *ls);
13283 +void recover_list_add(struct dlm_rsb *rsb);
13284 +void recover_list_del(struct dlm_rsb *rsb);
13285 +int restbl_lkb_purge(struct dlm_ls *ls);
13286 +void restbl_grant_after_purge(struct dlm_ls *ls);
13287 +int restbl_rsb_update(struct dlm_ls *ls);
13288 +int restbl_rsb_update_recv(struct dlm_ls *ls, int nodeid, char *buf, int len,
13290 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
13293 +#endif /* __RECOVER_DOT_H__ */
13294 diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
13295 --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
13296 +++ linux-patched/cluster/dlm/recoverd.c 2004-11-03 11:31:56.000000000 +0800
13298 +/******************************************************************************
13299 +*******************************************************************************
13301 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13302 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13304 +** This copyrighted material is made available to anyone wishing to use,
13305 +** modify, copy, or redistribute it subject to the terms and conditions
13306 +** of the GNU General Public License v.2.
13308 +*******************************************************************************
13309 +******************************************************************************/
13311 +#include "dlm_internal.h"
13312 +#include "nodes.h"
13315 +#include "recover.h"
13316 +#include "lockspace.h"
13317 +#include "lowcomms.h"
13318 +#include "lockqueue.h"
13320 +#include "rebuild.h"
13323 + * next_move actions
13326 +#define DO_STOP (1)
13327 +#define DO_START (2)
13328 +#define DO_FINISH (3)
13329 +#define DO_FINISH_STOP (4)
13330 +#define DO_FINISH_START (5)
13333 + * Queue of lockspaces (dlm_recover structs) which need to be
13334 + * started/recovered
13337 +static int enable_locking(struct dlm_ls *ls, int event_id)
13341 + spin_lock(&ls->ls_recover_lock);
13342 + if (ls->ls_last_stop < event_id) {
13343 + set_bit(LSFL_LS_RUN, &ls->ls_flags);
13344 + up_write(&ls->ls_in_recovery);
13347 + log_debug(ls, "enable_locking: abort %d", event_id);
13349 + spin_unlock(&ls->ls_recover_lock);
13353 +static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv)
13357 + log_all(ls, "recover event %u (first)", rv->event_id);
13359 + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
13361 + error = ls_nodes_init(ls, rv);
13363 + log_error(ls, "nodes_init failed %d", error);
13367 + error = dlm_dir_rebuild_local(ls);
13369 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
13373 + error = dlm_dir_rebuild_wait(ls);
13375 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
13379 + log_all(ls, "recover event %u done", rv->event_id);
13380 + kcl_start_done(ls->ls_local_id, rv->event_id);
13387 + * We are given here a new group of nodes which are in the lockspace. We first
13388 + * figure out the differences in ls membership from when we were last running.
13389 + * If nodes from before are gone, then there will be some lock recovery to do.
13390 + * If there are only nodes which have joined, then there's no lock recovery.
13392 + * note: cman requires an rc to finish starting on an revent (where nodes die)
13393 + * before it allows an sevent (where nodes join) to be processed. This means
13394 + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
13398 +static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv)
13400 + int error, neg = 0;
13402 + log_all(ls, "recover event %u", rv->event_id);
13405 + * this list may be left over from a previous aborted recovery
13408 + rebuild_freemem(ls);
13411 + * Add or remove nodes from the lockspace's ls_nodes list.
13414 + error = ls_nodes_reconfig(ls, rv, &neg);
13416 + log_error(ls, "nodes_reconfig failed %d", error);
13421 + * Rebuild our own share of the resdir by collecting from all other
13422 + * nodes rsb name/master pairs for which the name hashes to us.
13425 + error = dlm_dir_rebuild_local(ls);
13427 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
13432 + * Purge resdir-related requests that are being held in requestqueue.
13433 + * All resdir requests from before recovery started are invalid now due
13434 + * to the resdir rebuild and will be resent by the requesting nodes.
13437 + purge_requestqueue(ls);
13438 + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
13441 + * Wait for all nodes to complete resdir rebuild.
13444 + error = dlm_dir_rebuild_wait(ls);
13446 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
13451 + * Mark our own lkb's waiting in the lockqueue for remote replies from
13452 + * nodes that are now departed. These will be resent to the new
13453 + * masters in resend_cluster_requests. Also mark resdir lookup
13454 + * requests for resending.
13457 + lockqueue_lkb_mark(ls);
13459 + error = dlm_recovery_stopped(ls);
13465 + * Clear lkb's for departed nodes. This can't fail since it
13466 + * doesn't involve communicating with other nodes.
13469 + restbl_lkb_purge(ls);
13472 + * Get new master id's for rsb's of departed nodes. This fails
13473 + * if we can't communicate with other nodes.
13476 + error = restbl_rsb_update(ls);
13478 + log_error(ls, "restbl_rsb_update failed %d", error);
13483 + * Send our lkb info to new masters. This fails if we can't
13484 + * communicate with a node.
13487 + error = rebuild_rsbs_send(ls);
13489 + log_error(ls, "rebuild_rsbs_send failed %d", error);
13494 + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
13496 + log_all(ls, "recover event %u done", rv->event_id);
13497 + kcl_start_done(ls->ls_local_id, rv->event_id);
13501 + log_all(ls, "recover event %d error %d", rv->event_id, error);
13505 +static void clear_finished_nodes(struct dlm_ls *ls, int finish_event)
13507 + struct dlm_csb *csb, *safe;
13509 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) {
13510 + if (csb->gone_event <= finish_event) {
13511 + list_del(&csb->list);
13512 + release_csb(csb);
13518 + * Between calls to this routine for a ls, there can be multiple stop/start
13519 + * events from cman where every start but the latest is cancelled by stops.
13520 + * There can only be a single finish from cman because every finish requires us
13521 + * to call start_done. A single finish event could be followed by multiple
13522 + * stop/start events. This routine takes any combination of events from cman
13523 + * and boils them down to one course of action.
13526 +static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out,
13529 + LIST_HEAD(events);
13530 + unsigned int cmd = 0, stop, start, finish;
13531 + unsigned int last_stop, last_start, last_finish;
13532 + struct dlm_recover *rv = NULL, *start_rv = NULL;
13535 + * Grab the current state of cman/sm events.
13538 + spin_lock(&ls->ls_recover_lock);
13540 + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
13541 + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
13542 + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
13544 + last_stop = ls->ls_last_stop;
13545 + last_start = ls->ls_last_start;
13546 + last_finish = ls->ls_last_finish;
13548 + while (!list_empty(&ls->ls_recover)) {
13549 + rv = list_entry(ls->ls_recover.next, struct dlm_recover, list);
13550 + list_del(&rv->list);
13551 + list_add_tail(&rv->list, &events);
13555 + * There are two cases where we need to adjust these event values:
13556 + * 1. - we get a first start
13557 + * - we get a stop
13558 + * - we process the start + stop here and notice this special case
13560 + * 2. - we get a first start
13561 + * - we process the start
13562 + * - we get a stop
13563 + * - we process the stop here and notice this special case
13565 + * In both cases, the first start we received was aborted by a
13566 + * stop before we received a finish. last_finish being zero is the
13567 + * indication that this is the "first" start, i.e. we've not yet
13568 + * finished a start; if we had, last_finish would be non-zero.
13569 + * Part of the problem arises from the fact that when we initially
13570 + * get start/stop/start, SM uses the same event id for both starts
13571 + * (since the first was cancelled).
13573 + * In both cases, last_start and last_stop will be equal.
13574 + * In both cases, finish=0.
13575 + * In the first case start=1 && stop=1.
13576 + * In the second case start=0 && stop=1.
13578 + * In both cases, we need to make adjustments to values so:
13579 + * - we process the current event (now) as a normal stop
13580 + * - the next start we receive will be processed normally
13581 + * (taking into account the assertions below)
13583 + * In the first case, dlm_ls_start() will have printed the
13584 + * "repeated start" warning.
13586 + * In the first case we need to get rid of the recover event struct.
13588 + * - set stop=1, start=0, finish=0 for case 4 below
13589 + * - last_stop and last_start must be set equal per the case 4 assert
13590 + * - ls_last_stop = 0 so the next start will be larger
13591 + * - ls_last_start = 0 not really necessary (avoids dlm_ls_start print)
13594 + if (!last_finish && (last_start == last_stop)) {
13595 + log_all(ls, "move reset %u,%u,%u ids %u,%u,%u", stop,
13596 + start, finish, last_stop, last_start, last_finish);
13602 + ls->ls_last_stop = 0;
13603 + ls->ls_last_start = 0;
13605 + while (!list_empty(&events)) {
13606 + rv = list_entry(events.next, struct dlm_recover, list);
13607 + list_del(&rv->list);
13608 + kfree(rv->nodeids);
13612 + spin_unlock(&ls->ls_recover_lock);
13614 + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
13615 + last_stop, last_start, last_finish);
13618 + * Toss start events which have since been cancelled.
13621 + while (!list_empty(&events)) {
13622 + DLM_ASSERT(start,);
13623 + rv = list_entry(events.next, struct dlm_recover, list);
13624 + list_del(&rv->list);
13626 + if (rv->event_id <= last_stop) {
13627 + log_debug(ls, "move skip event %u", rv->event_id);
13628 + kfree(rv->nodeids);
13632 + log_debug(ls, "move use event %u", rv->event_id);
13633 + DLM_ASSERT(!start_rv,);
13639 + * Eight possible combinations of events.
13643 + if (!stop && !start && !finish) {
13644 + DLM_ASSERT(!start_rv,);
13650 + if (!stop && !start && finish) {
13651 + DLM_ASSERT(!start_rv,);
13652 + DLM_ASSERT(last_start > last_stop,);
13653 + DLM_ASSERT(last_finish == last_start,);
13655 + *finish_out = last_finish;
13660 + if (!stop && start && !finish) {
13661 + DLM_ASSERT(start_rv,);
13662 + DLM_ASSERT(last_start > last_stop,);
13664 + *rv_out = start_rv;
13669 + if (!stop && start && finish) {
13670 + DLM_ASSERT(0, printk("finish and start with no stop\n"););
13674 + if (stop && !start && !finish) {
13675 + DLM_ASSERT(!start_rv,);
13676 + DLM_ASSERT(last_start == last_stop,);
13682 + if (stop && !start && finish) {
13683 + DLM_ASSERT(!start_rv,);
13684 + DLM_ASSERT(last_finish == last_start,);
13685 + DLM_ASSERT(last_stop == last_start,);
13686 + cmd = DO_FINISH_STOP;
13687 + *finish_out = last_finish;
13692 + if (stop && start && !finish) {
13694 + DLM_ASSERT(last_start > last_stop,);
13696 + *rv_out = start_rv;
13698 + DLM_ASSERT(last_stop == last_start,);
13705 + if (stop && start && finish) {
13707 + DLM_ASSERT(last_start > last_stop,);
13708 + DLM_ASSERT(last_start > last_finish,);
13709 + cmd = DO_FINISH_START;
13710 + *finish_out = last_finish;
13711 + *rv_out = start_rv;
13713 + DLM_ASSERT(last_start == last_stop,);
13714 + DLM_ASSERT(last_start > last_finish,);
13715 + cmd = DO_FINISH_STOP;
13716 + *finish_out = last_finish;
13726 + * This function decides what to do given every combination of current
13727 + * lockspace state and next lockspace state.
13730 +static void do_ls_recovery(struct dlm_ls *ls)
13732 + struct dlm_recover *rv = NULL;
13733 + int error, cur_state, next_state = 0, do_now, finish_event = 0;
13735 + do_now = next_move(ls, &rv, &finish_event);
13739 + cur_state = ls->ls_state;
13742 + DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
13743 + log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
13746 + * LSST_CLEAR - we're not in any recovery state. We can get a stop or
13747 + * a stop and start which equates with a START.
13750 + if (cur_state == LSST_CLEAR) {
13751 + switch (do_now) {
13753 + next_state = LSST_WAIT_START;
13757 + error = ls_reconfig(ls, rv);
13759 + next_state = LSST_WAIT_START;
13761 + next_state = LSST_RECONFIG_DONE;
13764 + case DO_FINISH: /* invalid */
13765 + case DO_FINISH_STOP: /* invalid */
13766 + case DO_FINISH_START: /* invalid */
13774 + * LSST_WAIT_START - we're not running because of getting a stop or
13775 + * failing a start. We wait in this state for another stop/start or
13776 + * just the next start to begin another reconfig attempt.
13779 + if (cur_state == LSST_WAIT_START) {
13780 + switch (do_now) {
13785 + error = ls_reconfig(ls, rv);
13787 + next_state = LSST_WAIT_START;
13789 + next_state = LSST_RECONFIG_DONE;
13792 + case DO_FINISH: /* invalid */
13793 + case DO_FINISH_STOP: /* invalid */
13794 + case DO_FINISH_START: /* invalid */
13802 + * LSST_RECONFIG_DONE - we entered this state after successfully
13803 + * completing ls_reconfig and calling kcl_start_done. We expect to get
13804 + * a finish if everything goes ok. A finish could be followed by stop
13805 + * or stop/start before we get here to check it. Or a finish may never
13806 + * happen, only stop or stop/start.
13809 + if (cur_state == LSST_RECONFIG_DONE) {
13810 + switch (do_now) {
13812 + rebuild_freemem(ls);
13814 + clear_finished_nodes(ls, finish_event);
13815 + next_state = LSST_CLEAR;
13817 + error = enable_locking(ls, finish_event);
13821 + error = process_requestqueue(ls);
13825 + error = resend_cluster_requests(ls);
13829 + restbl_grant_after_purge(ls);
13831 + log_all(ls, "recover event %u finished", finish_event);
13835 + next_state = LSST_WAIT_START;
13838 + case DO_FINISH_STOP:
13839 + clear_finished_nodes(ls, finish_event);
13840 + next_state = LSST_WAIT_START;
13843 + case DO_FINISH_START:
13844 + clear_finished_nodes(ls, finish_event);
13845 + /* fall into DO_START */
13848 + error = ls_reconfig(ls, rv);
13850 + next_state = LSST_WAIT_START;
13852 + next_state = LSST_RECONFIG_DONE;
13862 + * LSST_INIT - state after ls is created and before it has been
13863 + * started. A start operation will cause the ls to be started for the
13864 + * first time. A failed start will cause it to just wait in INIT for
13865 + * another stop/start.
13868 + if (cur_state == LSST_INIT) {
13869 + switch (do_now) {
13871 + error = ls_first_start(ls, rv);
13873 + next_state = LSST_INIT_DONE;
13879 + case DO_FINISH: /* invalid */
13880 + case DO_FINISH_STOP: /* invalid */
13881 + case DO_FINISH_START: /* invalid */
13889 + * LSST_INIT_DONE - after the first start operation is completed
13890 + * successfully and kcl_start_done() called. If there are no errors, a
13891 + * finish will arrive next and we'll move to LSST_CLEAR.
13894 + if (cur_state == LSST_INIT_DONE) {
13895 + switch (do_now) {
13897 + case DO_FINISH_STOP:
13898 + next_state = LSST_WAIT_START;
13902 + case DO_FINISH_START:
13903 + error = ls_reconfig(ls, rv);
13905 + next_state = LSST_WAIT_START;
13907 + next_state = LSST_RECONFIG_DONE;
13911 + next_state = LSST_CLEAR;
13913 + enable_locking(ls, finish_event);
13915 + process_requestqueue(ls);
13917 + log_all(ls, "recover event %u finished", finish_event);
13928 + ls->ls_state = next_state;
13931 + kfree(rv->nodeids);
13936 +int dlm_recoverd(void *arg)
13938 + struct dlm_ls *ls = arg;
13940 + hold_lockspace(ls);
13943 + set_current_state(TASK_INTERRUPTIBLE);
13944 + if (!test_bit(LSFL_WORK, &ls->ls_flags))
13946 + set_current_state(TASK_RUNNING);
13948 + if (test_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags)) {
13949 + down(&ls->ls_recoverd_lock);
13950 + ls->ls_recoverd_task = NULL;
13951 + up(&ls->ls_recoverd_lock);
13955 + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) {
13956 + do_ls_recovery(ls);
13958 + down(&ls->ls_recoverd_lock);
13959 + if (ls->ls_state == LSST_CLEAR &&
13960 + !test_bit(LSFL_WORK, &ls->ls_flags)) {
13961 + ls->ls_recoverd_task = NULL;
13962 + up(&ls->ls_recoverd_lock);
13965 + up(&ls->ls_recoverd_lock);
13970 + put_lockspace(ls);
13974 +void dlm_recoverd_kick(struct dlm_ls *ls)
13976 + struct task_struct *p;
13978 + down(&ls->ls_recoverd_lock);
13979 + set_bit(LSFL_WORK, &ls->ls_flags);
13981 + if (!ls->ls_recoverd_task) {
13982 + p = kthread_run(dlm_recoverd, (void *) ls, 0, "dlm_recoverd");
13984 + log_error(ls, "can't start dlm_recoverd %ld",
13988 + ls->ls_recoverd_task = p;
13990 + wake_up_process(ls->ls_recoverd_task);
13992 + up(&ls->ls_recoverd_lock);
13995 +void dlm_recoverd_stop(struct dlm_ls *ls)
13997 + set_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags);
14000 + down(&ls->ls_recoverd_lock);
14001 + if (!ls->ls_recoverd_task) {
14002 + up(&ls->ls_recoverd_lock);
14005 + wake_up_process(ls->ls_recoverd_task);
14006 + up(&ls->ls_recoverd_lock);
14011 diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
14012 --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
14013 +++ linux-patched/cluster/dlm/recoverd.h 2004-11-03 11:31:56.000000000 +0800
14015 +/******************************************************************************
14016 +*******************************************************************************
14018 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14019 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14021 +** This copyrighted material is made available to anyone wishing to use,
14022 +** modify, copy, or redistribute it subject to the terms and conditions
14023 +** of the GNU General Public License v.2.
14025 +*******************************************************************************
14026 +******************************************************************************/
14028 +#ifndef __RECOVERD_DOT_H__
14029 +#define __RECOVERD_DOT_H__
14031 +int dlm_recoverd(void *arg);
14032 +void dlm_recoverd_kick(struct dlm_ls *ls);
14033 +void dlm_recoverd_stop(struct dlm_ls *ls);
14035 +#endif /* __RECOVERD_DOT_H__ */
14036 diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
14037 --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
14038 +++ linux-patched/cluster/dlm/rsb.c 2004-11-03 11:31:56.000000000 +0800
14040 +/******************************************************************************
14041 +*******************************************************************************
14043 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14044 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14046 +** This copyrighted material is made available to anyone wishing to use,
14047 +** modify, copy, or redistribute it subject to the terms and conditions
14048 +** of the GNU General Public License v.2.
14050 +*******************************************************************************
14051 +******************************************************************************/
14053 +#include "dlm_internal.h"
14054 +#include "locking.h"
14055 +#include "memory.h"
14056 +#include "lockqueue.h"
14057 +#include "nodes.h"
14062 +static struct dlm_rsb *search_hashchain(struct list_head *head,
14063 + struct dlm_rsb *parent,
14064 + char *name, int namelen)
14066 + struct dlm_rsb *r;
14068 + list_for_each_entry(r, head, res_hashchain) {
14069 + if ((parent == r->res_parent) && (namelen == r->res_length) &&
14070 + (memcmp(name, r->res_name, namelen) == 0)) {
14079 + * A way to arbitrarily hold onto an rsb to which we already hold a reference,
14080 + * to make sure it doesn't go away. Opposite of release_rsb().
14083 +void hold_rsb(struct dlm_rsb *r)
14085 + atomic_inc(&r->res_ref);
14089 + * release_rsb() - Decrement reference count on rsb struct. Free the rsb
14090 + * struct when there are zero references. Every lkb for the rsb adds a
14091 + * reference. When ref is zero there can be no more lkb's for the rsb, on the
14092 + * queue's or anywhere else.
14095 +static void _release_rsb(struct dlm_rsb *r, int locked)
14097 + struct dlm_ls *ls = r->res_ls;
14099 + int removed = FALSE;
14101 + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
14102 + if (atomic_dec_and_test(&r->res_ref)) {
14103 + DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r););
14104 + DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r););
14105 + DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r););
14107 + list_del(&r->res_hashchain);
14109 + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
14115 + down_write(&ls->ls_root_lock);
14116 + if (r->res_parent)
14117 + list_del(&r->res_subreslist);
14119 + list_del(&r->res_rootlist);
14121 + up_write(&ls->ls_root_lock);
14123 + if (r->res_parent || !test_bit(RESFL_MASTER, &r->res_flags))
14126 + nodeid = get_directory_nodeid(r);
14128 + if (nodeid != our_nodeid())
14129 + remote_remove_direntry(ls, nodeid, r->res_name, r->res_length);
14131 + dlm_dir_remove(ls, nodeid, r->res_name, r->res_length);
14133 + if (r->res_lvbptr)
14134 + free_lvb(r->res_lvbptr);
14139 +void release_rsb(struct dlm_rsb *r)
14141 + _release_rsb(r, 0);
14144 +void release_rsb_locked(struct dlm_rsb *r)
14146 + _release_rsb(r, 1);
14149 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb)
14151 + struct dlm_rsb *r = lkb->lkb_resource;
14156 + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
14157 + * If the rsb exists, its ref count is incremented by this function. If it
14158 + * doesn't exist, it's created with a ref count of one.
14161 +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, int len,
14162 + int flags, struct dlm_rsb **rp)
14165 + struct dlm_rsb *r, *tmp;
14166 + int error = -ENOMEM;
14168 + DLM_ASSERT(len <= DLM_RESNAME_MAXLEN,);
14170 + bucket = dlm_hash(name, len);
14171 + bucket &= (ls->ls_rsbtbl_size - 1);
14173 + read_lock(&ls->ls_rsbtbl[bucket].lock);
14174 + r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len);
14176 + if (r->res_nodeid != 0 && (flags & MASTER))
14179 + atomic_inc(&r->res_ref);
14181 + read_unlock(&ls->ls_rsbtbl[bucket].lock);
14186 + /* Always create sublocks */
14187 + if (!(flags & CREATE) && !parent) {
14192 + r = allocate_rsb(ls, len);
14196 + INIT_LIST_HEAD(&r->res_subreslist);
14197 + INIT_LIST_HEAD(&r->res_grantqueue);
14198 + INIT_LIST_HEAD(&r->res_convertqueue);
14199 + INIT_LIST_HEAD(&r->res_waitqueue);
14201 + memcpy(r->res_name, name, len);
14202 + r->res_length = len;
14204 + init_rwsem(&r->res_lock);
14205 + atomic_set(&r->res_ref, 1);
14206 + r->res_bucket = bucket;
14209 + r->res_parent = parent;
14210 + r->res_depth = parent->res_depth + 1;
14211 + r->res_root = parent->res_root;
14212 + r->res_nodeid = parent->res_nodeid;
14214 + r->res_parent = NULL;
14215 + r->res_depth = 1;
14217 + r->res_nodeid = -1;
14220 + write_lock(&ls->ls_rsbtbl[bucket].lock);
14221 + tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len);
14223 + atomic_inc(&tmp->res_ref);
14224 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
14228 + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
14229 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
14231 + down_write(&ls->ls_root_lock);
14233 + list_add_tail(&r->res_subreslist,
14234 + &r->res_root->res_subreslist);
14236 + list_add(&r->res_rootlist, &ls->ls_rootres);
14237 + up_write(&ls->ls_root_lock);
14251 + * Add a LKB to a resource's grant/convert/wait queue. in order
14254 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
14256 + struct dlm_lkb *lkb = NULL;
14258 + list_for_each_entry(lkb, head, lkb_statequeue) {
14259 + if (lkb->lkb_rqmode < mode)
14264 + /* No entries in the queue, we are alone */
14265 + list_add_tail(new, head);
14267 + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
14272 + * The rsb res_lock must be held in write when this function is called.
14275 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14277 + DLM_ASSERT(!lkb->lkb_status,
14281 + lkb->lkb_status = type;
14284 + case GDLM_LKSTS_WAITING:
14285 + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE)
14286 + list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
14288 + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
14291 + case GDLM_LKSTS_GRANTED:
14292 + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
14293 + lkb->lkb_grmode);
14296 + case GDLM_LKSTS_CONVERT:
14297 + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE)
14298 + list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
14300 + list_add_tail(&lkb->lkb_statequeue,
14301 + &r->res_convertqueue);
14309 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14311 + down_write(&r->res_lock);
14312 + lkb_enqueue(r, lkb, type);
14313 + up_write(&r->res_lock);
14317 + * The rsb res_lock must be held in write when this function is called.
14320 +int lkb_dequeue(struct dlm_lkb *lkb)
14322 + int status = lkb->lkb_status;
14327 + lkb->lkb_status = 0;
14328 + list_del(&lkb->lkb_statequeue);
14334 +int res_lkb_dequeue(struct dlm_lkb *lkb)
14338 + down_write(&lkb->lkb_resource->res_lock);
14339 + status = lkb_dequeue(lkb);
14340 + up_write(&lkb->lkb_resource->res_lock);
14346 + * The rsb res_lock must be held in write when this function is called.
14349 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14353 + status = lkb_dequeue(lkb);
14354 + lkb_enqueue(r, lkb, type);
14359 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14363 + down_write(&r->res_lock);
14364 + status = lkb_swqueue(r, lkb, type);
14365 + up_write(&r->res_lock);
14369 diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
14370 --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
14371 +++ linux-patched/cluster/dlm/rsb.h 2004-11-03 11:31:56.000000000 +0800
14373 +/******************************************************************************
14374 +*******************************************************************************
14376 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14377 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14379 +** This copyrighted material is made available to anyone wishing to use,
14380 +** modify, copy, or redistribute it subject to the terms and conditions
14381 +** of the GNU General Public License v.2.
14383 +*******************************************************************************
14384 +******************************************************************************/
14386 +#ifndef __RSB_DOT_H__
14387 +#define __RSB_DOT_H__
14392 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
14393 +void release_rsb(struct dlm_rsb *r);
14394 +void release_rsb_locked(struct dlm_rsb *r);
14395 +void hold_rsb(struct dlm_rsb *r);
14396 +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
14397 + int namelen, int flags, struct dlm_rsb **rp);
14398 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb);
14399 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14400 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14401 +int lkb_dequeue(struct dlm_lkb *lkb);
14402 +int res_lkb_dequeue(struct dlm_lkb *lkb);
14403 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14404 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14406 +#endif /* __RSB_DOT_H__ */
14407 diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
14408 --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
14409 +++ linux-patched/cluster/dlm/util.c 2004-11-03 11:31:56.000000000 +0800
14411 +/******************************************************************************
14412 +*******************************************************************************
14414 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14415 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14417 +** This copyrighted material is made available to anyone wishing to use,
14418 +** modify, copy, or redistribute it subject to the terms and conditions
14419 +** of the GNU General Public License v.2.
14421 +*******************************************************************************
14422 +******************************************************************************/
14424 +#include "dlm_internal.h"
14426 +static const uint32_t crc_32_tab[] = {
14427 + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
14428 + 0xe963a535, 0x9e6495a3,
14429 + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
14430 + 0xe7b82d07, 0x90bf1d91,
14431 + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
14432 + 0xf4d4b551, 0x83d385c7,
14433 + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
14434 + 0xfa0f3d63, 0x8d080df5,
14435 + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
14436 + 0xd20d85fd, 0xa50ab56b,
14437 + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
14438 + 0xdcd60dcf, 0xabd13d59,
14439 + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
14440 + 0xcfba9599, 0xb8bda50f,
14441 + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
14442 + 0xc1611dab, 0xb6662d3d,
14443 + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
14444 + 0x9fbfe4a5, 0xe8b8d433,
14445 + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
14446 + 0x91646c97, 0xe6635c01,
14447 + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
14448 + 0x8208f4c1, 0xf50fc457,
14449 + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
14450 + 0x8cd37cf3, 0xfbd44c65,
14451 + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
14452 + 0xa4d1c46d, 0xd3d6f4fb,
14453 + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
14454 + 0xaa0a4c5f, 0xdd0d7cc9,
14455 + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
14456 + 0xb966d409, 0xce61e49f,
14457 + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
14458 + 0xb7bd5c3b, 0xc0ba6cad,
14459 + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
14460 + 0x04db2615, 0x73dc1683,
14461 + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
14462 + 0x0a00ae27, 0x7d079eb1,
14463 + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
14464 + 0x196c3671, 0x6e6b06e7,
14465 + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
14466 + 0x17b7be43, 0x60b08ed5,
14467 + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
14468 + 0x3fb506dd, 0x48b2364b,
14469 + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
14470 + 0x316e8eef, 0x4669be79,
14471 + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
14472 + 0x220216b9, 0x5505262f,
14473 + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
14474 + 0x2cd99e8b, 0x5bdeae1d,
14475 + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
14476 + 0x72076785, 0x05005713,
14477 + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
14478 + 0x7cdcefb7, 0x0bdbdf21,
14479 + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
14480 + 0x6fb077e1, 0x18b74777,
14481 + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
14482 + 0x616bffd3, 0x166ccf45,
14483 + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
14484 + 0x4969474d, 0x3e6e77db,
14485 + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
14486 + 0x47b2cf7f, 0x30b5ffe9,
14487 + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
14488 + 0x54de5729, 0x23d967bf,
14489 + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
14490 + 0x5a05df1b, 0x2d02ef8d
14494 + * dlm_hash - hash an array of data
14495 + * @data: the data to be hashed
14496 + * @len: the length of data to be hashed
14498 + * Copied from GFS.
14500 + * Take some data and convert it to a 32-bit hash.
14502 + * The hash function is a 32-bit CRC of the data. The algorithm uses
14503 + * the crc_32_tab table above.
14505 + * This may not be the fastest hash function, but it does a fair bit better
14506 + * at providing uniform results than the others I've looked at. That's
14507 + * really important for efficient directories.
14509 + * Returns: the hash
14512 +uint32_t dlm_hash(const char *data, int len)
14514 + uint32_t hash = 0xFFFFFFFF;
14516 + for (; len--; data++)
14517 + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
14524 +void print_lkb(struct dlm_lkb *lkb)
14526 + printk("dlm: lkb\n"
14543 + lkb->lkb_lockqueue_state,
14544 + lkb->lkb_lockqueue_flags);
14547 +void print_rsb(struct dlm_rsb *r)
14549 + printk("dlm: rsb\n"
14557 + atomic_read(&r->res_ref));
14560 +void print_request(struct dlm_request *req)
14562 + printk("dlm: request\n"
14569 + req->rr_header.rh_cmd,
14570 + req->rr_header.rh_lkid,
14577 +void print_reply(struct dlm_reply *rp)
14579 + printk("dlm: reply\n"
14586 + rp->rl_header.rh_cmd,
14587 + rp->rl_header.rh_lkid,
14588 + rp->rl_lockstate,
14594 diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
14595 --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
14596 +++ linux-patched/cluster/dlm/util.h 2004-11-03 11:31:56.000000000 +0800
14598 +/******************************************************************************
14599 +*******************************************************************************
14601 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14602 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14604 +** This copyrighted material is made available to anyone wishing to use,
14605 +** modify, copy, or redistribute it subject to the terms and conditions
14606 +** of the GNU General Public License v.2.
14608 +*******************************************************************************
14609 +******************************************************************************/
14611 +#ifndef __UTIL_DOT_H__
14612 +#define __UTIL_DOT_H__
14614 +uint32_t dlm_hash(const char *data, int len);
14616 +void print_lkb(struct dlm_lkb *lkb);
14617 +void print_rsb(struct dlm_rsb *r);
14618 +void print_request(struct dlm_request *req);
14619 +void print_reply(struct dlm_reply *rp);
14622 diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
14623 --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
14624 +++ linux-patched/include/cluster/dlm.h 2004-11-03 11:31:56.000000000 +0800
14626 +/******************************************************************************
14627 +*******************************************************************************
14629 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14630 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14632 +** This copyrighted material is made available to anyone wishing to use,
14633 +** modify, copy, or redistribute it subject to the terms and conditions
14634 +** of the GNU General Public License v.2.
14636 +*******************************************************************************
14637 +******************************************************************************/
14639 +#ifndef __DLM_DOT_H__
14640 +#define __DLM_DOT_H__
14643 + * Interface to DLM - routines and structures to use DLM lockspaces.
14650 +#define DLM_LOCK_IV (-1) /* invalid */
14651 +#define DLM_LOCK_NL (0) /* null */
14652 +#define DLM_LOCK_CR (1) /* concurrent read */
14653 +#define DLM_LOCK_CW (2) /* concurrent write */
14654 +#define DLM_LOCK_PR (3) /* protected read */
14655 +#define DLM_LOCK_PW (4) /* protected write */
14656 +#define DLM_LOCK_EX (5) /* exclusive */
14659 + * Maximum size in bytes of a dlm_lock name
14662 +#define DLM_RESNAME_MAXLEN (64)
14665 + * Size in bytes of Lock Value Block
14668 +#define DLM_LVB_LEN (32)
14671 + * Flags to dlm_new_lockspace
14673 + * DLM_LSF_NOTIMERS
14675 + * Do not subject locks in this lockspace to time-outs.
14678 +#define DLM_LSF_NOTIMERS (1)
14681 + * Flags to dlm_lock
14683 + * DLM_LKF_NOQUEUE
14685 + * Do not queue the lock request on the wait queue if it cannot be granted
14686 + * immediately. If the lock cannot be granted because of this flag, DLM will
14687 + * either return -EAGAIN from the dlm_lock call or will return 0 from
14688 + * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
14690 + * DLM_LKF_CONVERT
14692 + * Indicates a lock conversion request. For conversions the name and namelen
14693 + * are ignored and the lock ID in the LKSB is used to identify the lock.
14697 + * Requests DLM to return the current contents of the lock value block in the
14698 + * lock status block. When this flag is set in a lock conversion from PW or EX
14699 + * modes, DLM assigns the value specified in the lock status block to the lock
14700 + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
14701 + * containing application-specific information.
14705 + * Force a conversion request to be queued, even if it is compatible with
14706 + * the granted modes of other locks on the same resource.
14710 + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
14711 + * previously granted mode.
14713 + * DLM_LKF_IVVALBLK
14715 + * Invalidate/clear the lock value block.
14717 + * DLM_LKF_CONVDEADLK
14719 + * The granted mode of a lock being converted (from a non-NL mode) can be
14720 + * changed to NL in the process of acquiring the requested mode to avoid
14721 + * conversion deadlock.
14723 + * DLM_LKF_PERSISTENT
14725 + * Only relevant to locks originating in userspace. Signals to the ioctl.c code
14726 + * that this lock should not be unlocked when the process exits.
14728 + * DLM_LKF_NODLCKWT
14730 + * This lock is not to be checked for conversion deadlocks.
14732 + * DLM_LKF_NODLCKBLK
14734 + * not yet implemented
14736 + * DLM_LKF_EXPEDITE
14738 + * Used only with new requests for NL mode locks. Tells the lock manager
14739 + * to grant the lock, ignoring other locks in convert and wait queues.
14741 + * DLM_LKF_NOQUEUEBAST
14743 + * Send blocking AST's before returning -EAGAIN to the caller. It is only
14744 + * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
14745 + * NOQUEUE requests otherwise.
14747 + * DLM_LKF_HEADQUE
14749 + * Add a lock to the head of the convert or wait queue rather than the tail.
14751 + * DLM_LKF_NOORDER
14753 + * Disregard the standard grant order rules and grant a lock as soon as it
14754 + * is compatible with other granted locks.
14757 +#define DLM_LKF_NOQUEUE (0x00000001)
14758 +#define DLM_LKF_CANCEL (0x00000002)
14759 +#define DLM_LKF_CONVERT (0x00000004)
14760 +#define DLM_LKF_VALBLK (0x00000008)
14761 +#define DLM_LKF_QUECVT (0x00000010)
14762 +#define DLM_LKF_IVVALBLK (0x00000020)
14763 +#define DLM_LKF_CONVDEADLK (0x00000040)
14764 +#define DLM_LKF_PERSISTENT (0x00000080)
14765 +#define DLM_LKF_NODLCKWT (0x00000100)
14766 +#define DLM_LKF_NODLCKBLK (0x00000200)
14767 +#define DLM_LKF_EXPEDITE (0x00000400)
14768 +#define DLM_LKF_NOQUEUEBAST (0x00000800)
14769 +#define DLM_LKF_HEADQUE (0x00001000)
14770 +#define DLM_LKF_NOORDER (0x00002000)
14771 +#define DLM_LKF_ORPHAN (0x00004000)
14774 + * Some return codes that are not in errno.h
14777 +#define DLM_ECANCEL (0x10001)
14778 +#define DLM_EUNLOCK (0x10002)
14780 +typedef void dlm_lockspace_t;
14783 + * Lock range structure
14786 +struct dlm_range {
14787 + uint64_t ra_start;
14792 + * Lock status block
14794 + * Use this structure to specify the contents of the lock value block. For a
14795 + * conversion request, this structure is used to specify the lock ID of the
14796 + * lock. DLM writes the status of the lock request and the lock ID assigned
14797 + * to the request in the lock status block.
14799 + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
14800 + * It is available when dlm_lock returns.
14802 + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
14803 + * shown for the DLM_LKF_VALBLK flag.
14805 + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
14806 + * it was first demoted to NL to avoid conversion deadlock.
14808 + * sb_status: the returned status of the lock request set prior to AST
14809 + * execution. Possible return values:
14811 + * 0 if lock request was successful
14812 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14813 + * -ENOMEM if there is no memory to process request
14814 + * -EINVAL if there are invalid parameters
14815 + * -DLM_EUNLOCK if unlock request was successful
14819 +#define DLM_SBF_DEMOTED (0x01)
14823 + uint32_t sb_lkid;
14825 + char * sb_lvbptr;
14829 + * These defines are the bits that make up the query code.
14832 +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
14833 + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */
14834 +#define DLM_LOCK_THIS 0x0007
14835 +#define DLM_QUERY_MODE_MASK 0x0007
14837 +/* Bits 3, 4, 5 bitmap of queue(s) to query */
14838 +#define DLM_QUERY_QUEUE_WAIT 0x0008
14839 +#define DLM_QUERY_QUEUE_CONVERT 0x0010
14840 +#define DLM_QUERY_QUEUE_GRANT 0x0020
14841 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
14842 +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
14844 +/* Bit 6, Return only the information that can be established without a network
14845 + * round-trip. The caller must be aware of the implications of this. Useful for
14846 + * just getting the master node id or resource name. */
14847 +#define DLM_QUERY_LOCAL 0x0040
14849 +/* Bits 8 up, query type */
14850 +#define DLM_QUERY_LOCKS_HIGHER 0x0100
14851 +#define DLM_QUERY_LOCKS_LOWER 0x0200
14852 +#define DLM_QUERY_LOCKS_EQUAL 0x0300
14853 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400
14854 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
14855 +#define DLM_QUERY_LOCKS_ALL 0x0600
14856 +#define DLM_QUERY_LOCKS_ORPHAN 0x0700
14857 +#define DLM_QUERY_MASK 0x0F00
14859 +/* GRMODE is the default for mode comparisons,
14860 + RQMODE might also be handy */
14861 +#define DLM_QUERY_GRMODE 0x0000
14862 +#define DLM_QUERY_RQMODE 0x1000
14864 +/* Structures passed into and out of the query */
14866 +struct dlm_lockinfo {
14867 + int lki_lkid; /* Lock ID on originating node */
14868 + int lki_mstlkid; /* Lock ID on master node */
14870 + int lki_node; /* Originating node (not master) */
14871 + int lki_ownpid; /* Owner pid on originating node */
14872 + uint8_t lki_state; /* Queue the lock is on */
14873 + uint8_t lki_grmode; /* Granted mode */
14874 + uint8_t lki_rqmode; /* Requested mode */
14875 + struct dlm_range lki_grrange; /* Granted range, if applicable */
14876 + struct dlm_range lki_rqrange; /* Requested range, if applicable */
14879 +struct dlm_resinfo {
14881 + int rsi_grantcount; /* No. of nodes on grant queue */
14882 + int rsi_convcount; /* No. of nodes on convert queue */
14883 + int rsi_waitcount; /* No. of nodes on wait queue */
14884 + int rsi_masternode; /* Master for this resource */
14885 + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
14886 + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable
14890 +struct dlm_queryinfo {
14891 + struct dlm_resinfo *gqi_resinfo;
14892 + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
14894 + int gqi_locksize; /* input */
14895 + int gqi_lockcount; /* output */
14902 + * Starts and initializes DLM threads and structures. Creation of the first
14903 + * lockspace will call this if it has not been called already.
14905 + * Returns: 0 if successful, -EXXX on error
14908 +int dlm_init(void);
14913 + * Stops DLM threads.
14915 + * Returns: 0 if successful, -EXXX on error
14918 +int dlm_release(void);
14921 + * dlm_new_lockspace
14923 + * Starts a lockspace with the given name. If the named lockspace exists in
14924 + * the cluster, the calling node joins it.
14927 +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
14931 + * dlm_release_lockspace
14933 + * Stop a lockspace.
14936 +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14941 + * Make an asynchronous request to acquire or convert a lock on a named
14944 + * lockspace: context for the request
14945 + * mode: the requested mode of the lock (DLM_LOCK_)
14946 + * lksb: lock status block for input and async return values
14947 + * flags: input flags (DLM_LKF_)
14948 + * name: name of the resource to lock, can be binary
14949 + * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN)
14950 + * parent: the lock ID of a parent lock or 0 if none
14951 + * lockast: function DLM executes when it completes processing the request
14952 + * astarg: argument passed to lockast and bast functions
14953 + * bast: function DLM executes when this lock later blocks another request
14956 + * 0 if request is successfully queued for processing
14957 + * -EINVAL if any input parameters are invalid
14958 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14959 + * -ENOMEM if there is no memory to process request
14960 + * -ENOTCONN if there is a communication error
14962 + * If the call to dlm_lock returns an error then the operation has failed and
14963 + * the AST routine will not be called. If dlm_lock returns 0 it is still
14964 + * possible that the lock operation will fail. The AST routine will be called
14965 + * when the locking is complete and the status is returned in the lksb.
14967 + * If the AST routines or parameters are passed to a conversion operation then
14968 + * they will overwrite those values that were passed to a previous dlm_lock
14971 + * AST routines should not block (at least not for long), but may make
14972 + * any locking calls they please.
14975 +int dlm_lock(dlm_lockspace_t *lockspace,
14977 + struct dlm_lksb *lksb,
14980 + unsigned int namelen,
14982 + void (*lockast) (void *astarg),
14984 + void (*bast) (void *astarg, int mode),
14985 + struct dlm_range *range);
14990 + * Asynchronously release a lock on a resource. The AST routine is called
14991 + * when the resource is successfully unlocked.
14993 + * lockspace: context for the request
14994 + * lkid: the lock ID as returned in the lksb
14995 + * flags: input flags (DLM_LKF_)
14996 + * lksb: if NULL the lksb parameter passed to last lock request is used
14997 + * astarg: the arg used with the completion ast for the unlock
15000 + * 0 if request is successfully queued for processing
15001 + * -EINVAL if any input parameters are invalid
15002 + * -ENOTEMPTY if the lock still has sublocks
15003 + * -EBUSY if the lock is waiting for a remote lock operation
15004 + * -ENOTCONN if there is a communication error
15007 +extern int dlm_unlock(dlm_lockspace_t *lockspace,
15010 + struct dlm_lksb *lksb,
15013 +/* Query interface
15015 + * Query the other holders of a resource, given a known lock ID
15017 + * lockspace: context for the request
15018 + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
15019 + * on the resource. sb_status will contain the status
15020 + * of the request on completion.
15021 + * query: query bitmap see DLM_QUERY_* above
15022 + * qinfo: pointer to dlm_queryinfo structure
15023 + * ast_routine: AST routine to call on completion
15024 + * astarg: argument to AST routine. It is "traditional"
15025 + * to put the qinfo pointer into lksb->sb_lvbptr
15026 + * and pass the lksb in here.
15028 +extern int dlm_query(dlm_lockspace_t *lockspace,
15029 + struct dlm_lksb *lksb,
15031 + struct dlm_queryinfo *qinfo,
15032 + void (ast_routine(void *)),
15036 +void dlm_debug_dump(void);
15037 +void dlm_locks_dump(void);
15039 +#endif /* __KERNEL__ */
15041 +#endif /* __DLM_DOT_H__ */
15042 diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
15043 --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
15044 +++ linux-patched/include/cluster/dlm_device.h 2004-11-03 11:31:56.000000000 +0800
15046 +/******************************************************************************
15047 +*******************************************************************************
15049 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
15050 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
15052 +** This copyrighted material is made available to anyone wishing to use,
15053 +** modify, copy, or redistribute it subject to the terms and conditions
15054 +** of the GNU General Public License v.2.
15056 +*******************************************************************************
15057 +******************************************************************************/
15059 +/* This is the device interface for dlm, most users will use a library
15063 +/* Version of the device interface */
15064 +#define DLM_DEVICE_VERSION_MAJOR 2
15065 +#define DLM_DEVICE_VERSION_MINOR 0
15066 +#define DLM_DEVICE_VERSION_PATCH 0
15068 +/* struct passed to the lock write */
15069 +struct dlm_lock_params {
15070 + uint32_t version[3];
15076 + struct dlm_range range;
15082 + struct dlm_lksb *lksb;
15087 +/* struct read from the "device" fd,
15088 + consists mainly of userspace pointers for the library to use */
15089 +struct dlm_lock_result {
15092 + void (*astaddr)(void *astparam);
15093 + struct dlm_lksb *user_lksb;
15094 + struct dlm_lksb lksb; /* But this has real data in it */
15095 + uint8_t bast_mode; /* Not yet used */
15098 +/* commands passed to the device */
15099 +#define DLM_USER_LOCK 1
15100 +#define DLM_USER_UNLOCK 2
15101 +#define DLM_USER_QUERY 3
15103 +/* Arbitrary length restriction */
15104 +#define MAX_LS_NAME_LEN 64
15106 +/* ioctls on the device */
15107 +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
15108 +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
15109 +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)