1 # Add DLM to the build system
2 diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
3 --- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
4 +++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
5 @@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
10 + tristate "Distributed Lock Manager"
13 + A fully distributed lock manager, providing cluster-wide locking services
14 + and protected lock namespaces for kernel and userland applications.
16 +config CLUSTER_DLM_PROCLOCKS
17 + boolean "/proc/locks support for DLM"
18 + depends on CLUSTER_DLM
21 + If this option is enabled a file will appear in /proc/cluster/dlm_locks.
22 + Write into this "file" the name of a lockspace known to the DLM and then
23 + read out a list of all the resources and locks in that lockspace that are
24 + known to the local node. Note that because the DLM is distributed this may not
25 + be the full lock picture.
28 diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
29 --- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
30 +++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
34 obj-$(CONFIG_CLUSTER) += cman/
35 +obj-$(CONFIG_CLUSTER_DLM) += dlm/
36 diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37 --- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38 +++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
62 +obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63 diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64 --- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65 +++ linux-patched/cluster/dlm/ast.c 2004-07-13 18:57:22.000000000 +0800
67 +/******************************************************************************
68 +*******************************************************************************
70 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
73 +** This copyrighted material is made available to anyone wishing to use,
74 +** modify, copy, or redistribute it subject to the terms and conditions
75 +** of the GNU General Public License v.2.
77 +*******************************************************************************
78 +******************************************************************************/
81 + * This delivers ASTs and checks for dead remote requests and deadlocks.
84 +#include <linux/timer.h>
86 +#include "dlm_internal.h"
88 +#include "lockqueue.h"
92 +#include "lowcomms.h"
93 +#include "midcomms.h"
99 +/* Wake up flags for astd */
100 +#define GDLMD_WAKE_ASTS 1
101 +#define GDLMD_WAKE_TIMER 2
103 +static struct list_head _deadlockqueue;
104 +static struct semaphore _deadlockqueue_lock;
105 +static struct list_head _lockqueue;
106 +static struct semaphore _lockqueue_lock;
107 +static struct timer_list _lockqueue_timer;
108 +static struct list_head _ast_queue;
109 +static struct semaphore _ast_queue_lock;
110 +static wait_queue_head_t _astd_waitchan;
111 +static atomic_t _astd_running;
112 +static long _astd_pid;
113 +static unsigned long _astd_wakeflags;
114 +static struct completion _astd_done;
116 +void add_to_lockqueue(struct dlm_lkb *lkb)
118 + /* Time stamp the entry so we know if it's been waiting too long */
119 + lkb->lkb_lockqueue_time = jiffies;
121 + down(&_lockqueue_lock);
122 + list_add(&lkb->lkb_lockqueue, &_lockqueue);
123 + up(&_lockqueue_lock);
126 +void remove_from_lockqueue(struct dlm_lkb *lkb)
128 + down(&_lockqueue_lock);
129 + list_del(&lkb->lkb_lockqueue);
130 + up(&_lockqueue_lock);
133 +void add_to_deadlockqueue(struct dlm_lkb *lkb)
135 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
137 + lkb->lkb_duetime = jiffies;
138 + down(&_deadlockqueue_lock);
139 + list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
140 + up(&_deadlockqueue_lock);
143 +void remove_from_deadlockqueue(struct dlm_lkb *lkb)
145 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
148 + down(&_deadlockqueue_lock);
149 + list_del(&lkb->lkb_deadlockq);
150 + up(&_deadlockqueue_lock);
152 + /* Invalidate the due time */
153 + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
157 + * deliver an AST to a user
160 +static void deliver_ast(struct dlm_lkb *lkb, uint16_t ast_type)
162 + void (*cast) (long param) = lkb->lkb_astaddr;
163 + void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
165 + if (ast_type == AST_BAST) {
168 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
170 + bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
174 + cast(lkb->lkb_astparam);
179 + * Queue an AST for delivery, this will only deal with
180 + * kernel ASTs, usermode API will piggyback on top of this.
182 + * This can be called in either the user or DLM context.
183 + * ASTs are queued EVEN IF we are already running in dlm_astd
184 + * context as we don't know what other locks are held (eg we could
185 + * be being called from a lock operation that was called from
187 + * If the AST is to be queued remotely then a message is sent to
188 + * the target system via midcomms.
191 +void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode)
193 + struct dlm_request req;
195 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
197 + * Send a message to have an ast queued remotely. Note: we do
198 + * not send remote completion asts, they are handled as part of
199 + * remote lock granting.
201 + if (flags & AST_BAST) {
202 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
203 + req.rr_header.rh_length = sizeof(req);
204 + req.rr_header.rh_flags = 0;
205 + req.rr_header.rh_lkid = lkb->lkb_id;
206 + req.rr_header.rh_lockspace =
207 + lkb->lkb_resource->res_ls->ls_global_id;
208 + req.rr_status = lkb->lkb_retstatus;
209 + req.rr_remlkid = lkb->lkb_remid;
210 + req.rr_rqmode = rqmode;
212 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
213 + lkb->lkb_resource->res_ls->ls_allocation);
214 + } else if (lkb->lkb_retstatus == -EDEADLOCK) {
216 + * We only queue remote Completion ASTs here for error
217 + * completions that happen out of band.
218 + * DEADLOCK is one such.
220 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
221 + req.rr_header.rh_length = sizeof(req);
222 + req.rr_header.rh_flags = 0;
223 + req.rr_header.rh_lkid = lkb->lkb_id;
224 + req.rr_header.rh_lockspace =
225 + lkb->lkb_resource->res_ls->ls_global_id;
226 + req.rr_status = lkb->lkb_retstatus;
227 + req.rr_remlkid = lkb->lkb_remid;
228 + req.rr_rqmode = rqmode;
230 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
231 + lkb->lkb_resource->res_ls->ls_allocation);
235 + * Prepare info that will be returned in ast/bast.
238 + if (flags & AST_BAST) {
239 + lkb->lkb_bastmode = rqmode;
241 + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
243 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
244 + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
246 + lkb->lkb_lksb->sb_flags = 0;
249 + down(&_ast_queue_lock);
250 + if (lkb->lkb_astflags & AST_DEL)
251 + log_print("queue_ast on deleted lkb %x ast %x pid %u",
252 + lkb->lkb_id, lkb->lkb_astflags, current->pid);
253 + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
254 + list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
255 + lkb->lkb_astflags |= flags;
256 + up(&_ast_queue_lock);
258 + /* It is the responsibility of the caller to call wake_astd()
259 + * after it has finished other locking operations that request
260 + * the ASTs to be delivered after */
265 + * Process any LKBs on the AST queue.
268 +static void process_asts(void)
270 + struct dlm_lkb *lkb;
274 + down(&_ast_queue_lock);
275 + if (list_empty(&_ast_queue)) {
276 + up(&_ast_queue_lock);
280 + lkb = list_entry(_ast_queue.next, struct dlm_lkb, lkb_astqueue);
281 + list_del(&lkb->lkb_astqueue);
282 + flags = lkb->lkb_astflags;
283 + lkb->lkb_astflags = 0;
284 + up(&_ast_queue_lock);
286 + if (flags & AST_COMP)
287 + deliver_ast(lkb, AST_COMP);
289 + if (flags & AST_BAST)
290 + deliver_ast(lkb, AST_BAST);
292 + if (flags & AST_DEL) {
293 + struct dlm_rsb *rsb = lkb->lkb_resource;
294 + struct dlm_ls *ls = rsb->res_ls;
296 + DLM_ASSERT(lkb->lkb_astflags == 0,
297 + printk("%x %x\n", lkb->lkb_id, lkb->lkb_astflags););
299 + down_read(&ls->ls_in_recovery);
300 + release_lkb(ls, lkb);
302 + up_read(&ls->ls_in_recovery);
309 +void lockqueue_lkb_mark(struct dlm_ls *ls)
311 + struct dlm_lkb *lkb, *safe;
314 + log_all(ls, "mark waiting requests");
316 + down(&_lockqueue_lock);
318 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
320 + if (lkb->lkb_resource->res_ls != ls)
324 + * These lkb's are new and the master is being looked up. Mark
325 + * the lkb request to be resent. Even if the destination node
326 + * for the request is still living and has our request, it will
327 + * purge all resdir requests in purge_requestqueue. If there's
328 + * a reply to the LOOKUP request in our requestqueue (the reply
329 + * arrived after ls_stop), it is invalid and will be discarded
330 + * in purge_requestqueue, too.
333 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
334 + DLM_ASSERT(lkb->lkb_nodeid == -1,
336 + print_rsb(lkb->lkb_resource););
338 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
344 + * These lkb's have an outstanding request to a bygone node.
345 + * The request will be redirected to the new master node in
346 + * resend_cluster_requests(). Don't mark the request for
347 + * resending if there's a reply for it saved in the
351 + if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
352 + !reply_in_requestqueue(ls, lkb->lkb_id)) {
354 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
357 + * Don't rebuild this lkb on a new rsb in
358 + * rebuild_rsbs_send().
361 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) {
362 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING,
364 + print_rsb(lkb->lkb_resource););
365 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
369 + * This flag indicates to the new master that his lkb
370 + * is in the midst of a convert request and should be
371 + * placed on the granted queue rather than the convert
372 + * queue. We will resend this convert request to the
376 + else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) {
377 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,
379 + print_rsb(lkb->lkb_resource););
380 + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
386 + up(&_lockqueue_lock);
388 + log_all(ls, "marked %d requests", count);
391 +int resend_cluster_requests(struct dlm_ls *ls)
393 + struct dlm_lkb *lkb, *safe;
394 + int error = 0, state, count = 0;
396 + log_all(ls, "resend marked requests");
398 + down(&_lockqueue_lock);
400 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
402 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
403 + log_debug(ls, "resend_cluster_requests: aborted");
408 + if (lkb->lkb_resource->res_ls != ls)
411 + log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
412 + "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
413 + lkb->lkb_lockqueue_state, lkb->lkb_flags);
416 + * Resend/process the lockqueue lkb's (in-progress requests)
417 + * that were flagged at the start of recovery in
418 + * lockqueue_lkb_mark().
421 + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
422 + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
423 + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
424 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
426 + if (lkb->lkb_nodeid == -1) {
428 + * Send lookup to new resdir node.
430 + lkb->lkb_lockqueue_time = jiffies;
431 + send_cluster_request(lkb,
432 + lkb->lkb_lockqueue_state);
435 + else if (lkb->lkb_nodeid != 0) {
437 + * There's a new RSB master (that's not us.)
439 + lkb->lkb_lockqueue_time = jiffies;
440 + send_cluster_request(lkb,
441 + lkb->lkb_lockqueue_state);
446 + * We are the new RSB master for this lkb
449 + state = lkb->lkb_lockqueue_state;
450 + lkb->lkb_lockqueue_state = 0;
451 + /* list_del equals remove_from_lockqueue() */
452 + list_del(&lkb->lkb_lockqueue);
453 + process_remastered_lkb(ls, lkb, state);
459 + up(&_lockqueue_lock);
461 + log_all(ls, "resent %d requests", count);
466 + * Process any LKBs on the Lock queue, this
467 + * just looks at the entries to see if they have been
468 + * on the queue too long and fails the requests if so.
471 +static void process_lockqueue(void)
473 + struct dlm_lkb *lkb, *safe;
477 + down(&_lockqueue_lock);
479 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
480 + ls = lkb->lkb_resource->res_ls;
482 + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
485 + /* Don't time out locks that are in transition */
486 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
489 + if (check_timeout(lkb->lkb_lockqueue_time,
490 + dlm_config.lock_timeout)) {
492 + list_del(&lkb->lkb_lockqueue);
493 + up(&_lockqueue_lock);
494 + cancel_lockop(lkb, -ETIMEDOUT);
495 + down(&_lockqueue_lock);
498 + up(&_lockqueue_lock);
503 + if (atomic_read(&_astd_running))
504 + mod_timer(&_lockqueue_timer,
505 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
508 +/* Look for deadlocks */
509 +static void process_deadlockqueue(void)
511 + struct dlm_lkb *lkb, *safe;
513 + down(&_deadlockqueue_lock);
515 + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
516 + struct dlm_lkb *kill_lkb;
518 + /* Only look at "due" locks */
519 + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
522 + /* Don't look at locks that are in transition */
523 + if (!test_bit(LSFL_LS_RUN,
524 + &lkb->lkb_resource->res_ls->ls_flags))
527 + up(&_deadlockqueue_lock);
529 + /* Lock has hit due time, check for conversion deadlock */
530 + kill_lkb = conversion_deadlock_check(lkb);
532 + cancel_conversion(kill_lkb, -EDEADLOCK);
534 + down(&_deadlockqueue_lock);
536 + up(&_deadlockqueue_lock);
539 +static __inline__ int no_asts(void)
543 + down(&_ast_queue_lock);
544 + ret = list_empty(&_ast_queue);
545 + up(&_ast_queue_lock);
549 +static void lockqueue_timer_fn(unsigned long arg)
551 + set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
552 + wake_up(&_astd_waitchan);
556 + * DLM daemon which delivers asts.
559 +static int dlm_astd(void *data)
561 + daemonize("dlm_astd");
563 + INIT_LIST_HEAD(&_lockqueue);
564 + init_MUTEX(&_lockqueue_lock);
565 + INIT_LIST_HEAD(&_deadlockqueue);
566 + init_MUTEX(&_deadlockqueue_lock);
567 + INIT_LIST_HEAD(&_ast_queue);
568 + init_MUTEX(&_ast_queue_lock);
569 + init_waitqueue_head(&_astd_waitchan);
570 + complete(&_astd_done);
573 + * Set a timer to check the lockqueue for dead locks (and deadlocks).
576 + init_timer(&_lockqueue_timer);
577 + _lockqueue_timer.function = lockqueue_timer_fn;
578 + _lockqueue_timer.data = 0;
579 + mod_timer(&_lockqueue_timer,
580 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
582 + while (atomic_read(&_astd_running)) {
583 + wchan_cond_sleep_intr(_astd_waitchan, no_asts());
585 + if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
588 + if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
589 + process_lockqueue();
590 + if (dlm_config.deadlocktime)
591 + process_deadlockqueue();
595 + if (timer_pending(&_lockqueue_timer))
596 + del_timer(&_lockqueue_timer);
598 + complete(&_astd_done);
603 +void wake_astd(void)
605 + set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
606 + wake_up(&_astd_waitchan);
611 + init_completion(&_astd_done);
612 + atomic_set(&_astd_running, 1);
613 + _astd_pid = kernel_thread(dlm_astd, NULL, 0);
614 + wait_for_completion(&_astd_done);
620 + atomic_set(&_astd_running, 0);
622 + wait_for_completion(&_astd_done);
624 diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
625 --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
626 +++ linux-patched/cluster/dlm/ast.h 2004-07-13 18:57:22.000000000 +0800
628 +/******************************************************************************
629 +*******************************************************************************
631 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
632 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
634 +** This copyrighted material is made available to anyone wishing to use,
635 +** modify, copy, or redistribute it subject to the terms and conditions
636 +** of the GNU General Public License v.2.
638 +*******************************************************************************
639 +******************************************************************************/
641 +#ifndef __AST_DOT_H__
642 +#define __AST_DOT_H__
644 +void lockqueue_lkb_mark(struct dlm_ls *ls);
645 +int resend_cluster_requests(struct dlm_ls *ls);
646 +void add_to_lockqueue(struct dlm_lkb *lkb);
647 +void remove_from_lockqueue(struct dlm_lkb *lkb);
648 +void add_to_deadlockqueue(struct dlm_lkb *lkb);
649 +void remove_from_deadlockqueue(struct dlm_lkb *lkb);
650 +void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode);
651 +void wake_astd(void);
652 +int astd_start(void);
653 +void astd_stop(void);
655 +#endif /* __AST_DOT_H__ */
656 diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
657 --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
658 +++ linux-patched/cluster/dlm/config.c 2004-07-13 18:57:22.000000000 +0800
660 +/******************************************************************************
661 +*******************************************************************************
663 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
664 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
666 +** This copyrighted material is made available to anyone wishing to use,
667 +** modify, copy, or redistribute it subject to the terms and conditions
668 +** of the GNU General Public License v.2.
670 +*******************************************************************************
671 +******************************************************************************/
673 +#include <linux/module.h>
674 +#include <linux/proc_fs.h>
676 +#include "dlm_internal.h"
677 +#include "lowcomms.h"
680 +/* Config file defaults */
681 +#define DEFAULT_TCP_PORT 21064
682 +#define DEFAULT_LOCK_TIMEOUT 30
683 +#define DEFAULT_BUFFER_SIZE 4096
684 +#define DEFAULT_RSBTBL_SIZE 256
685 +#define DEFAULT_LKBTBL_SIZE 1024
686 +#define DEFAULT_DIRTBL_SIZE 512
687 +#define DEFAULT_MAX_CONNECTIONS 128
688 +#define DEFAULT_DEADLOCKTIME 10
690 +struct config_info dlm_config = {
691 + .tcp_port = DEFAULT_TCP_PORT,
692 + .lock_timeout = DEFAULT_LOCK_TIMEOUT,
693 + .buffer_size = DEFAULT_BUFFER_SIZE,
694 + .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
695 + .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
696 + .dirtbl_size = DEFAULT_DIRTBL_SIZE,
697 + .max_connections = DEFAULT_MAX_CONNECTIONS,
698 + .deadlocktime = DEFAULT_DEADLOCKTIME,
702 +static struct config_proc_info {
707 + .name = "tcp_port",
708 + .value = &dlm_config.tcp_port,
711 + .name = "lock_timeout",
712 + .value = &dlm_config.lock_timeout,
715 + .name = "buffer_size",
716 + .value = &dlm_config.buffer_size,
719 + .name = "rsbtbl_size",
720 + .value = &dlm_config.rsbtbl_size,
723 + .name = "lkbtbl_size",
724 + .value = &dlm_config.lkbtbl_size,
727 + .name = "dirtbl_size",
728 + .value = &dlm_config.dirtbl_size,
731 + .name = "max_connections",
732 + .value = &dlm_config.max_connections,
735 + .name = "deadlocktime",
736 + .value = &dlm_config.deadlocktime,
739 +static struct proc_dir_entry *dlm_dir;
741 +static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
742 + int *eof, void *data)
744 + struct config_proc_info *cinfo = data;
745 + return snprintf(page, count, "%d\n", *cinfo->value);
748 +static int dlm_config_write_proc(struct file *file, const char *buffer,
749 + unsigned long count, void *data)
751 + struct config_proc_info *cinfo = data;
755 + value = simple_strtoul(buffer, &end, 10);
757 + *cinfo->value = value;
761 +int dlm_config_init(void)
764 + struct proc_dir_entry *pde;
766 + dlm_dir = proc_mkdir("cluster/config/dlm", 0);
770 + dlm_dir->owner = THIS_MODULE;
772 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
773 + pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
775 + pde->data = &config_proc[i];
776 + pde->write_proc = dlm_config_write_proc;
777 + pde->read_proc = dlm_config_read_proc;
783 +void dlm_config_exit(void)
787 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
788 + remove_proc_entry(config_proc[i].name, dlm_dir);
789 + remove_proc_entry("cluster/config/dlm", NULL);
791 diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
792 --- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
793 +++ linux-patched/cluster/dlm/config.h 2004-07-13 18:57:22.000000000 +0800
795 +/******************************************************************************
796 +*******************************************************************************
798 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
799 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
801 +** This copyrighted material is made available to anyone wishing to use,
802 +** modify, copy, or redistribute it subject to the terms and conditions
803 +** of the GNU General Public License v.2.
805 +*******************************************************************************
806 +******************************************************************************/
808 +#ifndef __CONFIG_DOT_H__
809 +#define __CONFIG_DOT_H__
811 +struct config_info {
818 + int max_connections;
822 +extern struct config_info dlm_config;
823 +extern int dlm_config_init(void);
824 +extern void dlm_config_exit(void);
826 +#endif /* __CONFIG_DOT_H__ */
827 diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
828 --- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
829 +++ linux-patched/cluster/dlm/device.c 2004-07-13 18:57:22.000000000 +0800
831 +/******************************************************************************
832 +*******************************************************************************
834 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
835 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
837 +** This copyrighted material is made available to anyone wishing to use,
838 +** modify, copy, or redistribute it subject to the terms and conditions
839 +** of the GNU General Public License v.2.
841 +*******************************************************************************
842 +******************************************************************************/
847 + * This is the userland interface to the DLM.
849 + * The locking is done via a misc char device (find the
850 + * registered minor number in /proc/misc).
852 + * User code should not use this interface directly but
853 + * call the library routines in libdlm.a instead.
857 +#include <linux/miscdevice.h>
858 +#include <linux/init.h>
859 +#include <linux/wait.h>
860 +#include <linux/module.h>
861 +#include <linux/file.h>
862 +#include <linux/fs.h>
863 +#include <linux/poll.h>
864 +#include <linux/signal.h>
865 +#include <linux/spinlock.h>
866 +#include <asm/ioctls.h>
868 +#include "dlm_internal.h"
871 +extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int);
872 +static struct file_operations _dlm_fops;
873 +static const char *name_prefix="dlm";
874 +static struct list_head user_ls_list;
876 +/* Flags in li_flags */
877 +#define LI_FLAG_COMPLETE 1
878 +#define LI_FLAG_FIRSTLOCK 2
882 + struct dlm_lksb li_lksb;
883 + wait_queue_head_t li_waitq;
884 + unsigned long li_flags;
885 + void __user *li_astparam;
886 + void __user *li_astaddr;
887 + void __user *li_bastaddr;
888 + struct file_info *li_file;
889 + struct dlm_lksb __user *li_user_lksb;
890 + struct semaphore li_firstlock;
891 + struct dlm_queryinfo *li_queryinfo;
892 + struct dlm_queryinfo __user *li_user_queryinfo;
895 +/* A queued AST no less */
897 + struct dlm_lock_result result;
898 + struct dlm_queryinfo *queryinfo;
899 + struct dlm_queryinfo __user *user_queryinfo;
900 + struct list_head list;
903 +/* One of these per userland lockspace */
905 + void *ls_lockspace;
906 + atomic_t ls_refcnt;
907 + long ls_flags; /* bit 1 means LS has been deleted */
909 + /* Passed into misc_register() */
910 + struct miscdevice ls_miscinfo;
911 + struct list_head ls_list;
914 +/* misc_device info for the control device */
915 +static struct miscdevice ctl_device;
918 + * Stuff we hang off the file struct.
919 + * The first two are to cope with unlocking all the
920 + * locks held by a process when it dies.
923 + struct list_head fi_lkb_list; /* List of active lkbs */
924 + spinlock_t fi_lkb_lock;
925 + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
926 + spinlock_t fi_ast_lock;
927 + wait_queue_head_t fi_wait;
928 + struct user_ls *fi_ls;
929 + atomic_t fi_refcnt; /* Number of users */
930 + unsigned long fi_flags; /* Bit 1 means the device is open */
934 +/* get and put ops for file_info.
935 + Actually I don't really like "get" and "put", but everyone
936 + else seems to use them and I can't think of anything
937 + nicer at the moment */
938 +static void get_file_info(struct file_info *f)
940 + atomic_inc(&f->fi_refcnt);
943 +static void put_file_info(struct file_info *f)
945 + if (atomic_dec_and_test(&f->fi_refcnt))
949 +/* Find a lockspace struct given the device minor number */
950 +static struct user_ls *find_lockspace(int minor)
952 + struct user_ls *lsinfo;
954 + list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
956 + if (lsinfo->ls_miscinfo.minor == minor)
962 +static void add_lockspace_to_list(struct user_ls *lsinfo)
964 + list_add(&lsinfo->ls_list, &user_ls_list);
967 +/* Register a lockspace with the DLM and create a misc
968 + device for userland to access it */
969 +static int register_lockspace(char *name, struct user_ls **ls)
971 + struct user_ls *newls;
975 + namelen = strlen(name)+strlen(name_prefix)+2;
977 + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
980 + memset(newls, 0, sizeof(struct user_ls));
982 + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
983 + if (!newls->ls_miscinfo.name) {
987 + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
989 + status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
990 + strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
991 + &newls->ls_lockspace, 0);
994 + kfree(newls->ls_miscinfo.name);
999 + newls->ls_miscinfo.fops = &_dlm_fops;
1000 + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1002 + status = misc_register(&newls->ls_miscinfo);
1004 + log_print("failed to register misc device for %s", name);
1005 + dlm_release_lockspace(newls->ls_lockspace, 0);
1006 + kfree(newls->ls_miscinfo.name);
1012 + add_lockspace_to_list(newls);
1017 +static int unregister_lockspace(struct user_ls *lsinfo, int force)
1021 + status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1025 + status = misc_deregister(&lsinfo->ls_miscinfo);
1029 + list_del(&lsinfo->ls_list);
1030 + kfree(lsinfo->ls_miscinfo.name);
1036 +/* Add it to userland's AST queue */
1037 +static void add_to_astqueue(struct lock_info *li, void *astaddr)
1039 + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1043 + ast->result.astparam = li->li_astparam;
1044 + ast->result.astaddr = astaddr;
1045 + ast->result.user_lksb = li->li_user_lksb;
1046 + ast->result.cmd = li->li_cmd;
1047 + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1049 + /* These two will both be NULL for anything other than queries */
1050 + ast->queryinfo = li->li_queryinfo;
1051 + ast->user_queryinfo = li->li_user_queryinfo;
1053 + spin_lock(&li->li_file->fi_ast_lock);
1054 + list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1055 + spin_unlock(&li->li_file->fi_ast_lock);
1056 + wake_up_interruptible(&li->li_file->fi_wait);
1059 +static void bast_routine(void *param, int mode)
1061 + struct lock_info *li = param;
1064 + add_to_astqueue(li, li->li_bastaddr);
1069 + * This is the kernel's AST routine.
1070 + * All lock, unlock & query operations complete here.
1071 + * The only synchronous ops are those done during device close.
1073 +static void ast_routine(void *param)
1075 + struct lock_info *li = param;
1077 + /* Param may be NULL if a persistent lock is unlocked by someone else */
1081 + /* If it's an async request then post data to the user's AST queue. */
1082 + if (li->li_astaddr) {
1084 + /* Only queue AST if the device is still open */
1085 + if (test_bit(1, &li->li_file->fi_flags))
1086 + add_to_astqueue(li, li->li_astaddr);
1088 + /* If it's a new lock operation that failed, then
1089 + * remove it from the owner queue and free the
1090 + * lock_info. The DLM will not free the LKB until this
1091 + * AST has completed.
1093 + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1094 + li->li_lksb.sb_status != 0) {
1095 + struct dlm_lkb *lkb;
1097 + /* Wait till dlm_lock() has finished */
1098 + down(&li->li_firstlock);
1099 + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1101 + spin_lock(&li->li_file->fi_lkb_lock);
1102 + list_del(&lkb->lkb_ownerqueue);
1103 + spin_unlock(&li->li_file->fi_lkb_lock);
1105 + up(&li->li_firstlock);
1106 + put_file_info(li->li_file);
1110 + /* Free unlocks & queries */
1111 + if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1112 + li->li_cmd == DLM_USER_QUERY) {
1113 + put_file_info(li->li_file);
1118 + /* Synchronous request, just wake up the caller */
1119 + set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1120 + wake_up_interruptible(&li->li_waitq);
1125 + * Wait for the lock op to complete and return the status.
1127 +static int wait_for_ast(struct lock_info *li)
1129 + /* Wait for the AST routine to complete */
1130 + set_task_state(current, TASK_INTERRUPTIBLE);
1131 + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1134 + set_task_state(current, TASK_RUNNING);
1136 + return li->li_lksb.sb_status;
1140 +/* Open on control device */
1141 +static int dlm_ctl_open(struct inode *inode, struct file *file)
1146 +/* Close on control device */
1147 +static int dlm_ctl_close(struct inode *inode, struct file *file)
1152 +/* Open on lockspace device */
1153 +static int dlm_open(struct inode *inode, struct file *file)
1155 + struct file_info *f;
1156 + struct user_ls *lsinfo;
1158 + lsinfo = find_lockspace(iminor(inode));
1162 + f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1166 + atomic_inc(&lsinfo->ls_refcnt);
1167 + INIT_LIST_HEAD(&f->fi_lkb_list);
1168 + INIT_LIST_HEAD(&f->fi_ast_list);
1169 + spin_lock_init(&f->fi_ast_lock);
1170 + spin_lock_init(&f->fi_lkb_lock);
1171 + init_waitqueue_head(&f->fi_wait);
1172 + f->fi_ls = lsinfo;
1173 + atomic_set(&f->fi_refcnt, 1);
1174 + set_bit(1, &f->fi_flags);
1176 + file->private_data = f;
1181 +/* Check the user's version matches ours */
1182 +static int check_version(struct dlm_lock_params *params)
1184 + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1185 + (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1186 + params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1188 + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1189 + params->version[0],
1190 + params->version[1],
1191 + params->version[2],
1192 + DLM_DEVICE_VERSION_MAJOR,
1193 + DLM_DEVICE_VERSION_MINOR,
1194 + DLM_DEVICE_VERSION_PATCH);
1200 +/* Close on lockspace device */
1201 +static int dlm_close(struct inode *inode, struct file *file)
1203 + struct file_info *f = file->private_data;
1204 + struct lock_info li;
1207 + struct dlm_lkb *lkb, *safe;
1208 + struct user_ls *lsinfo;
1209 + DECLARE_WAITQUEUE(wq, current);
1211 + lsinfo = find_lockspace(iminor(inode));
1215 + /* Mark this closed so that ASTs will not be delivered any more */
1216 + clear_bit(1, &f->fi_flags);
1218 + /* Block signals while we are doing this */
1219 + sigfillset(&allsigs);
1220 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1222 + /* We use our own lock_info struct here, so that any
1223 + * outstanding "real" ASTs will be delivered with the
1224 + * corresponding "real" params, thus freeing the lock_info
1225 + * that belongs to the lock. This catches the corner case where
1226 + * a lock is BUSY when we try to unlock it here
1228 + memset(&li, 0, sizeof(li));
1229 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1230 + init_waitqueue_head(&li.li_waitq);
1231 + add_wait_queue(&li.li_waitq, &wq);
1234 + * Free any outstanding locks, they are on the
1235 + * list in LIFO order so there should be no problems
1236 + * about unlocking parents before children.
1237 + * Although we don't remove the lkbs from the list here
1238 + * (what would be the point?), foreach_safe is needed
1239 + * because the lkbs are freed during dlm_unlock operations
1241 + list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
1245 + struct lock_info *old_li;
1247 + /* Make a copy of this pointer. If all goes well we will
1248 + * free it later. if not it will be left to the AST routine
1251 + old_li = (struct lock_info *)lkb->lkb_astparam;
1253 + /* Don't unlock persistent locks */
1254 + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1255 + list_del(&lkb->lkb_ownerqueue);
1257 + /* But tidy our references in it */
1259 + lkb->lkb_astparam = (long)NULL;
1264 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1266 + /* If it's not granted then cancel the request.
1267 + * If the lock was WAITING then it will be dropped,
1268 + * if it was converting then it will be reverted to GRANTED,
1269 + * then we will unlock it.
1271 + lock_status = lkb->lkb_status;
1273 + if (lock_status != GDLM_LKSTS_GRANTED)
1274 + flags = DLM_LKF_CANCEL;
1276 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1278 + /* Must wait for it to complete as the next lock could be its
1281 + wait_for_ast(&li);
1283 + /* If it was waiting for a conversion, it will
1284 + now be granted so we can unlock it properly */
1285 + if (lock_status == GDLM_LKSTS_CONVERT) {
1287 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1288 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
1291 + wait_for_ast(&li);
1293 + /* Unlock succeeded, free the lock_info struct. */
1294 + if (status == 0) {
1300 + remove_wait_queue(&li.li_waitq, &wq);
1302 + /* If this is the last reference, and the lockspace has been deleted
1303 + then free the struct */
1304 + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1308 + /* Restore signals */
1309 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1310 + recalc_sigpending();
1316 + * ioctls to create/remove lockspaces, and check how many
1317 + * outstanding ASTs there are against a particular LS.
1319 +static int dlm_ioctl(struct inode *inode, struct file *file,
1320 + uint command, ulong u)
1322 + struct file_info *fi = file->private_data;
1323 + int status = -EINVAL;
1325 + struct list_head *tmp_list;
1327 + switch (command) {
1329 + /* Are there any ASTs for us to read?
1330 + * Warning, this returns the number of messages (ASTs)
1331 + * in the queue, NOT the number of bytes to read
1335 + spin_lock(&fi->fi_ast_lock);
1336 + list_for_each(tmp_list, &fi->fi_ast_list)
1338 + spin_unlock(&fi->fi_ast_lock);
1339 + status = put_user(count, (int *)u);
1350 + * ioctls to create/remove lockspaces.
1352 +static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1353 + uint command, ulong u)
1355 + int status = -EINVAL;
1356 + char ls_name[MAX_LS_NAME_LEN];
1357 + struct user_ls *lsinfo;
1360 + switch (command) {
1361 + case DLM_CREATE_LOCKSPACE:
1362 + if (!capable(CAP_SYS_ADMIN))
1365 + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1367 + status = register_lockspace(ls_name, &lsinfo);
1369 + /* If it succeeded then return the minor number */
1371 + status = lsinfo->ls_miscinfo.minor;
1374 + case DLM_FORCE_RELEASE_LOCKSPACE:
1377 + case DLM_RELEASE_LOCKSPACE:
1378 + if (!capable(CAP_SYS_ADMIN))
1381 + lsinfo = find_lockspace(u);
1384 + status = unregister_lockspace(lsinfo, force);
1394 +/* Deal with the messy stuff of copying a web of structs
1395 + from kernel space to userspace */
1396 +static int copy_query_result(struct ast_info *ast)
1398 + int status = -EFAULT;
1399 + struct dlm_queryinfo qi;
1401 + /* Get the pointers to userspace structs */
1402 + if (copy_from_user(&qi, ast->user_queryinfo,
1403 + sizeof(struct dlm_queryinfo)))
1406 + /* TODO: does this deref a user pointer? */
1407 + if (put_user(ast->queryinfo->gqi_lockcount,
1408 + &ast->user_queryinfo->gqi_lockcount))
1411 + if (qi.gqi_resinfo) {
1412 + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1413 + sizeof(struct dlm_resinfo)))
1417 + if (qi.gqi_lockinfo) {
1418 + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1419 + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1425 + if (ast->queryinfo->gqi_lockinfo)
1426 + kfree(ast->queryinfo->gqi_lockinfo);
1428 + if (ast->queryinfo->gqi_resinfo)
1429 + kfree(ast->queryinfo->gqi_resinfo);
1431 + kfree(ast->queryinfo);
1437 +/* Read call, might block if no ASTs are waiting.
1438 + * It will only ever return one message at a time, regardless
1439 + * of how many are pending.
1441 +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1443 + struct file_info *fi = file->private_data;
1444 + struct ast_info *ast;
1446 + DECLARE_WAITQUEUE(wait, current);
1448 + if (count < sizeof(struct dlm_lock_result))
1451 + spin_lock(&fi->fi_ast_lock);
1452 + if (list_empty(&fi->fi_ast_list)) {
1454 + /* No waiting ASTs.
1455 + * Return EOF if the lockspace has been deleted.
1457 + if (test_bit(1, &fi->fi_ls->ls_flags))
1460 + if (file->f_flags & O_NONBLOCK) {
1461 + spin_unlock(&fi->fi_ast_lock);
1465 + add_wait_queue(&fi->fi_wait, &wait);
1468 + set_current_state(TASK_INTERRUPTIBLE);
1469 + if (list_empty(&fi->fi_ast_list) &&
1470 + !signal_pending(current)) {
1472 + spin_unlock(&fi->fi_ast_lock);
1474 + spin_lock(&fi->fi_ast_lock);
1478 + current->state = TASK_RUNNING;
1479 + remove_wait_queue(&fi->fi_wait, &wait);
1481 + if (signal_pending(current)) {
1482 + spin_unlock(&fi->fi_ast_lock);
1483 + return -ERESTARTSYS;
1487 + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1488 + list_del(&ast->list);
1489 + spin_unlock(&fi->fi_ast_lock);
1491 + ret = sizeof(struct dlm_lock_result);
1492 + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1495 + /* If it was a query then copy the result block back here */
1496 + if (ast->queryinfo) {
1497 + int status = copy_query_result(ast);
1506 +static unsigned int dlm_poll(struct file *file, poll_table *wait)
1508 + struct file_info *fi = file->private_data;
1510 + poll_wait(file, &fi->fi_wait, wait);
1512 + spin_lock(&fi->fi_ast_lock);
1513 + if (!list_empty(&fi->fi_ast_list)) {
1514 + spin_unlock(&fi->fi_ast_lock);
1515 + return POLLIN | POLLRDNORM;
1518 + spin_unlock(&fi->fi_ast_lock);
1522 +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1524 + struct lock_info *li;
1527 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1531 + get_file_info(fi);
1532 + li->li_user_lksb = kparams->lksb;
1533 + li->li_astparam = kparams->astparam;
1534 + li->li_bastaddr = kparams->bastaddr;
1535 + li->li_astaddr = kparams->astaddr;
1538 + li->li_cmd = kparams->cmd;
1539 + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1541 + if (copy_from_user(&li->li_lksb, kparams->lksb,
1542 + sizeof(struct dlm_lksb))) {
1546 + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1548 + /* Allocate query structs */
1550 + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1551 + if (!li->li_queryinfo)
1554 + /* Mainly to get gqi_lock buffer size */
1555 + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1556 + sizeof(struct dlm_queryinfo))) {
1561 + /* Overwrite userspace pointers we just copied with kernel space ones */
1562 + if (li->li_queryinfo->gqi_resinfo) {
1563 + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1564 + if (!li->li_queryinfo->gqi_resinfo)
1567 + if (li->li_queryinfo->gqi_lockinfo) {
1568 + li->li_queryinfo->gqi_lockinfo =
1569 + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1571 + if (!li->li_queryinfo->gqi_lockinfo)
1575 + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1577 + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1578 + kparams->flags, /* query */
1583 + kfree(li->li_queryinfo);
1590 +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1591 + const char *buffer)
1593 + struct lock_info *li;
1595 + char name[DLM_RESNAME_MAXLEN];
1598 + * Validate things that we need to have correct.
1600 + if (kparams->namelen > DLM_RESNAME_MAXLEN)
1603 + if (!kparams->astaddr)
1606 + if (!kparams->lksb)
1609 + /* Get the lock name */
1610 + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1611 + kparams->namelen)) {
1615 + /* For conversions, the lock will already have a lock_info
1616 + block squirreled away in astparam */
1617 + if (kparams->flags & DLM_LKF_CONVERT) {
1618 + struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1622 + li = (struct lock_info *)lkb->lkb_astparam;
1624 + /* Only override these if they are provided */
1625 + if (li->li_user_lksb)
1626 + li->li_user_lksb = kparams->lksb;
1627 + if (li->li_astparam)
1628 + li->li_astparam = kparams->astparam;
1629 + if (li->li_bastaddr)
1630 + li->li_bastaddr = kparams->bastaddr;
1631 + if (li->li_bastaddr)
1632 + li->li_astaddr = kparams->astaddr;
1636 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1640 + li->li_user_lksb = kparams->lksb;
1641 + li->li_astparam = kparams->astparam;
1642 + li->li_bastaddr = kparams->bastaddr;
1643 + li->li_astaddr = kparams->astaddr;
1646 + li->li_cmd = kparams->cmd;
1647 + li->li_queryinfo = NULL;
1649 + /* semaphore to allow us to complete our work before
1650 + the AST routine runs. In fact we only need (and use) this
1651 + when the initial lock fails */
1652 + init_MUTEX_LOCKED(&li->li_firstlock);
1653 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1655 + get_file_info(fi);
1658 + /* Copy the user's LKSB into kernel space,
1659 + needed for conversions & value block operations */
1660 + if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
1661 + sizeof(struct dlm_lksb)))
1665 + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1666 + kparams->flags, name, kparams->namelen,
1670 + li->li_bastaddr ? bast_routine : NULL,
1671 + kparams->range.ra_end ? &kparams->range : NULL);
1673 + /* If it succeeded (this far) with a new lock then keep track of
1674 + it on the file's lkb list */
1675 + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1676 + struct dlm_lkb *lkb;
1677 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1680 + spin_lock(&fi->fi_lkb_lock);
1681 + list_add(&lkb->lkb_ownerqueue,
1682 + &fi->fi_lkb_list);
1683 + spin_unlock(&fi->fi_lkb_lock);
1686 + log_print("failed to get lkb for new lock");
1688 + up(&li->li_firstlock);
1694 +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1696 + struct lock_info *li;
1697 + struct dlm_lkb *lkb;
1700 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1705 + li = (struct lock_info *)lkb->lkb_astparam;
1707 + li->li_user_lksb = kparams->lksb;
1708 + li->li_astparam = kparams->astparam;
1709 + li->li_cmd = kparams->cmd;
1711 + /* Have to do it here cos the lkb may not exist after
1713 + spin_lock(&fi->fi_lkb_lock);
1714 + list_del(&lkb->lkb_ownerqueue);
1715 + spin_unlock(&fi->fi_lkb_lock);
1717 + /* Use existing lksb & astparams */
1718 + status = dlm_unlock(fi->fi_ls->ls_lockspace,
1720 + kparams->flags, NULL, NULL);
1725 +/* Write call, submit a locking request */
1726 +static ssize_t dlm_write(struct file *file, const char __user *buffer,
1727 + size_t count, loff_t *ppos)
1729 + struct file_info *fi = file->private_data;
1730 + struct dlm_lock_params kparams;
1735 + if (count < sizeof(kparams))
1738 + /* Has the lockspace been deleted */
1739 + if (test_bit(1, &fi->fi_ls->ls_flags))
1742 + /* Get the command info */
1743 + if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1746 + if (check_version(&kparams))
1749 + /* Block signals while we are doing this */
1750 + sigfillset(&allsigs);
1751 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1753 + switch (kparams.cmd)
1755 + case DLM_USER_LOCK:
1756 + status = do_user_lock(fi, &kparams, buffer);
1759 + case DLM_USER_UNLOCK:
1760 + status = do_user_unlock(fi, &kparams);
1763 + case DLM_USER_QUERY:
1764 + status = do_user_query(fi, &kparams);
1771 + /* Restore signals */
1772 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1773 + recalc_sigpending();
1781 +void dlm_device_free_devices()
1783 + struct user_ls *tmp;
1784 + struct user_ls *lsinfo;
1786 + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
1787 + misc_deregister(&lsinfo->ls_miscinfo);
1789 + /* Tidy up, but don't delete the lsinfo struct until
1790 + all the users have closed their devices */
1791 + list_del(&lsinfo->ls_list);
1792 + kfree(lsinfo->ls_miscinfo.name);
1793 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1797 +static struct file_operations _dlm_fops = {
1799 + .release = dlm_close,
1800 + .ioctl = dlm_ioctl,
1802 + .write = dlm_write,
1804 + .owner = THIS_MODULE,
1807 +static struct file_operations _dlm_ctl_fops = {
1808 + .open = dlm_ctl_open,
1809 + .release = dlm_ctl_close,
1810 + .ioctl = dlm_ctl_ioctl,
1811 + .owner = THIS_MODULE,
1815 + * Create control device
1817 +int dlm_device_init(void)
1821 + INIT_LIST_HEAD(&user_ls_list);
1823 + ctl_device.name = "dlm-control";
1824 + ctl_device.fops = &_dlm_ctl_fops;
1825 + ctl_device.minor = MISC_DYNAMIC_MINOR;
1827 + r = misc_register(&ctl_device);
1829 + log_print("misc_register failed for DLM control device");
1836 +void dlm_device_exit(void)
1838 + misc_deregister(&ctl_device);
1842 + * Overrides for Emacs so that we follow Linus's tabbing style.
1843 + * Emacs will notice this stuff at the end of the file and automatically
1844 + * adjust the settings for this buffer only. This must remain at the end
1846 + * ---------------------------------------------------------------------------
1847 + * Local variables:
1848 + * c-file-style: "linux"
1851 diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
1852 --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
1853 +++ linux-patched/cluster/dlm/device.h 2004-07-13 18:57:22.000000000 +0800
1855 +/******************************************************************************
1856 +*******************************************************************************
1858 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1859 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1861 +** This copyrighted material is made available to anyone wishing to use,
1862 +** modify, copy, or redistribute it subject to the terms and conditions
1863 +** of the GNU General Public License v.2.
1865 +*******************************************************************************
1866 +******************************************************************************/
1868 +#ifndef __DEVICE_DOT_H__
1869 +#define __DEVICE_DOT_H__
1871 +extern void dlm_device_free_devices(void);
1873 +#endif /* __DEVICE_DOT_H__ */
1874 diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
1875 --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
1876 +++ linux-patched/cluster/dlm/dir.c 2004-07-13 18:57:22.000000000 +0800
1878 +/******************************************************************************
1879 +*******************************************************************************
1881 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1882 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1884 +** This copyrighted material is made available to anyone wishing to use,
1885 +** modify, copy, or redistribute it subject to the terms and conditions
1886 +** of the GNU General Public License v.2.
1888 +*******************************************************************************
1889 +******************************************************************************/
1891 +#include "dlm_internal.h"
1893 +#include "lockspace.h"
1894 +#include "lowcomms.h"
1895 +#include "reccomms.h"
1897 +#include "config.h"
1898 +#include "memory.h"
1899 +#include "recover.h"
1903 + uint32_t rm_nodeid;
1904 + uint16_t rm_length;
1910 + * We use the upper 16 bits of the hash value to select the directory node.
1911 + * Low bits are used for distribution of rsb's among hash buckets on each node.
1913 + * From the hash value, we are interested in arriving at a final value between
1914 + * zero and the number of nodes minus one (num_nodes - 1).
1916 + * To accomplish this scaling, we take the nearest power of two larger than
1917 + * num_nodes and subtract one to create a bit mask. The mask is applied to the
1918 + * hash, reducing the range to nearer the final range.
1920 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
1921 + * num_nodes to the previously masked hash value.
1923 + * This value in the desired range is used as an offset into the sorted list of
1924 + * nodeid's to give the particular nodeid of the directory node.
1927 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length)
1929 + struct list_head *tmp;
1930 + struct dlm_csb *csb = NULL;
1931 + uint32_t hash, node, n = 0, nodeid;
1933 + if (ls->ls_num_nodes == 1) {
1934 + nodeid = our_nodeid();
1938 + hash = dlm_hash(name, length);
1939 + node = (hash >> 16) & ls->ls_nodes_mask;
1940 + node %= ls->ls_num_nodes;
1942 + list_for_each(tmp, &ls->ls_nodes) {
1945 + csb = list_entry(tmp, struct dlm_csb, list);
1949 + DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
1950 + ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
1951 + nodeid = csb->node->nodeid;
1957 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb)
1959 + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
1963 +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
1967 + val = dlm_hash(name, len);
1968 + val &= (ls->ls_dirtbl_size - 1);
1973 +static void add_resdata_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
1977 + bucket = dir_hash(ls, de->name, de->length);
1978 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
1981 +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
1982 + int namelen, uint32_t bucket)
1984 + struct dlm_direntry *de;
1986 + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
1987 + if (de->length == namelen && !memcmp(name, de->name, namelen))
1995 +void remove_resdata(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen)
1997 + struct dlm_direntry *de;
2000 + bucket = dir_hash(ls, name, namelen);
2002 + write_lock(&ls->ls_dirtbl[bucket].lock);
2004 + de = search_bucket(ls, name, namelen, bucket);
2007 + log_debug(ls, "remove from %u none", nodeid);
2011 + if (de->master_nodeid != nodeid) {
2012 + log_debug(ls, "remove from %u ID %u",
2013 + nodeid, de->master_nodeid);
2017 + list_del(&de->list);
2020 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2023 +void dlm_dir_clear(struct dlm_ls *ls)
2025 + struct list_head *head;
2026 + struct dlm_direntry *de;
2029 + for (i = 0; i < ls->ls_dirtbl_size; i++) {
2030 + head = &ls->ls_dirtbl[i].list;
2031 + while (!list_empty(head)) {
2032 + de = list_entry(head->next, struct dlm_direntry, list);
2033 + list_del(&de->list);
2039 +static void resmov_in(struct resmov *rm, char *buf)
2041 + struct resmov tmp;
2043 + memcpy(&tmp, buf, sizeof(struct resmov));
2045 + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2046 + rm->rm_length = be16_to_cpu(tmp.rm_length);
2049 +int dlm_dir_rebuild_local(struct dlm_ls *ls)
2051 + struct dlm_csb *csb;
2052 + struct dlm_direntry *de;
2053 + struct dlm_rcom *rc;
2054 + struct resmov mov, last_mov;
2055 + char *b, *last_name;
2056 + int error = -ENOMEM, count = 0;
2058 + log_all(ls, "rebuild resource directory");
2060 + dlm_dir_clear(ls);
2062 + rc = allocate_rcom_buffer(ls);
2066 + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2070 + list_for_each_entry(csb, &ls->ls_nodes, list) {
2071 + last_mov.rm_length = 0;
2073 + error = dlm_recovery_stopped(ls);
2077 + memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2078 + rc->rc_datalen = last_mov.rm_length;
2080 + error = rcom_send_message(ls, csb->node->nodeid,
2081 + RECCOMM_RECOVERNAMES, rc, 1);
2088 + * pick each res out of buffer
2094 + resmov_in(&mov, b);
2095 + b += sizeof(struct resmov);
2097 + /* Length of 0 with a non-zero nodeid marks the
2098 + * end of the list */
2099 + if (!mov.rm_length && mov.rm_nodeid)
2102 + /* This is just the end of the block */
2103 + if (!mov.rm_length)
2107 + de = allocate_resdata(ls, mov.rm_length);
2111 + de->master_nodeid = mov.rm_nodeid;
2112 + de->length = mov.rm_length;
2114 + memcpy(de->name, b, mov.rm_length);
2115 + b += mov.rm_length;
2117 + add_resdata_to_hash(ls, de);
2121 + memset(last_name, 0, DLM_RESNAME_MAXLEN);
2122 + memcpy(last_name, de->name, de->length);
2129 + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2132 + log_all(ls, "rebuilt %d resources", count);
2138 + free_rcom_buffer(rc);
2145 + * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as
2146 + * many resource names as can fit in the buffer.
2149 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2150 + char *outbuf, int outlen, uint32_t nodeid)
2152 + struct list_head *list;
2153 + struct dlm_rsb *start_rsb = NULL, *rsb;
2154 + int offset = 0, start_namelen, error;
2156 + struct resmov tmp;
2157 + uint32_t dir_nodeid;
2160 + * Find the rsb where we left off (or start again)
2163 + start_namelen = inlen;
2164 + start_name = inbuf;
2166 + if (start_namelen > 1) {
2167 + error = find_or_create_rsb(ls, NULL, start_name,
2168 + start_namelen, 0, &start_rsb);
2169 + DLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2170 + release_rsb(start_rsb);
2174 + * Send rsb names for rsb's we're master of and whose directory node
2175 + * matches the requesting node.
2178 + down_read(&ls->ls_rec_rsblist);
2180 + list = start_rsb->res_rootlist.next;
2182 + list = ls->ls_rootres.next;
2184 + for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2185 + rsb = list_entry(list, struct dlm_rsb, res_rootlist);
2186 + if (rsb->res_nodeid)
2189 + dir_nodeid = get_directory_nodeid(rsb);
2190 + if (dir_nodeid != nodeid)
2193 + if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) {
2194 + /* Write end-of-block record */
2195 + memset(&tmp, 0, sizeof(struct resmov));
2196 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2197 + offset += sizeof(struct resmov);
2201 + memset(&tmp, 0, sizeof(struct resmov));
2202 + tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2203 + tmp.rm_length = cpu_to_be16(rsb->res_length);
2205 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2206 + offset += sizeof(struct resmov);
2208 + memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2209 + offset += rsb->res_length;
2213 + * If we've reached the end of the list (and there's room) write a
2214 + * terminating record.
2217 + if ((list == &ls->ls_rootres) &&
2218 + (offset + sizeof(struct resmov) <= outlen)) {
2220 + memset(&tmp, 0, sizeof(struct resmov));
2221 + /* This only needs to be non-zero */
2222 + tmp.rm_nodeid = cpu_to_be32(1);
2223 + /* and this must be zero */
2224 + tmp.rm_length = 0;
2225 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2226 + offset += sizeof(struct resmov);
2230 + up_read(&ls->ls_rec_rsblist);
2234 +static int get_resdata(struct dlm_ls *ls, uint32_t nodeid, char *name,
2235 + int namelen, uint32_t *r_nodeid, int recovery)
2237 + struct dlm_direntry *de, *tmp;
2240 + bucket = dir_hash(ls, name, namelen);
2242 + write_lock(&ls->ls_dirtbl[bucket].lock);
2243 + de = search_bucket(ls, name, namelen, bucket);
2245 + *r_nodeid = de->master_nodeid;
2246 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2250 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2252 + de = allocate_resdata(ls, namelen);
2256 + de->master_nodeid = nodeid;
2257 + de->length = namelen;
2258 + memcpy(de->name, name, namelen);
2260 + write_lock(&ls->ls_dirtbl[bucket].lock);
2261 + tmp = search_bucket(ls, name, namelen, bucket);
2266 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2268 + *r_nodeid = de->master_nodeid;
2269 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2275 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2276 + uint32_t *r_nodeid)
2278 + return get_resdata(ls, nodeid, name, namelen, r_nodeid, 0);
2281 +int dlm_dir_lookup_recovery(struct dlm_ls *ls, uint32_t nodeid, char *name,
2282 + int namelen, uint32_t *r_nodeid)
2284 + return get_resdata(ls, nodeid, name, namelen, r_nodeid, 1);
2288 + * The node with lowest id queries all nodes to determine when all are done.
2289 + * All other nodes query the low nodeid for this.
2292 +int dlm_dir_rebuild_wait(struct dlm_ls *ls)
2296 + if (ls->ls_low_nodeid == our_nodeid()) {
2297 + error = dlm_wait_status_all(ls, RESDIR_VALID);
2299 + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2301 + error = dlm_wait_status_low(ls, RESDIR_ALL_VALID);
2305 diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2306 --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
2307 +++ linux-patched/cluster/dlm/dir.h 2004-07-13 18:57:22.000000000 +0800
2309 +/******************************************************************************
2310 +*******************************************************************************
2312 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2313 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2315 +** This copyrighted material is made available to anyone wishing to use,
2316 +** modify, copy, or redistribute it subject to the terms and conditions
2317 +** of the GNU General Public License v.2.
2319 +*******************************************************************************
2320 +******************************************************************************/
2322 +#ifndef __DIR_DOT_H__
2323 +#define __DIR_DOT_H__
2325 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2326 + uint32_t *r_nodeid);
2327 +int dlm_dir_lookup_recovery(struct dlm_ls *ls, uint32_t nodeid, char *name,
2328 + int namelen, uint32_t *r_nodeid);
2329 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length);
2330 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb);
2331 +void remove_resdata(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen);
2332 +int dlm_dir_rebuild_local(struct dlm_ls *ls);
2333 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2334 + char *outbuf, int outlen, uint32_t nodeid);
2335 +int dlm_dir_rebuild_wait(struct dlm_ls * ls);
2336 +void dlm_dir_clear(struct dlm_ls *ls);
2337 +void dlm_dir_dump(struct dlm_ls *ls);
2339 +#endif /* __DIR_DOT_H__ */
2340 diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2341 --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
2342 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-07-13 18:57:22.000000000 +0800
2344 +/******************************************************************************
2345 +*******************************************************************************
2347 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2348 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2350 +** This copyrighted material is made available to anyone wishing to use,
2351 +** modify, copy, or redistribute it subject to the terms and conditions
2352 +** of the GNU General Public License v.2.
2354 +*******************************************************************************
2355 +******************************************************************************/
2357 +#ifndef __DLM_INTERNAL_DOT_H__
2358 +#define __DLM_INTERNAL_DOT_H__
2361 + * This is the main header file to be included in each DLM source file.
2364 +#define DLM_RELEASE_NAME "<CVS>"
2366 +#include <linux/slab.h>
2367 +#include <linux/sched.h>
2368 +#include <asm/semaphore.h>
2369 +#include <linux/types.h>
2370 +#include <linux/spinlock.h>
2371 +#include <linux/vmalloc.h>
2372 +#include <asm/uaccess.h>
2373 +#include <linux/list.h>
2374 +#include <linux/errno.h>
2375 +#include <linux/random.h>
2377 +#include <cluster/dlm.h>
2378 +#include <cluster/dlm_device.h>
2379 +#include <cluster/service.h>
2389 +#if (BITS_PER_LONG == 64)
2390 +#define PRIu64 "lu"
2391 +#define PRId64 "ld"
2392 +#define PRIo64 "lo"
2393 +#define PRIx64 "lx"
2394 +#define PRIX64 "lX"
2395 +#define SCNu64 "lu"
2396 +#define SCNd64 "ld"
2397 +#define SCNo64 "lo"
2398 +#define SCNx64 "lx"
2399 +#define SCNX64 "lX"
2401 +#define PRIu64 "Lu"
2402 +#define PRId64 "Ld"
2403 +#define PRIo64 "Lo"
2404 +#define PRIx64 "Lx"
2405 +#define PRIX64 "LX"
2406 +#define SCNu64 "Lu"
2407 +#define SCNd64 "Ld"
2408 +#define SCNo64 "Lo"
2409 +#define SCNx64 "Lx"
2410 +#define SCNX64 "LX"
2413 +#define wchan_cond_sleep_intr(chan, sleep_cond) \
2416 + DECLARE_WAITQUEUE(__wait_chan, current); \
2417 + current->state = TASK_INTERRUPTIBLE; \
2418 + add_wait_queue(&chan, &__wait_chan); \
2419 + if ((sleep_cond)) \
2421 + remove_wait_queue(&chan, &__wait_chan); \
2422 + current->state = TASK_RUNNING; \
2426 +static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2428 + return time_after(jiffies, stamp + seconds * HZ);
2432 +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2434 +#define log_all(ls, fmt, args...) \
2436 + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2437 + dlm_debug_log(ls, fmt, ##args); \
2440 +#define log_error log_all
2444 +#if defined(DLM_DEBUG)
2445 +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2447 +#define log_debug(ls, fmt, args...)
2450 +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2452 +#define log_debug log_all
2456 +#define DLM_ASSERT(x, do) \
2460 + dlm_locks_dump(); \
2461 + dlm_debug_dump(); \
2462 + printk("\nDLM: Assertion failed on line %d of file %s\n" \
2463 + "DLM: assertion: \"%s\"\n" \
2464 + "DLM: time = %lu\n", \
2465 + __LINE__, __FILE__, #x, jiffies); \
2469 + panic("DLM: Record message above and reboot.\n"); \
2479 +struct dlm_lkbtable;
2480 +struct dlm_rsbtable;
2481 +struct dlm_dirtable;
2482 +struct dlm_direntry;
2483 +struct dlm_recover;
2485 +struct dlm_request;
2488 +struct dlm_query_request;
2489 +struct dlm_query_reply;
2492 +struct dlm_direntry {
2493 + struct list_head list;
2494 + uint32_t master_nodeid;
2499 +struct dlm_dirtable {
2500 + struct list_head list;
2504 +struct dlm_rsbtable {
2505 + struct list_head list;
2509 +struct dlm_lkbtable {
2510 + struct list_head list;
2516 + * Cluster node (per node in cluster)
2520 + struct list_head list;
2522 + int refcount; /* num csb's referencing */
2526 + * Cluster System Block (per node in a ls)
2530 + struct list_head list; /* per-lockspace node list */
2531 + struct dlm_node * node; /* global node structure */
2532 + int gone_event; /* event id when node removed */
2534 + /* recovery stats for debugging */
2536 + uint32_t names_send_count;
2537 + uint32_t names_send_msgid;
2538 + uint32_t names_recv_count;
2539 + uint32_t names_recv_msgid;
2540 + uint32_t locks_send_count;
2541 + uint32_t locks_send_msgid;
2542 + uint32_t locks_recv_count;
2543 + uint32_t locks_recv_msgid;
2547 + * Used to save and manage recovery state for a lockspace.
2550 +struct dlm_recover {
2551 + struct list_head list;
2552 + uint32_t * nodeids;
2558 + * Elements in the range array
2561 +#define GR_RANGE_START (0)
2562 +#define GR_RANGE_END (1)
2563 +#define RQ_RANGE_START (2)
2564 +#define RQ_RANGE_END (3)
2567 + * Lockspace structure
2570 +#define LSFL_WORK (0)
2571 +#define LSFL_LS_RUN (1)
2572 +#define LSFL_LS_STOP (2)
2573 +#define LSFL_LS_START (3)
2574 +#define LSFL_LS_FINISH (4)
2575 +#define LSFL_RECCOMM_WAIT (5)
2576 +#define LSFL_RECCOMM_READY (6)
2577 +#define LSFL_NOTIMERS (7)
2578 +#define LSFL_FINISH_RECOVERY (8)
2579 +#define LSFL_RESDIR_VALID (9)
2580 +#define LSFL_ALL_RESDIR_VALID (10)
2581 +#define LSFL_NODES_VALID (11)
2582 +#define LSFL_ALL_NODES_VALID (12)
2583 +#define LSFL_REQUEST_WARN (13)
2584 +#define LSFL_NOCONVGRANT (14)
2586 +#define LSST_NONE (0)
2587 +#define LSST_INIT (1)
2588 +#define LSST_INIT_DONE (2)
2589 +#define LSST_CLEAR (3)
2590 +#define LSST_WAIT_START (4)
2591 +#define LSST_RECONFIG_DONE (5)
2594 + struct list_head ls_list; /* list of lockspaces */
2595 + uint32_t ls_local_id; /* local unique lockspace ID */
2596 + uint32_t ls_global_id; /* global unique lockspace ID */
2597 + int ls_allocation; /* Memory allocation policy */
2598 + unsigned long ls_flags; /* LSFL_ */
2600 + struct dlm_rsbtable * ls_rsbtbl;
2601 + uint32_t ls_rsbtbl_size;
2603 + struct dlm_lkbtable * ls_lkbtbl;
2604 + uint32_t ls_lkbtbl_size;
2606 + struct dlm_dirtable * ls_dirtbl;
2607 + uint32_t ls_dirtbl_size;
2609 + struct list_head ls_nodes; /* current nodes in RC */
2610 + struct list_head ls_nodes_gone; /* dead node list, recovery */
2611 + uint32_t ls_num_nodes; /* number of nodes in RC */
2612 + uint32_t ls_nodes_mask;
2613 + uint32_t ls_low_nodeid;
2615 + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2616 + parent lock racing with a
2619 + struct list_head ls_deadlockq; /* List of locks in conversion
2620 + ordered by duetime. for
2621 + deadlock detection */
2623 + /* recovery related */
2625 + struct list_head ls_recover; /* dlm_recover structs */
2626 + spinlock_t ls_recover_lock;
2628 + int ls_last_start;
2629 + int ls_last_finish;
2630 + int ls_state; /* recovery states */
2632 + struct rw_semaphore ls_in_recovery; /* block local requests */
2633 + struct list_head ls_requestqueue;/* queue remote requests */
2635 + struct dlm_rcom * ls_rcom; /* recovery comms */
2636 + uint32_t ls_rcom_msgid;
2637 + struct semaphore ls_rcom_lock;
2639 + struct list_head ls_recover_list;
2640 + spinlock_t ls_recover_list_lock;
2641 + int ls_recover_list_count;
2642 + wait_queue_head_t ls_wait_general;
2644 + struct list_head ls_rootres; /* List of root resources */
2646 + struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
2647 + operations happening while
2650 + struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
2651 + in grant_after_purge() which
2652 + runs outside recovery */
2654 + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2665 +#define RESFL_NEW_MASTER (0)
2666 +#define RESFL_RECOVER_LIST (1)
2667 +#define RESFL_MASTER (2)
2670 + struct list_head res_hashchain;
2671 + uint32_t res_bucket;
2673 + struct dlm_ls * res_ls; /* The owning lockspace */
2675 + struct list_head res_rootlist; /* List of root rsb's */
2677 + struct list_head res_subreslist; /* List of all sub-resources
2678 + for this root rsb */
2680 + uint8_t res_depth; /* Depth in resource tree */
2681 + unsigned long res_flags; /* Flags, RESFL_ */
2683 + struct list_head res_grantqueue;
2684 + struct list_head res_convertqueue;
2685 + struct list_head res_waitqueue;
2687 + uint32_t res_nodeid; /* nodeid of master node */
2689 + struct dlm_rsb * res_root; /* root rsb if a subresource */
2690 + struct dlm_rsb * res_parent; /* parent rsb (if any) */
2692 + atomic_t res_ref; /* Number of lkb's */
2693 + uint16_t res_remasterid; /* ID used during remaster */
2695 + struct list_head res_recover_list; /* General list for use
2696 + during recovery */
2697 + int res_recover_msgid;
2698 + int res_newlkid_expect;
2700 + struct rw_semaphore res_lock;
2702 + char * res_lvbptr; /* Lock value block */
2704 + uint8_t res_length;
2705 + char res_name[1]; /* <res_length> bytes */
2709 + * Lock block. To avoid confusion, where flags mirror the
2710 + * public flags, they should have the same value.
2713 +#define GDLM_LKSTS_NEW (0)
2714 +#define GDLM_LKSTS_WAITING (1)
2715 +#define GDLM_LKSTS_GRANTED (2)
2716 +#define GDLM_LKSTS_CONVERT (3)
2718 +#define GDLM_LKFLG_VALBLK (0x00000008)
2719 +#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
2720 +#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
2721 +#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
2723 +/* Internal flags */
2724 +#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present
2725 + (remote protocol only) */
2726 +#define GDLM_LKFLG_MSTCPY (0x00002000)
2727 +#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
2728 +#define GDLM_LKFLG_LQCONVERT (0x00008000)
2729 +#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
2730 +#define GDLM_LKFLG_DEMOTED (0x00020000)
2731 +#define GDLM_LKFLG_RESENT (0x00040000)
2732 +#define GDLM_LKFLG_NOREBUILD (0x00080000)
2734 +#define AST_COMP (1)
2735 +#define AST_BAST (2)
2736 +#define AST_DEL (4)
2739 + uint32_t lkb_flags;
2740 + uint16_t lkb_status; /* grant, wait, convert */
2741 + int8_t lkb_rqmode; /* requested lock mode */
2742 + int8_t lkb_grmode; /* granted lock mode */
2743 + uint32_t lkb_retstatus; /* status to return in lksb */
2744 + uint32_t lkb_id; /* our lock ID */
2745 + struct dlm_lksb * lkb_lksb; /* status block of caller */
2746 + struct list_head lkb_idtbl_list; /* lockidtbl */
2747 + struct list_head lkb_statequeue; /* rsb's g/c/w queue */
2748 + struct dlm_rsb * lkb_resource;
2749 + struct list_head lkb_ownerqueue; /* list of locks owned by a
2751 + struct dlm_lkb * lkb_parent; /* parent lock if any */
2752 + atomic_t lkb_childcnt; /* number of children */
2754 + struct list_head lkb_lockqueue; /* queue of locks waiting
2755 + for remote reply */
2756 + int lkb_lockqueue_state; /* reason on lockqueue */
2757 + int lkb_lockqueue_flags; /* as passed into
2759 + unsigned long lkb_lockqueue_time; /* time lkb went on the
2761 + unsigned long lkb_duetime; /* for deadlock detection */
2763 + uint32_t lkb_remid; /* id on remote partner */
2764 + uint32_t lkb_nodeid; /* id of remote partner */
2766 + void * lkb_astaddr;
2767 + void * lkb_bastaddr;
2768 + long lkb_astparam;
2769 + struct list_head lkb_astqueue; /* locks with asts to deliver */
2770 + uint16_t lkb_astflags; /* COMP, BAST, DEL */
2771 + uint8_t lkb_bastmode; /* requested mode */
2772 + uint8_t lkb_highbast; /* highest mode bast sent for */
2774 + struct dlm_request * lkb_request;
2776 + struct list_head lkb_deadlockq; /* ls_deadlockq list */
2778 + char * lkb_lvbptr; /* points to lksb lvb on local
2779 + lock, allocated lvb on
2781 + uint64_t * lkb_range; /* Points to an array of 64 bit
2782 + numbers that represent the
2783 + requested and granted ranges
2784 + of the lock. NULL implies
2785 + 0-ffffffffffffffff */
2789 + * Header part of the mid-level comms system. All packets start with
2790 + * this header so we can identify them. The comms packet can
2791 + * contain many of these structs but they are split into individual
2792 + * work units before being passed to the lockqueue routines.
2793 + * below this are the structs that this is a header for
2796 +struct dlm_header {
2797 + uint8_t rh_cmd; /* What we are */
2798 + uint8_t rh_flags; /* maybe just a pad */
2799 + uint16_t rh_length; /* Length of struct (so we can
2800 + send many in 1 message) */
2801 + uint32_t rh_lkid; /* Lock ID tag: ie the local
2802 + (requesting) lock ID */
2803 + uint32_t rh_lockspace; /* Lockspace ID */
2807 + * This is the struct used in a remote lock/unlock/convert request
2808 + * The mid-level comms API should turn this into native byte order.
2809 + * Most "normal" lock operations will use these two structs for
2810 + * communications. Recovery operations use their own structs
2811 + * but still with the gd_req_header on the front.
2814 +struct dlm_request {
2815 + struct dlm_header rr_header;
2816 + uint32_t rr_remlkid; /* Remote lock ID */
2817 + uint32_t rr_remparid; /* Parent's remote lock ID */
2818 + uint32_t rr_flags; /* Flags from lock/convert req*/
2819 + uint64_t rr_range_start; /* Yes, these are in the right
2821 + uint64_t rr_range_end;
2822 + uint32_t rr_status; /* Status to return if this is
2824 + uint8_t rr_rqmode; /* Requested lock mode */
2825 + uint8_t rr_asts; /* Whether the LKB has ASTs */
2826 + char rr_lvb[DLM_LVB_LEN];
2827 + char rr_name[1]; /* As long as needs be. Only
2828 + used for directory lookups.
2829 + The length of this can be
2830 + worked out from the packet
2835 + * This is the struct returned by a remote lock/unlock/convert request
2836 + * The mid-level comms API should turn this into native byte order.
2840 + struct dlm_header rl_header;
2841 + uint32_t rl_lockstate; /* Whether request was
2842 + queued/granted/waiting */
2843 + uint32_t rl_nodeid; /* nodeid of lock master */
2844 + uint32_t rl_status; /* Status to return to caller */
2845 + uint32_t rl_lkid; /* Remote lkid */
2846 + char rl_lvb[DLM_LVB_LEN];
2850 + * Recovery comms message
2854 + struct dlm_header rc_header; /* 32 byte aligned */
2855 + uint32_t rc_msgid;
2856 + uint16_t rc_datalen;
2857 + uint8_t rc_expanded;
2858 + uint8_t rc_subcmd; /* secondary command */
2859 + char rc_buf[1]; /* first byte of data goes here
2860 + and extends beyond here for
2861 + another datalen - 1 bytes.
2862 + rh_length is set to sizeof
2863 + dlm_rcom + datalen - 1 */
2867 +/* A remote query: GDLM_REMCMD_QUERY */
2869 +struct dlm_query_request {
2870 + struct dlm_header rq_header;
2871 + uint32_t rq_mstlkid; /* LockID on master node */
2872 + uint32_t rq_query; /* query from the user */
2873 + uint32_t rq_maxlocks; /* max number of locks we can
2877 +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
2878 +/* There may be subsequent blocks of
2879 + lock info in GDLM_REMCMD_QUERYCONT messages which just have
2880 + a normal header. The last of these will have rh_flags set to
2881 + GDLM_REMFLAG_ENDQUERY
2884 +struct dlm_query_reply {
2885 + struct dlm_header rq_header;
2886 + uint32_t rq_numlocks; /* Number of locks in reply */
2887 + uint32_t rq_startlock; /* Which lock this block starts
2888 + at (for multi-block replies) */
2889 + uint32_t rq_status;
2891 + /* Resource information */
2892 + uint32_t rq_grantcount; /* No. of nodes on grantqueue */
2893 + uint32_t rq_convcount; /* No. of nodes on convertq */
2894 + uint32_t rq_waitcount; /* No. of nodes on waitqueue */
2895 + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB
2901 + * Lockqueue wait lock states
2904 +#define GDLM_LQSTATE_WAIT_RSB 1
2905 +#define GDLM_LQSTATE_WAIT_CONVERT 2
2906 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3
2907 +#define GDLM_LQSTATE_WAIT_UNLOCK 4
2909 +/* Commands sent across the comms link */
2910 +#define GDLM_REMCMD_LOOKUP 1
2911 +#define GDLM_REMCMD_LOCKREQUEST 2
2912 +#define GDLM_REMCMD_UNLOCKREQUEST 3
2913 +#define GDLM_REMCMD_CONVREQUEST 4
2914 +#define GDLM_REMCMD_LOCKREPLY 5
2915 +#define GDLM_REMCMD_LOCKGRANT 6
2916 +#define GDLM_REMCMD_SENDBAST 7
2917 +#define GDLM_REMCMD_SENDCAST 8
2918 +#define GDLM_REMCMD_REM_RESDATA 9
2919 +#define GDLM_REMCMD_RECOVERMESSAGE 20
2920 +#define GDLM_REMCMD_RECOVERREPLY 21
2921 +#define GDLM_REMCMD_QUERY 30
2922 +#define GDLM_REMCMD_QUERYREPLY 31
2924 +/* Set in rh_flags when this is the last block of
2925 + query information. Note this could also be the first
2927 +#define GDLM_REMFLAG_ENDQUERY 1
2933 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...);
2934 +void dlm_debug_dump(void);
2935 +void dlm_locks_dump(void);
2937 +#endif /* __DLM_INTERNAL_DOT_H__ */
2938 diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
2939 --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
2940 +++ linux-patched/cluster/dlm/lkb.c 2004-07-13 18:57:22.000000000 +0800
2942 +/******************************************************************************
2943 +*******************************************************************************
2945 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2946 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2948 +** This copyrighted material is made available to anyone wishing to use,
2949 +** modify, copy, or redistribute it subject to the terms and conditions
2950 +** of the GNU General Public License v.2.
2952 +*******************************************************************************
2953 +******************************************************************************/
2958 + * Allocate and free locks on the lock ID table.
2960 + * This is slightly naff but I don't really like the
2961 + * VMS lockidtbl stuff as it uses a realloced array
2962 + * to hold the locks in. I think this is slightly better
2965 + * Any better suggestions gratefully received. Patrick
2969 +#include "dlm_internal.h"
2970 +#include "lockqueue.h"
2972 +#include "config.h"
2974 +#include "memory.h"
2975 +#include "lockspace.h"
2979 + * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
2982 +static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
2984 + uint16_t bucket = lkid & 0xFFFF;
2985 + struct dlm_lkb *lkb;
2987 + if (bucket >= ls->ls_lkbtbl_size)
2990 + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){
2991 + if (lkb->lkb_id == lkid)
2999 + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3000 + * random number between 0 and lockidtbl_size-1. This random number specifies
3001 + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3002 + * assigned per-bucket id.
3004 + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3005 + * against the lkid of all lkb's in the bucket to avoid duplication.
3009 +struct dlm_lkb *create_lkb(struct dlm_ls *ls)
3011 + struct dlm_lkb *lkb;
3015 + lkb = allocate_lkb(ls);
3020 + get_random_bytes(&bucket, sizeof(bucket));
3021 + bucket &= (ls->ls_lkbtbl_size - 1);
3023 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3025 + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
3027 + if (__find_lock_by_id(ls, lkid)) {
3028 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3032 + lkb->lkb_id = lkid;
3033 + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
3034 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3040 + * Free LKB and remove it from the lockidtbl.
3041 + * NB - this always frees the lkb whereas release_rsb doesn't free an
3042 + * rsb unless its reference count is zero.
3045 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
3047 + uint16_t bucket = lkb->lkb_id & 0xFFFF;
3049 + if (lkb->lkb_status) {
3050 + log_error(ls, "release lkb with status %u", lkb->lkb_status);
3055 + if (lkb->lkb_parent)
3056 + atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3058 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3059 + list_del(&lkb->lkb_idtbl_list);
3060 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3062 + /* if this is not a master copy then lvbptr points into the user's
3063 + * lksb, so don't free it */
3064 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3065 + free_lvb(lkb->lkb_lvbptr);
3067 + if (lkb->lkb_range)
3068 + free_range(lkb->lkb_range);
3073 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3075 + struct dlm_lkb *lkb;
3076 + uint16_t bucket = lkid & 0xFFFF;
3078 + read_lock(&ls->ls_lkbtbl[bucket].lock);
3079 + lkb = __find_lock_by_id(ls, lkid);
3080 + read_unlock(&ls->ls_lkbtbl[bucket].lock);
3085 +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid)
3087 + struct dlm_ls *lspace = find_lockspace_by_local_id(ls);
3088 + return find_lock_by_id(lspace, lkid);
3092 + * Initialise the range parts of an LKB.
3095 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end)
3097 + int ret = -ENOMEM;
3100 + * if this wasn't already a range lock, make it one
3102 + if (!lkb->lkb_range) {
3103 + lkb->lkb_range = allocate_range(lspace);
3104 + if (!lkb->lkb_range)
3108 + * This is needed for conversions that contain ranges where the
3109 + * original lock didn't but it's harmless for new locks too.
3111 + lkb->lkb_range[GR_RANGE_START] = 0LL;
3112 + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3115 + lkb->lkb_range[RQ_RANGE_START] = start;
3116 + lkb->lkb_range[RQ_RANGE_END] = end;
3123 diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3124 --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
3125 +++ linux-patched/cluster/dlm/lkb.h 2004-07-13 18:57:22.000000000 +0800
3127 +/******************************************************************************
3128 +*******************************************************************************
3130 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3131 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3133 +** This copyrighted material is made available to anyone wishing to use,
3134 +** modify, copy, or redistribute it subject to the terms and conditions
3135 +** of the GNU General Public License v.2.
3137 +*******************************************************************************
3138 +******************************************************************************/
3140 +#ifndef __LKB_DOT_H__
3141 +#define __LKB_DOT_H__
3143 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid);
3144 +struct dlm_lkb *create_lkb(struct dlm_ls *ls);
3145 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb);
3146 +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid);
3147 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end);
3149 +#endif /* __LKB_DOT_H__ */
3150 diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3151 --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
3152 +++ linux-patched/cluster/dlm/locking.c 2004-07-13 18:57:22.000000000 +0800
3154 +/******************************************************************************
3155 +*******************************************************************************
3157 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3158 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3160 +** This copyrighted material is made available to anyone wishing to use,
3161 +** modify, copy, or redistribute it subject to the terms and conditions
3162 +** of the GNU General Public License v.2.
3164 +*******************************************************************************
3165 +******************************************************************************/
3170 + * This is where the main work of the DLM goes on
3174 +#include "dlm_internal.h"
3175 +#include "lockqueue.h"
3176 +#include "locking.h"
3177 +#include "lockspace.h"
3182 +#include "memory.h"
3186 +extern struct list_head lslist;
3188 +#define MAX(a, b) (((a) > (b)) ? (a) : (b))
2191 + * Lock compatibility matrix - thanks Steve
3192 + * UN = Unlocked state. Not really a state, used as a flag
3193 + * PD = Padding. Used to make the matrix a nice power of two in size
3194 + * Other states are the same as the VMS DLM.
3195 + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3198 +#define modes_compat(gr, rq) \
3199 + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3201 +const int __dlm_compat_matrix[8][8] = {
3202 + /* UN NL CR CW PR PW EX PD */
3203 + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3204 + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3205 + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3206 + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3207 + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3208 + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3209 + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3210 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3214 + * Compatibility matrix for conversions with QUECVT set.
3215 + * Granted mode is the row; requested mode is the column.
3216 + * Usage: matrix[grmode+1][rqmode+1]
3219 +const int __quecvt_compat_matrix[8][8] = {
3220 + /* UN NL CR CW PR PW EX PD */
3221 + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3222 + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3223 + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3224 + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3225 + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3226 + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3227 + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3228 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3232 + * This defines the direction of transfer of LVB data.
3233 + * Granted mode is the row; requested mode is the column.
3234 + * Usage: matrix[grmode+1][rqmode+1]
3235 + * 1 = LVB is returned to the caller
3236 + * 0 = LVB is written to the resource
3237 + * -1 = nothing happens to the LVB
3240 +const int __lvb_operations[8][8] = {
3241 + /* UN NL CR CW PR PW EX PD*/
3242 + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3243 + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3244 + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3245 + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3246 + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3247 + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3248 + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3249 + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3252 +static void grant_lock(struct dlm_lkb * lkb, int send_remote);
3253 +static void send_blocking_asts(struct dlm_rsb * rsb, struct dlm_lkb * lkb);
3254 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3255 +static int convert_lock(struct dlm_ls * ls, int mode, struct dlm_lksb *lksb,
3256 + int flags, void *ast, void *astarg, void *bast,
3257 + struct dlm_range *range);
3258 +static int dlm_lock_stage1(struct dlm_ls * lspace, struct dlm_lkb * lkb, int flags,
3259 + char *name, int namelen);
3262 +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
3264 + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
3266 + if (lkb->lkb_id == first->lkb_id)
3273 + * Return 1 if the locks' ranges overlap
3274 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3277 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
3279 + if (!lkb1->lkb_range || !lkb2->lkb_range)
3282 + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3283 + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3290 + * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
3291 + * locks on the convert queue. One of the deadlocked locks is allowed to
3292 + * retain its original granted state (we choose the lkb provided although it
3293 + * shouldn't matter which.) We do not change the granted mode on locks without
3294 + * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
3295 + * the flag consistently) the false return value is used.
3298 +static int conversion_deadlock_resolve(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3300 + struct dlm_lkb *this;
3303 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3307 + if (!ranges_overlap(lkb, this))
3310 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
3312 + if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
3316 + this->lkb_grmode = DLM_LOCK_NL;
3317 + this->lkb_flags |= GDLM_LKFLG_DEMOTED;
3324 + * "A conversion deadlock arises with a pair of lock requests in the converting
3325 + * queue for one resource. The granted mode of each lock blocks the requested
3326 + * mode of the other lock."
3329 +static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3331 + struct dlm_lkb *this;
3333 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3337 + if (!ranges_overlap(lkb, this))
3340 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3347 + * Check if the given lkb conflicts with another lkb on the queue.
3350 +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
3352 + struct dlm_lkb *this;
3354 + list_for_each_entry(this, head, lkb_statequeue) {
3357 + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3364 + * Deadlock can arise when using the QUECVT flag if the requested mode of the
3365 + * first converting lock is incompatible with the granted mode of another
3366 + * converting lock further down the queue. To prevent this deadlock, a
3367 + * requested QUECVT lock is granted immediately if adding it to the end of
3368 + * the queue would prevent a lock ahead of it from being granted.
3371 +static int queuecvt_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3373 + struct dlm_lkb *this;
3375 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3379 + if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
3386 + * Return 1 if the lock can be granted, 0 otherwise.
3387 + * Also detect and resolve conversion deadlocks.
3390 +static int can_be_granted(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3392 + if (test_bit(LSFL_NOCONVGRANT, &rsb->res_ls->ls_flags) &&
3393 + lkb->lkb_grmode == DLM_LOCK_IV &&
3394 + !list_empty(&rsb->res_convertqueue))
3397 + if (lkb->lkb_rqmode == DLM_LOCK_NL)
3400 + if (lkb->lkb_rqmode == lkb->lkb_grmode)
3403 + if (queue_conflict(&rsb->res_grantqueue, lkb))
3406 + if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
3407 + if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3410 + if (list_empty(&rsb->res_convertqueue) ||
3411 + first_in_list(lkb, &rsb->res_convertqueue) ||
3412 + queuecvt_deadlock_detect(rsb, lkb))
3418 + /* there *is* a conflict between this lkb and a converting lock so
3419 + we return false unless conversion deadlock resolution is permitted
3420 + (only conversion requests will have the CONVDEADLK flag set) */
3422 + if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
3425 + if (!conversion_deadlock_detect(rsb, lkb))
3428 + if (conversion_deadlock_resolve(rsb, lkb))
3434 +int dlm_lock(void *lockspace,
3436 + struct dlm_lksb *lksb,
3439 + unsigned int namelen,
3441 + void (*ast) (void *astarg),
3443 + void (*bast) (void *astarg, int mode),
3444 + struct dlm_range *range)
3446 + struct dlm_ls *lspace;
3447 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3448 + int ret = -EINVAL;
3450 + lspace = find_lockspace_by_local_id(lockspace);
3454 + if (mode < 0 || mode > DLM_LOCK_EX)
3457 + if (namelen > DLM_RESNAME_MAXLEN)
3460 + if (flags & DLM_LKF_CANCEL)
3463 + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3466 + if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
3469 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3472 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3475 + if (!ast || !lksb)
3478 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK))
3481 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3485 + * Take conversion path.
3488 + if (flags & DLM_LKF_CONVERT) {
3489 + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3495 + * Take new lock path.
3499 + down_read(&lspace->ls_unlock_sem);
3501 + parent_lkb = find_lock_by_id(lspace, parent);
3503 + if (!parent_lkb ||
3504 + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3505 + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3506 + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3507 + up_read(&lspace->ls_unlock_sem);
3511 + atomic_inc(&parent_lkb->lkb_childcnt);
3512 + up_read(&lspace->ls_unlock_sem);
3515 + down_read(&lspace->ls_in_recovery);
3519 + lkb = create_lkb(lspace);
3522 + lkb->lkb_astaddr = ast;
3523 + lkb->lkb_astparam = (long) astarg;
3524 + lkb->lkb_bastaddr = bast;
3525 + lkb->lkb_rqmode = mode;
3526 + lkb->lkb_grmode = DLM_LOCK_IV;
3527 + lkb->lkb_nodeid = -1;
3528 + lkb->lkb_lksb = lksb;
3529 + lkb->lkb_parent = parent_lkb;
3530 + lkb->lkb_lockqueue_flags = flags;
3531 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
3533 + /* Copy the range if appropriate */
3535 + if (range->ra_start > range->ra_end) {
3540 + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3544 + /* Convert relevant flags to internal numbers */
3545 + if (flags & DLM_LKF_VALBLK)
3546 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3547 + if (flags & DLM_LKF_PERSISTENT)
3548 + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3549 + if (flags & DLM_LKF_NODLCKWT)
3550 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3552 + lksb->sb_lkid = lkb->lkb_id;
3554 + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3558 + up_read(&lspace->ls_in_recovery);
3565 + release_lkb(lspace, lkb);
3570 + atomic_dec(&parent_lkb->lkb_childcnt);
3573 + up_read(&lspace->ls_in_recovery);
3579 +int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, int flags, char *name,
3582 + struct dlm_rsb *rsb, *parent_rsb = NULL;
3583 + struct dlm_lkb *parent_lkb = lkb->lkb_parent;
3588 + parent_rsb = parent_lkb->lkb_resource;
3590 + error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
3593 + lkb->lkb_resource = rsb;
3595 + log_debug(ls, "rq %u %x \"%s\"", lkb->lkb_rqmode, lkb->lkb_id,
3598 + * Next stage, do we need to find the master or can
3599 + * we get on with the real locking work ?
3602 + if (rsb->res_nodeid == -1) {
3603 + if (get_directory_nodeid(rsb) != our_nodeid()) {
3604 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3608 + error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
3609 + rsb->res_length, &nodeid);
3613 + if (nodeid == our_nodeid()) {
3614 + set_bit(RESFL_MASTER, &rsb->res_flags);
3617 + clear_bit(RESFL_MASTER, &rsb->res_flags);
3618 + rsb->res_nodeid = nodeid;
3621 + lkb->lkb_nodeid = rsb->res_nodeid;
3623 + error = dlm_lock_stage2(ls, lkb, rsb, flags);
3633 + * Locking routine called after we have an RSB, either a copy of a remote one
3634 + * or a local one, or perhaps a shiny new one all of our very own
3637 +int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb, int flags)
3641 + DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb););
3643 + if (rsb->res_nodeid) {
3644 + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3645 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
3647 + dlm_lock_stage3(lkb);
3654 + * Called on an RSB's master node to do stage2 locking for a remote lock
3655 + * request. Returns a proper lkb with rsb ready for lock processing.
3656 + * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
3659 +struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls,
3660 + struct dlm_request *freq)
3662 + struct dlm_rsb *rsb = NULL, *parent_rsb = NULL;
3663 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3664 + int error, namelen;
3666 + if (freq->rr_remparid) {
3667 + parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
3671 + atomic_inc(&parent_lkb->lkb_childcnt);
3672 + parent_rsb = parent_lkb->lkb_resource;
3676 + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
3677 + * node actually holding the (non-MSTCPY) lkb. AST address are just
3678 + * flags in the master copy.
3681 + lkb = create_lkb(ls);
3684 + lkb->lkb_grmode = DLM_LOCK_IV;
3685 + lkb->lkb_rqmode = freq->rr_rqmode;
3686 + lkb->lkb_parent = parent_lkb;
3687 + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
3688 + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
3689 + lkb->lkb_nodeid = remote_nodeid;
3690 + lkb->lkb_remid = freq->rr_header.rh_lkid;
3691 + lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
3692 + lkb->lkb_lockqueue_flags = freq->rr_flags;
3694 + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
3695 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3696 + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
3697 + if (!lkb->lkb_lvbptr)
3701 + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
3702 + error = lkb_set_range(ls, lkb, freq->rr_range_start,
3703 + freq->rr_range_end);
3709 + * Get the RSB which this lock is for. Create a new RSB if this is a
3710 + * new lock on a new resource. We must be the master of any new rsb.
3713 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
3715 + error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 0,
3720 + if (!rsb || rsb->res_nodeid == -1) {
3721 + log_debug(ls, "inval rsb to %u", remote_nodeid);
3722 + lkb->lkb_retstatus = -EINVAL;
3726 + lkb->lkb_resource = rsb;
3728 + log_debug(ls, "rq %u from %u %x \"%s\"", lkb->lkb_rqmode, remote_nodeid,
3729 + lkb->lkb_id, rsb->res_name);
3731 + DLM_ASSERT(rsb->res_nodeid == 0,
3733 + print_request(freq);
3734 + printk("nodeid %u\n", remote_nodeid););
3740 + /* release_lkb handles parent */
3741 + release_lkb(ls, lkb);
3742 + parent_lkb = NULL;
3746 + atomic_dec(&parent_lkb->lkb_childcnt);
3752 + * The final bit of lock request processing on the master node. Here the lock
3753 + * is granted and the completion ast is queued, or the lock is put on the
3754 + * waitqueue and blocking asts are sent.
3757 +void dlm_lock_stage3(struct dlm_lkb *lkb)
3759 + struct dlm_rsb *rsb = lkb->lkb_resource;
3762 + * This is a locally mastered lock on a resource that already exists,
3763 + * see if it can be granted or if it must wait. When this function is
3764 + * called for a remote lock request (process_cluster_request,
3765 + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
3766 + * requesting node at the end of process_cluster_request, not at the
3767 + * end of grant_lock.
3770 + down_write(&rsb->res_lock);
3772 + if (can_be_granted(rsb, lkb)) {
3773 + grant_lock(lkb, 0);
3778 + * This request is not a conversion, so the lkb didn't exist other than
3779 + * for this request and should be freed after EAGAIN is returned in the
3783 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
3784 + lkb->lkb_retstatus = -EAGAIN;
3785 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
3786 + send_blocking_asts_all(rsb, lkb);
3787 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
3792 + * The requested lkb must wait. Because the rsb of the requested lkb
3793 + * is mastered here, send blocking asts for the lkb's blocking the
3797 + lkb->lkb_retstatus = 0;
3798 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3800 + send_blocking_asts(rsb, lkb);
3803 + up_write(&rsb->res_lock);
3806 +int dlm_unlock(void *lockspace,
3809 + struct dlm_lksb *lksb,
3812 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
3813 + struct dlm_lkb *lkb;
3814 + struct dlm_rsb *rsb;
3815 + int ret = -EINVAL;
3820 + lkb = find_lock_by_id(ls, lkid);
3824 + /* Can't dequeue a master copy (a remote node's mastered lock) */
3825 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3828 + /* Already waiting for a remote lock operation */
3829 + if (lkb->lkb_lockqueue_state) {
3834 + /* Can only cancel WAITING or CONVERTing locks.
3835 + * This is just a quick check - it is also checked in unlock_stage2()
3836 + * (which may be on the master) under the semaphore.
3838 + if ((flags & DLM_LKF_CANCEL) &&
3839 + (lkb->lkb_status == GDLM_LKSTS_GRANTED))
3842 + /* "Normal" unlocks must operate on a granted lock */
3843 + if (!(flags & DLM_LKF_CANCEL) &&
3844 + (lkb->lkb_status != GDLM_LKSTS_GRANTED))
3847 + down_write(&ls->ls_unlock_sem);
3848 + /* Can't dequeue a lock with sublocks */
3849 + if (atomic_read(&lkb->lkb_childcnt)) {
3850 + up_write(&ls->ls_unlock_sem);
3854 + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
3855 + if (!(flags & DLM_LKF_CANCEL))
3856 + lkb->lkb_flags |= GDLM_LKFLG_DELETED;
3857 + up_write(&ls->ls_unlock_sem);
3859 + down_read(&ls->ls_in_recovery);
3860 + rsb = find_rsb_to_unlock(ls, lkb);
3862 + log_debug(ls, "un %x ref %u flg %x nodeid %d/%d \"%s\"", lkb->lkb_id,
3863 + atomic_read(&rsb->res_ref), rsb->res_flags,
3864 + lkb->lkb_nodeid, rsb->res_nodeid, rsb->res_name);
3866 + /* Save any new params */
3868 + lkb->lkb_lksb = lksb;
3870 + lkb->lkb_astparam = (long) astarg;
3871 + lkb->lkb_lockqueue_flags = flags;
3873 + if (lkb->lkb_nodeid)
3874 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
3876 + ret = dlm_unlock_stage2(lkb, rsb, flags);
3877 + up_read(&ls->ls_in_recovery);
3885 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags)
3887 + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
3890 + down_write(&rsb->res_lock);
3892 + /* Can only cancel WAITING or CONVERTing locks */
3893 + if ((flags & DLM_LKF_CANCEL) &&
3894 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
3895 + lkb->lkb_retstatus = -EINVAL;
3896 + queue_ast(lkb, AST_COMP, 0);
3900 + old_status = lkb_dequeue(lkb);
3903 + * If it was granted, grant any converting or waiting locks.
3906 + if (old_status == GDLM_LKSTS_GRANTED)
3907 + grant_pending_locks(rsb);
3910 + * Cancelling a conversion
3913 + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
3914 + /* VMS semantics say we should send blocking ASTs again here */
3915 + send_blocking_asts(rsb, lkb);
3917 + /* Remove from deadlock detection */
3918 + if (lkb->lkb_duetime)
3919 + remove_from_deadlockqueue(lkb);
3921 + /* Stick it back on the granted queue */
3922 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
3923 + lkb->lkb_rqmode = lkb->lkb_grmode;
3925 + /* Was it blocking any other locks? */
3926 + if (first_in_list(lkb, &rsb->res_convertqueue))
3927 + grant_pending_locks(rsb);
3929 + lkb->lkb_retstatus = -DLM_ECANCEL;
3930 + queue_ast(lkb, AST_COMP, 0);
3935 + * The lvb can be saved or cleared on unlock.
3938 + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
3939 + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
3940 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
3941 + if (flags & DLM_LKF_IVVALBLK)
3942 + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
3945 + lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
3948 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
3951 + * Only free the LKB if we are the master copy. Otherwise the AST
3952 + * delivery routine will free it after delivery.
3956 + up_write(&rsb->res_lock);
3957 + release_lkb(rsb->res_ls, lkb);
3963 + up_write(&rsb->res_lock);
3973 +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
3974 + int flags, void *ast, void *astarg, void *bast,
3975 + struct dlm_range *range)
3977 + struct dlm_lkb *lkb;
3978 + struct dlm_rsb *rsb;
3979 + int ret = -EINVAL;
3981 + lkb = find_lock_by_id(ls, lksb->sb_lkid);
3986 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3991 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
3995 + if ((flags & DLM_LKF_QUECVT) &&
3996 + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4000 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4004 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
4008 + /* Set up the ranges as appropriate */
4010 + if (range->ra_start > range->ra_end)
4013 + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4019 + rsb = lkb->lkb_resource;
4020 + down_read(&ls->ls_in_recovery);
4022 + log_debug(ls, "cv %u %x \"%s\"", mode, lkb->lkb_id, rsb->res_name);
4024 + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4025 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4027 + if (flags & DLM_LKF_NODLCKWT)
4028 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4030 + lkb->lkb_astaddr = ast;
4032 + lkb->lkb_astparam = (long) astarg;
4034 + lkb->lkb_bastaddr = bast;
4035 + lkb->lkb_rqmode = mode;
4036 + lkb->lkb_lockqueue_flags = flags;
4037 + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4038 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
4040 + if (rsb->res_nodeid) {
4041 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4042 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4044 + ret = dlm_convert_stage2(lkb, FALSE);
4047 + up_read(&ls->ls_in_recovery);
4056 + * For local conversion requests on locally mastered locks this is called
4057 + * directly from dlm_lock/convert_lock. This function is also called for
4058 + * remote conversion requests of MSTCPY locks (from process_cluster_request).
4061 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast)
4063 + struct dlm_rsb *rsb = lkb->lkb_resource;
4066 + down_write(&rsb->res_lock);
4068 + if (can_be_granted(rsb, lkb)) {
4069 + grant_lock(lkb, 0);
4070 + grant_pending_locks(rsb);
4075 + * Remove lkb from granted queue.
4081 + * The user won't wait so stick it back on the grant queue
4084 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4085 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4086 + ret = lkb->lkb_retstatus = -EAGAIN;
4088 + queue_ast(lkb, AST_COMP, 0);
4089 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4090 + send_blocking_asts_all(rsb, lkb);
4095 + * The lkb's status tells which queue it's on. Put back on convert
4096 + * queue. (QUECVT requests added at end of the queue, all others in
4100 + lkb->lkb_retstatus = 0;
4101 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4104 + * If the request can't be granted
4107 + send_blocking_asts(rsb, lkb);
4109 + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4110 + add_to_deadlockqueue(lkb);
4113 + up_write(&rsb->res_lock);
4118 + * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4119 + * completion ast. rsb res_lock must be held in write when this is called.
4122 +static void grant_lock(struct dlm_lkb *lkb, int send_remote)
4124 + struct dlm_rsb *rsb = lkb->lkb_resource;
4126 + if (lkb->lkb_duetime)
4127 + remove_from_deadlockqueue(lkb);
4129 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4131 + DLM_ASSERT(lkb->lkb_lvbptr,);
4133 + if (!rsb->res_lvbptr)
4134 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4136 + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4138 + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4140 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4143 + if (lkb->lkb_range) {
4144 + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4145 + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4148 + lkb->lkb_grmode = lkb->lkb_rqmode;
4149 + lkb->lkb_rqmode = DLM_LOCK_IV;
4150 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4152 + lkb->lkb_highbast = 0;
4153 + lkb->lkb_retstatus = 0;
4154 + queue_ast(lkb, AST_COMP, 0);
4157 + * A remote conversion request has been granted, either immediately
4158 + * upon being requested or after waiting a bit. In the former case,
4159 + * reply_and_grant() is called. In the latter case send_remote is 1 and
4160 + * remote_grant() is called.
4162 + * The "send_remote" flag is set only for locks which are granted "out
4163 + * of band" - ie by another lock being converted or unlocked.
4165 + * The second case occurs when this lkb is granted right away as part
4166 + * of processing the initial request. In that case, we send a single
4167 + * message in reply_and_grant which combines the request reply with the
4171 + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4173 + remote_grant(lkb);
4174 + else if (lkb->lkb_request)
4175 + reply_and_grant(lkb);
4180 +static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb)
4182 + struct dlm_lkb *gr;
4184 + list_for_each_entry(gr, head, lkb_statequeue) {
4185 + if (gr->lkb_bastaddr &&
4186 + gr->lkb_highbast < lkb->lkb_rqmode &&
4187 + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4188 + queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4189 + gr->lkb_highbast = lkb->lkb_rqmode;
4195 + * Notify granted locks if they are blocking a newly forced-to-wait lock.
4198 +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4200 + send_bast_queue(&rsb->res_grantqueue, lkb);
4201 + /* check if the following improves performance */
4202 + /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4205 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4207 + send_bast_queue(&rsb->res_grantqueue, lkb);
4208 + send_bast_queue(&rsb->res_convertqueue, lkb);
4212 + * Called when a lock has been dequeued. Look for any locks to grant that are
4213 + * waiting for conversion or waiting to be granted.
4214 + * The rsb res_lock must be held in write when this function is called.
4217 +int grant_pending_locks(struct dlm_rsb *rsb)
4219 + struct dlm_lkb *lkb;
4220 + struct list_head *list;
4221 + struct list_head *temp;
4222 + int8_t high = DLM_LOCK_IV;
4224 + list_for_each_safe(list, temp, &rsb->res_convertqueue) {
4225 + lkb = list_entry(list, struct dlm_lkb, lkb_statequeue);
4227 + if (can_be_granted(rsb, lkb))
4228 + grant_lock(lkb, 1);
4230 + high = MAX(lkb->lkb_rqmode, high);
4233 + list_for_each_safe(list, temp, &rsb->res_waitqueue) {
4234 + lkb = list_entry(list, struct dlm_lkb, lkb_statequeue);
4236 + if (can_be_granted(rsb, lkb))
4237 + grant_lock(lkb, 1);
4239 + high = MAX(lkb->lkb_rqmode, high);
4243 + * If there are locks left on the wait/convert queue then send blocking
4244 + * ASTs to granted locks that are blocking
4246 + * FIXME: This might generate some spurious blocking ASTs for range
4250 + if (high > DLM_LOCK_IV) {
4251 + list_for_each_safe(list, temp, &rsb->res_grantqueue) {
4252 + lkb = list_entry(list, struct dlm_lkb, lkb_statequeue);
4254 + if (lkb->lkb_bastaddr &&
4255 + (lkb->lkb_highbast < high) &&
4256 + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4258 + queue_ast(lkb, AST_BAST, high);
4259 + lkb->lkb_highbast = high;
4268 + * Called to cancel a locking operation that failed due to some internal
4271 + * Waiting locks will be removed, converting locks will be reverted to their
4272 + * granted status, unlocks will be left where they are.
4274 + * A completion AST will be delivered to the caller.
4277 +int cancel_lockop(struct dlm_lkb *lkb, int status)
4279 + int state = lkb->lkb_lockqueue_state;
4280 + uint16_t astflags = AST_COMP;
4282 + lkb->lkb_lockqueue_state = 0;
4285 + case GDLM_LQSTATE_WAIT_RSB:
4286 + astflags |= AST_DEL;
4289 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4290 + res_lkb_dequeue(lkb);
4291 + astflags |= AST_DEL;
4294 + case GDLM_LQSTATE_WAIT_CONVERT:
4295 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4297 + /* Remove from deadlock detection */
4298 + if (lkb->lkb_duetime) {
4299 + remove_from_deadlockqueue(lkb);
4303 + case GDLM_LQSTATE_WAIT_UNLOCK:
4304 + /* We can leave this. I think.... */
4308 + lkb->lkb_retstatus = status;
4309 + queue_ast(lkb, astflags, 0);
4315 + * Check for conversion deadlock. If a deadlock was found
4316 + * return lkb to kill, else return NULL
4319 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb)
4321 + struct dlm_rsb *rsb = lkb->lkb_resource;
4322 + struct list_head *entry;
4324 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4326 + /* Work our way up to the head of the queue looking for locks that
4327 + * conflict with us */
4329 + down_read(&rsb->res_lock);
4331 + entry = lkb->lkb_statequeue.prev;
4332 + while (entry != &rsb->res_convertqueue) {
4333 + struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue);
4335 + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4336 + up_read(&rsb->res_lock);
4339 + entry = entry->prev;
4341 + up_read(&rsb->res_lock);
4347 + * Conversion operation was cancelled by us (not the user).
4348 + * ret contains the return code to pass onto the user
4351 +void cancel_conversion(struct dlm_lkb *lkb, int ret)
4353 + struct dlm_rsb *rsb = lkb->lkb_resource;
4355 + /* Stick it back on the granted queue */
4356 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4357 + lkb->lkb_rqmode = lkb->lkb_grmode;
4359 + remove_from_deadlockqueue(lkb);
4361 + lkb->lkb_retstatus = ret;
4362 + queue_ast(lkb, AST_COMP, 0);
4367 + * As new master of the rsb for this lkb, we need to handle these requests
4368 + * removed from the lockqueue and originating from local processes:
4369 + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4370 + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4373 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state)
4375 + struct dlm_rsb *rsb;
4378 + case GDLM_LQSTATE_WAIT_RSB:
4379 + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4380 + lkb->lkb_lockqueue_flags,
4381 + lkb->lkb_resource->res_name,
4382 + lkb->lkb_resource->res_length);
4385 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4386 + res_lkb_dequeue(lkb);
4387 + dlm_lock_stage3(lkb);
4390 + case GDLM_LQSTATE_WAIT_UNLOCK:
4391 + rsb = find_rsb_to_unlock(ls, lkb);
4392 + dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags);
4395 + case GDLM_LQSTATE_WAIT_CONVERT:
4396 + dlm_convert_stage2(lkb, TRUE);
4404 +static void dump_queue(struct list_head *head)
4406 + struct dlm_lkb *lkb;
4408 + list_for_each_entry(lkb, head, lkb_statequeue) {
4409 + printk("%08x gr %d rq %d flg %x sts %u node %u remid %x "
4418 + lkb->lkb_lockqueue_state,
4419 + lkb->lkb_lockqueue_flags);
4423 +static void dump_rsb(struct dlm_rsb *rsb)
4425 + printk("name \"%s\" flags %lx nodeid %u ref %u\n",
4426 + rsb->res_name, rsb->res_flags, rsb->res_nodeid,
4427 + atomic_read(&rsb->res_ref));
4429 + if (!list_empty(&rsb->res_grantqueue)) {
4430 + printk("grant queue\n");
4431 + dump_queue(&rsb->res_grantqueue);
4434 + if (!list_empty(&rsb->res_convertqueue)) {
4435 + printk("convert queue\n");
4436 + dump_queue(&rsb->res_convertqueue);
4439 + if (!list_empty(&rsb->res_waitqueue)) {
4440 + printk("wait queue\n");
4441 + dump_queue(&rsb->res_waitqueue);
4445 +void dlm_locks_dump(void)
4447 + struct dlm_ls *ls;
4448 + struct dlm_rsb *rsb;
4449 + struct list_head *head;
4452 + list_for_each_entry(ls, &lslist, ls_list) {
4453 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
4454 + head = &ls->ls_rsbtbl[i].list;
4455 + list_for_each_entry(rsb, head, res_hashchain)
4461 diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4462 --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
4463 +++ linux-patched/cluster/dlm/locking.h 2004-07-13 18:57:22.000000000 +0800
4465 +/******************************************************************************
4466 +*******************************************************************************
4468 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4469 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4471 +** This copyrighted material is made available to anyone wishing to use,
4472 +** modify, copy, or redistribute it subject to the terms and conditions
4473 +** of the GNU General Public License v.2.
4475 +*******************************************************************************
4476 +******************************************************************************/
4478 +#ifndef __LOCKING_DOT_H__
4479 +#define __LOCKING_DOT_H__
4481 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state);
4482 +void dlm_lock_stage3(struct dlm_lkb *lkb);
4483 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast);
4484 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4485 +int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, int flags);
4486 +struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen);
4487 +int free_rsb_if_unused(struct dlm_rsb *rsb);
4488 +struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace,
4489 + struct dlm_request *freq);
4490 +int cancel_lockop(struct dlm_lkb *lkb, int status);
4491 +int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags);
4492 +int grant_pending_locks(struct dlm_rsb *rsb);
4493 +void cancel_conversion(struct dlm_lkb *lkb, int ret);
4494 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb);
4496 +#endif /* __LOCKING_DOT_H__ */
4497 diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4498 --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4499 +++ linux-patched/cluster/dlm/lockqueue.c 2004-07-13 18:57:22.000000000 +0800
4501 +/******************************************************************************
4502 +*******************************************************************************
4504 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4505 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4507 +** This copyrighted material is made available to anyone wishing to use,
4508 +** modify, copy, or redistribute it subject to the terms and conditions
4509 +** of the GNU General Public License v.2.
4511 +*******************************************************************************
4512 +******************************************************************************/
4517 + * This controls the lock queue, which is where locks
4518 + * come when they need to wait for a remote operation
4521 + * This could also be thought of as the "high-level" comms
4526 +#include "dlm_internal.h"
4527 +#include "lockqueue.h"
4529 +#include "locking.h"
4531 +#include "lowcomms.h"
4532 +#include "midcomms.h"
4533 +#include "reccomms.h"
4535 +#include "lockspace.h"
4537 +#include "memory.h"
4539 +#include "queries.h"
4542 +static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply);
4543 +static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req);
4546 + * format of an entry on the request queue
4549 + struct list_head rqe_list;
4550 + uint32_t rqe_nodeid;
4551 + char rqe_request[1];
4555 + * Add a new request (if appropriate) to the request queue and send the remote
4556 + * request out. - runs in the context of the locking caller
4558 + * Recovery of a remote_stage request if the remote end fails while the lkb
4559 + * is still on the lockqueue:
4561 + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4562 + * lockqueue_lkb_mark() at the start of recovery.
4564 + * o Some lkb's will be rebuilt on new master rsb's during recovery.
4565 + * (depends on the type of request, see below).
4567 + * o At the end of recovery, resend_cluster_requests() looks at these
4568 + * LQRESEND lkb's and either:
4570 + * i) resends the request to the new master for the rsb where the
4571 + * request is processed as usual. The lkb remains on the lockqueue until
4572 + * the new master replies and we run process_lockqueue_reply().
4574 + * ii) if we've become the rsb master, remove the lkb from the lockqueue
4575 + * and processes the request locally via process_remastered_lkb().
4577 + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4578 + * and the request should be resent if dest node is failed.
4580 + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4581 + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4582 + * makes send_lkb_queue() skip it). Resend this request to the new master.
4584 + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4585 + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4586 + * Resend this request to the new master.
4588 + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4589 + * It will be rebuilt on the new master rsb's granted queue. Resend this
4590 + * request to the new master.
4593 +int remote_stage(struct dlm_lkb *lkb, int state)
4597 + lkb->lkb_lockqueue_state = state;
4598 + add_to_lockqueue(lkb);
4600 + error = send_cluster_request(lkb, state);
4602 + log_print("remote_stage error sending request %d", error);
4604 + /* Leave on lockqueue, it will be resent to correct node during
4608 + lkb->lkb_lockqueue_state = 0;
4609 + remove_from_lockqueue(lkb);
4617 + * Requests received while the lockspace is in recovery get added to the
4618 + * request queue and processed when recovery is complete.
4621 +void add_to_requestqueue(struct dlm_ls *ls, int nodeid, char *request, int length)
4623 + struct rq_entry *entry;
4625 + if (in_nodes_gone(ls, nodeid))
4628 + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
4630 + // TODO something better
4631 + printk("dlm: add_to_requestqueue: out of memory\n");
4635 + log_debug(ls, "add_to_requestqueue %d", nodeid);
4636 + entry->rqe_nodeid = nodeid;
4637 + memcpy(entry->rqe_request, request, length);
4638 + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
4641 +int process_requestqueue(struct dlm_ls *ls)
4643 + int error = 0, count = 0;
4644 + struct rq_entry *entry, *safe;
4645 + struct dlm_header *req;
4647 + log_all(ls, "process held requests");
4649 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4650 + req = (struct dlm_header *) entry->rqe_request;
4651 + log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
4653 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4654 + log_debug(ls, "process_requestqueue aborted");
4659 + error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
4660 + if (error == -EINTR) {
4661 + log_debug(ls, "process_requestqueue interrupted");
4665 + list_del(&entry->rqe_list);
4671 + log_all(ls, "processed %d requests", count);
4675 +void wait_requestqueue(struct dlm_ls *ls)
4677 + while (!list_empty(&ls->ls_requestqueue) &&
4678 + test_bit(LSFL_LS_RUN, &ls->ls_flags))
4683 + * Resdir requests (lookup or remove) and replies from before recovery are
4684 + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
4685 + * gone are also invalid.
4688 +void purge_requestqueue(struct dlm_ls *ls)
4691 + struct rq_entry *entry, *safe;
4692 + struct dlm_header *req;
4693 + struct dlm_request *freq;
4694 + struct dlm_lkb *lkb;
4696 + log_all(ls, "purge requests");
4698 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4699 + req = (struct dlm_header *) entry->rqe_request;
4700 + freq = (struct dlm_request *) req;
4702 + if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
4703 + req->rh_cmd == GDLM_REMCMD_LOOKUP ||
4704 + in_nodes_gone(ls, entry->rqe_nodeid)) {
4706 + list_del(&entry->rqe_list);
4710 + } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
4713 + * Replies to resdir lookups are invalid and must be
4714 + * purged. The lookup requests are marked in
4715 + * lockqueue_lkb_mark and will be resent in
4716 + * resend_cluster_requests. The only way to check if
4717 + * this is a lookup reply is to look at the
4718 + * lockqueue_state of the lkb.
4721 + lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
4723 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
4724 + list_del(&entry->rqe_list);
4731 + log_all(ls, "purged %d requests", count);
4735 + * Check if there's a reply for the given lkid in the requestqueue.
4738 +int reply_in_requestqueue(struct dlm_ls *ls, int lkid)
4741 + struct rq_entry *entry, *safe;
4742 + struct dlm_header *req;
4743 + struct dlm_request *freq;
4745 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4746 + req = (struct dlm_header *) entry->rqe_request;
4747 + freq = (struct dlm_request *) req;
4749 + if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
4750 + freq->rr_header.rh_lkid == lkid) {
4759 +void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src)
4762 + *lvbptr = allocate_lvb(ls);
4764 + memcpy(*lvbptr, src, DLM_LVB_LEN);
4768 + * Process a lockqueue LKB after it has had its remote processing complete and
4769 + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread
4770 + * on the machine that requested the lock.
4773 +static void process_lockqueue_reply(struct dlm_lkb *lkb,
4774 + struct dlm_reply *reply,
4777 + struct dlm_rsb *rsb = lkb->lkb_resource;
4778 + struct dlm_ls *ls = rsb->res_ls;
4779 + int oldstate, state = lkb->lkb_lockqueue_state;
4781 + lkb->lkb_lockqueue_state = 0;
4783 + remove_from_lockqueue(lkb);
4786 + case GDLM_LQSTATE_WAIT_RSB:
4788 + DLM_ASSERT(reply->rl_status == 0,
4791 + print_reply(reply););
4793 + DLM_ASSERT(rsb->res_nodeid == -1 ||
4794 + rsb->res_nodeid == 0,
4797 + print_reply(reply););
4799 + if (reply->rl_nodeid == our_nodeid()) {
4800 + if (rsb->res_nodeid == -1) {
4801 + set_bit(RESFL_MASTER, &rsb->res_flags);
4802 + rsb->res_nodeid = 0;
4804 + log_all(ls, "ignore master reply %x %u",
4805 + lkb->lkb_id, nodeid);
4808 + DLM_ASSERT(rsb->res_nodeid == -1,
4811 + print_reply(reply););
4813 + clear_bit(RESFL_MASTER, &rsb->res_flags);
4814 + rsb->res_nodeid = reply->rl_nodeid;
4817 + log_debug(ls, "lookup reply %x %u", lkb->lkb_id,
4820 + lkb->lkb_nodeid = rsb->res_nodeid;
4821 + dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags);
4824 + case GDLM_LQSTATE_WAIT_CONVERT:
4825 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4828 + * After a remote lock/conversion/grant request we put the lock
4829 + * on the right queue and send an AST if appropriate. Any lock
4830 + * shuffling (eg newly granted locks because this one was
4831 + * converted downwards) will be dealt with in seperate messages
4832 + * (which may be in the same network message)
4836 + /* the destination wasn't the master */
4837 + if (reply->rl_status == -EINVAL) {
4838 + int master_nodeid;
4840 + log_debug(ls, "resend lookup");
4842 + rsb->res_nodeid = -1;
4843 + lkb->lkb_nodeid = -1;
4844 + if (get_directory_nodeid(rsb) != our_nodeid())
4845 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
4847 + dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
4848 + rsb->res_length, &master_nodeid);
4850 + if (master_nodeid == our_nodeid()) {
4851 + set_bit(RESFL_MASTER, &rsb->res_flags);
4852 + master_nodeid = 0;
4855 + clear_bit(RESFL_MASTER,&rsb->res_flags);
4856 + rsb->res_nodeid = master_nodeid;
4857 + lkb->lkb_nodeid = master_nodeid;
4858 + dlm_lock_stage2(ls, lkb, rsb,
4859 + lkb->lkb_lockqueue_flags);
4864 + if (!lkb->lkb_remid)
4865 + lkb->lkb_remid = reply->rl_lkid;
4868 + * The remote request failed (we assume because of NOQUEUE).
4869 + * If this is a new request (non-conv) the lkb was created just
4870 + * for it so the lkb should be freed. If this was a
4871 + * conversion, the lkb already existed so we should put it back
4872 + * on the grant queue.
4875 + if (reply->rl_status != 0) {
4876 + DLM_ASSERT(reply->rl_status == -EAGAIN,);
4878 + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
4879 + res_lkb_dequeue(lkb);
4880 + lkb->lkb_retstatus = reply->rl_status;
4881 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4883 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4884 + lkb->lkb_retstatus = reply->rl_status;
4885 + queue_ast(lkb, AST_COMP, 0);
4891 + * The remote request was successful in granting the request or
4892 + * queuing it to be granted later. Add the lkb to the
4893 + * appropriate rsb queue.
4896 + switch (reply->rl_lockstate) {
4897 + case GDLM_LKSTS_GRANTED:
4899 + /* Compact version of grant_lock(). */
4901 + down_write(&rsb->res_lock);
4902 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
4903 + memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
4906 + lkb->lkb_grmode = lkb->lkb_rqmode;
4907 + lkb->lkb_rqmode = DLM_LOCK_IV;
4908 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4910 + if (lkb->lkb_range) {
4911 + lkb->lkb_range[GR_RANGE_START] =
4912 + lkb->lkb_range[RQ_RANGE_START];
4913 + lkb->lkb_range[GR_RANGE_END] =
4914 + lkb->lkb_range[RQ_RANGE_END];
4916 + up_write(&rsb->res_lock);
4918 + lkb->lkb_retstatus = 0;
4919 + queue_ast(lkb, AST_COMP, 0);
4922 + case GDLM_LKSTS_WAITING:
4924 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4925 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4927 + log_error(ls, "wait reply for granted %x %u",
4928 + lkb->lkb_id, lkb->lkb_nodeid);
4931 + case GDLM_LKSTS_CONVERT:
4933 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4934 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4936 + log_error(ls, "convert reply for granted %x %u",
4937 + lkb->lkb_id, lkb->lkb_nodeid);
4941 + log_error(ls, "process_lockqueue_reply state %d",
4942 + reply->rl_lockstate);
4947 + case GDLM_LQSTATE_WAIT_UNLOCK:
4950 + * Unlocks should never fail. Update local lock info. This
4951 + * always sends completion AST with status in lksb
4954 + DLM_ASSERT(reply->rl_status == 0,);
4955 + oldstate = res_lkb_dequeue(lkb);
4957 + /* Differentiate between unlocks and conversion cancellations */
4958 + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
4959 + oldstate == GDLM_LKSTS_CONVERT) {
4960 + res_lkb_enqueue(lkb->lkb_resource, lkb,
4961 + GDLM_LKSTS_GRANTED);
4962 + lkb->lkb_retstatus = -DLM_ECANCEL;
4963 + queue_ast(lkb, AST_COMP, 0);
4965 + lkb->lkb_retstatus = -DLM_EUNLOCK;
4966 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4971 + log_error(ls, "process_lockqueue_reply id %x state %d",
4972 + lkb->lkb_id, state);
4977 + * Tell a remote node to grant a lock. This happens when we are the master
4978 + * copy for a lock that is actually held on a remote node. The remote end is
4979 + * also responsible for sending the completion AST.
4982 +void remote_grant(struct dlm_lkb *lkb)
4984 + struct writequeue_entry *e;
4985 + struct dlm_request *req;
4987 + // TODO Error handling
4988 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
4989 + sizeof(struct dlm_request),
4990 + lkb->lkb_resource->res_ls->ls_allocation,
4995 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
4996 + req->rr_header.rh_length = sizeof(struct dlm_request);
4997 + req->rr_header.rh_flags = 0;
4998 + req->rr_header.rh_lkid = lkb->lkb_id;
4999 + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
5000 + req->rr_remlkid = lkb->lkb_remid;
5001 + req->rr_flags = 0;
5003 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
5004 + /* This is a confusing non-standard use of rr_flags which is
5005 + * usually used to pass lockqueue_flags. */
5006 + req->rr_flags |= GDLM_LKFLG_DEMOTED;
5009 + add_request_lvb(lkb, req);
5010 + midcomms_send_buffer(&req->rr_header, e);
5013 +void reply_and_grant(struct dlm_lkb *lkb)
5015 + struct dlm_request *req = lkb->lkb_request;
5016 + struct dlm_reply *reply;
5017 + struct writequeue_entry *e;
5019 + // TODO Error handling
5020 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
5021 + sizeof(struct dlm_reply),
5022 + lkb->lkb_resource->res_ls->ls_allocation,
5023 + (char **) &reply);
5027 + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5028 + reply->rl_header.rh_flags = 0;
5029 + reply->rl_header.rh_length = sizeof(struct dlm_reply);
5030 + reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5031 + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5033 + reply->rl_status = lkb->lkb_retstatus;
5034 + reply->rl_lockstate = lkb->lkb_status;
5035 + reply->rl_lkid = lkb->lkb_id;
5037 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
5039 + lkb->lkb_request = NULL;
5041 + add_reply_lvb(lkb, reply);
5042 + midcomms_send_buffer(&reply->rl_header, e);
5046 + * Request removal of a dead entry in the resource directory
5049 +void remote_remove_resdata(struct dlm_ls *ls, int nodeid, char *name,
5052 + struct writequeue_entry *e;
5053 + struct dlm_request *req;
5055 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5056 + struct dlm_rcom *rc = allocate_rcom_buffer(ls);
5058 + memcpy(rc->rc_buf, name, namelen);
5059 + rc->rc_datalen = namelen;
5061 + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5063 + free_rcom_buffer(rc);
5066 + // TODO Error handling
5067 + e = lowcomms_get_buffer(nodeid,
5068 + sizeof(struct dlm_request) + namelen - 1,
5069 + ls->ls_allocation, (char **) &req);
5073 + memset(req, 0, sizeof(struct dlm_request) + namelen - 1);
5074 + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5075 + req->rr_header.rh_length =
5076 + sizeof(struct dlm_request) + namelen - 1;
5077 + req->rr_header.rh_flags = 0;
5078 + req->rr_header.rh_lkid = 0;
5079 + req->rr_header.rh_lockspace = ls->ls_global_id;
5080 + req->rr_remlkid = 0;
5081 + memcpy(req->rr_name, name, namelen);
5083 + midcomms_send_buffer(&req->rr_header, e);
5087 + * Send remote cluster request to directory or master node before the request
5088 + * is put on the lock queue. Runs in the context of the locking caller.
5091 +int send_cluster_request(struct dlm_lkb *lkb, int state)
5093 + uint32_t target_nodeid;
5094 + struct dlm_rsb *rsb = lkb->lkb_resource;
5095 + struct dlm_ls *ls = rsb->res_ls;
5096 + struct dlm_request *req;
5097 + struct writequeue_entry *e;
5099 + if (state == GDLM_LQSTATE_WAIT_RSB)
5100 + target_nodeid = get_directory_nodeid(rsb);
5102 + target_nodeid = lkb->lkb_nodeid;
5104 + /* during recovery it's valid for target_nodeid to equal our own;
5105 + resend_cluster_requests does this to get requests back on track */
5107 + DLM_ASSERT(target_nodeid && target_nodeid != -1,
5110 + printk("target_nodeid %u\n", target_nodeid););
5112 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5113 + /* this may happen when called by resend_cluster_request */
5114 + log_error(ls, "send_cluster_request to %u state %d recovery",
5115 + target_nodeid, state);
5118 + e = lowcomms_get_buffer(target_nodeid,
5119 + sizeof(struct dlm_request) +
5120 + rsb->res_length - 1, ls->ls_allocation,
5124 + memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1);
5126 + /* Common stuff, some are just defaults */
5128 + if (lkb->lkb_bastaddr)
5129 + req->rr_asts = AST_BAST;
5130 + if (lkb->lkb_astaddr)
5131 + req->rr_asts |= AST_COMP;
5132 + if (lkb->lkb_parent)
5133 + req->rr_remparid = lkb->lkb_parent->lkb_remid;
5135 + req->rr_flags = lkb->lkb_lockqueue_flags;
5136 + req->rr_rqmode = lkb->lkb_rqmode;
5137 + req->rr_remlkid = lkb->lkb_remid;
5138 + req->rr_header.rh_length =
5139 + sizeof(struct dlm_request) + rsb->res_length - 1;
5140 + req->rr_header.rh_flags = 0;
5141 + req->rr_header.rh_lkid = lkb->lkb_id;
5142 + req->rr_header.rh_lockspace = ls->ls_global_id;
5146 + case GDLM_LQSTATE_WAIT_RSB:
5148 + DLM_ASSERT(!lkb->lkb_parent,
5152 + DLM_ASSERT(rsb->res_nodeid == -1,
5156 + log_debug(ls, "send lu %x to %u", lkb->lkb_id, target_nodeid);
5158 + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5159 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5162 + case GDLM_LQSTATE_WAIT_CONVERT:
5164 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5168 + log_debug(ls, "send cv %x to %u", lkb->lkb_id, target_nodeid);
5170 + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5171 + if (lkb->lkb_range) {
5172 + req->rr_flags |= GDLM_LKFLG_RANGE;
5173 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5174 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5178 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5180 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5184 + log_debug(ls, "send rq %x to %u", lkb->lkb_id, target_nodeid);
5186 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5187 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5188 + if (lkb->lkb_range) {
5189 + req->rr_flags |= GDLM_LKFLG_RANGE;
5190 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5191 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5195 + case GDLM_LQSTATE_WAIT_UNLOCK:
5197 + log_debug(ls, "send un %x to %u", lkb->lkb_id, target_nodeid);
5199 + if (rsb->res_nodeid != -1)
5200 + log_all(ls, "un %x to %u rsb nodeid %u", lkb->lkb_id,
5201 + target_nodeid, rsb->res_nodeid);
5203 + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5207 + DLM_ASSERT(0, printk("Unknown cluster request\n"););
5210 + add_request_lvb(lkb, req);
5211 + midcomms_send_buffer(&req->rr_header, e);
5217 + * We got a request from another cluster node, process it and return an info
5218 + * structure with the lock state/LVB etc as required. Executes in the DLM's
5222 +int process_cluster_request(int nodeid, struct dlm_header *req, int recovery)
5224 + struct dlm_ls *lspace;
5225 + struct dlm_lkb *lkb = NULL;
5226 + struct dlm_rsb *rsb;
5227 + int send_reply = 0, status = 0, namelen;
5228 + struct dlm_request *freq = (struct dlm_request *) req;
5229 + struct dlm_reply *rp = (struct dlm_reply *) req;
5230 + struct dlm_reply reply;
5232 + lspace = find_lockspace_by_global_id(req->rh_lockspace);
5235 + log_print("process_cluster_request invalid lockspace %x "
5236 + "from %d req %u", req->rh_lockspace, nodeid,
5242 + /* wait for recoverd to drain requestqueue */
5244 + wait_requestqueue(lspace);
5247 + * If we're in recovery then queue the request for later. Otherwise,
5248 + * we still need to get the "in_recovery" lock to make sure the
5249 + * recovery itself doesn't start until we are done.
5252 + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5253 + if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
5254 + log_error(lspace, "process_cluster_request warning %u",
5256 + add_to_requestqueue(lspace, nodeid, (char *) req,
5258 + log_debug(lspace, "process_cluster_request queue %d from %u",
5259 + req->rh_cmd, nodeid);
5263 + if (!down_read_trylock(&lspace->ls_in_recovery)) {
5270 + * Process the request.
5273 + switch (req->rh_cmd) {
5275 + case GDLM_REMCMD_LOOKUP:
5277 + uint32_t dir_nodeid, r_nodeid;
5280 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5282 + dir_nodeid = name_to_directory_nodeid(lspace,
5285 + if (dir_nodeid != our_nodeid())
5286 + log_debug(lspace, "ignoring directory lookup");
5288 + status = dlm_dir_lookup(lspace, nodeid, freq->rr_name,
5289 + namelen, &r_nodeid);
5293 + reply.rl_status = status;
5294 + reply.rl_lockstate = 0;
5295 + reply.rl_nodeid = r_nodeid;
5300 + case GDLM_REMCMD_REM_RESDATA:
5302 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5303 + remove_resdata(lspace, nodeid, freq->rr_name, namelen);
5306 + case GDLM_REMCMD_LOCKREQUEST:
5308 + lkb = remote_stage2(nodeid, lspace, freq);
5310 + lkb->lkb_request = freq;
5311 + if (lkb->lkb_retstatus != -EINVAL)
5312 + dlm_lock_stage3(lkb);
5315 + * If the request was granted in lock_stage3, then a
5316 + * reply message was already sent in combination with
5317 + * the grant message and lkb_request is NULL.
5320 + if (lkb->lkb_request) {
5321 + lkb->lkb_request = NULL;
5323 + reply.rl_status = lkb->lkb_retstatus;
5324 + reply.rl_lockstate = lkb->lkb_status;
5325 + reply.rl_lkid = lkb->lkb_id;
5328 + * If the request could not be granted and the
5329 + * user won't wait, then free up the LKB
5332 + if (lkb->lkb_retstatus == -EAGAIN) {
5333 + rsb = lkb->lkb_resource;
5334 + release_lkb(lspace, lkb);
5338 + else if (lkb->lkb_retstatus == -EINVAL) {
5339 + release_lkb(lspace, lkb);
5344 + reply.rl_status = -ENOMEM;
5349 + case GDLM_REMCMD_CONVREQUEST:
5351 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5354 + print_request(freq);
5355 + printk("nodeid %u\n", nodeid););
5357 + rsb = lkb->lkb_resource;
5361 + print_request(freq);
5362 + printk("nodeid %u\n", nodeid););
5364 + DLM_ASSERT(!rsb->res_nodeid,
5367 + print_request(freq);
5368 + printk("nodeid %u\n", nodeid););
5370 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5373 + print_request(freq);
5374 + printk("nodeid %u\n", nodeid););
5376 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED,
5379 + print_request(freq);
5380 + printk("nodeid %u\n", nodeid););
5382 + lkb->lkb_rqmode = freq->rr_rqmode;
5383 + lkb->lkb_lockqueue_flags = freq->rr_flags;
5384 + lkb->lkb_request = freq;
5385 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5387 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK ||
5388 + freq->rr_flags & DLM_LKF_VALBLK) {
5389 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5390 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5394 + if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5395 + if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5396 + freq->rr_range_end)) {
5397 + reply.rl_status = -ENOMEM;
5403 + log_debug(lspace, "cv %u from %u %x \"%s\"", lkb->lkb_rqmode,
5404 + nodeid, lkb->lkb_id, rsb->res_name);
5406 + dlm_convert_stage2(lkb, FALSE);
5409 + * If the conv request was granted in stage2, then a reply
5410 + * message was already sent in combination with the grant
5414 + if (lkb->lkb_request) {
5415 + lkb->lkb_request = NULL;
5417 + reply.rl_status = lkb->lkb_retstatus;
5418 + reply.rl_lockstate = lkb->lkb_status;
5419 + reply.rl_lkid = lkb->lkb_id;
5423 + case GDLM_REMCMD_LOCKREPLY:
5425 + lkb = find_lock_by_id(lspace, req->rh_lkid);
5429 + printk("nodeid %u\n", nodeid););
5431 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5434 + printk("nodeid %u\n", nodeid););
5436 + process_lockqueue_reply(lkb, rp, nodeid);
5439 + case GDLM_REMCMD_LOCKGRANT:
5442 + * Remote lock has been granted asynchronously. Do a compact
5443 + * version of what grant_lock() does.
5446 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5449 + print_request(freq);
5450 + printk("nodeid %u\n", nodeid););
5452 + rsb = lkb->lkb_resource;
5456 + print_request(freq);
5457 + printk("nodeid %u\n", nodeid););
5459 + DLM_ASSERT(rsb->res_nodeid,
5462 + print_request(freq);
5463 + printk("nodeid %u\n", nodeid););
5465 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5468 + print_request(freq);
5469 + printk("nodeid %u\n", nodeid););
5471 + if (lkb->lkb_lockqueue_state) {
5472 + log_error(rsb->res_ls, "granting lock on lockqueue");
5476 + down_write(&rsb->res_lock);
5478 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5479 + memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
5481 + lkb->lkb_grmode = lkb->lkb_rqmode;
5482 + lkb->lkb_rqmode = DLM_LOCK_IV;
5484 + if (lkb->lkb_range) {
5485 + lkb->lkb_range[GR_RANGE_START] =
5486 + lkb->lkb_range[RQ_RANGE_START];
5487 + lkb->lkb_range[GR_RANGE_END] =
5488 + lkb->lkb_range[RQ_RANGE_END];
5491 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5492 + up_write(&rsb->res_lock);
5494 + if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5495 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5497 + lkb->lkb_retstatus = 0;
5498 + queue_ast(lkb, AST_COMP, 0);
5501 + case GDLM_REMCMD_SENDBAST:
5503 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5506 + print_request(freq);
5507 + printk("nodeid %u\n", nodeid););
5509 + if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5510 + queue_ast(lkb, AST_BAST, freq->rr_rqmode);
5513 + case GDLM_REMCMD_SENDCAST:
5515 + /* This is only used for some error completion ASTs */
5517 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5520 + print_request(freq);
5521 + printk("nodeid %u\n", nodeid););
5523 + /* Return the lock to granted status */
5524 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5525 + lkb->lkb_retstatus = freq->rr_status;
5526 + queue_ast(lkb, AST_COMP, 0);
5529 + case GDLM_REMCMD_UNLOCKREQUEST:
5531 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5534 + print_request(freq);
5535 + printk("nodeid %u\n", nodeid););
5537 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5539 + print_request(freq);
5540 + printk("nodeid %u\n", nodeid););
5542 + rsb = find_rsb_to_unlock(lspace, lkb);
5544 + log_debug(lspace, "un from %u %x \"%s\"", nodeid, lkb->lkb_id,
5547 + reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags);
5551 + case GDLM_REMCMD_QUERY:
5552 + remote_query(nodeid, lspace, req);
5555 + case GDLM_REMCMD_QUERYREPLY:
5556 + remote_query_reply(nodeid, lspace, req);
5560 + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
5563 + up_read(&lspace->ls_in_recovery);
5567 + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5568 + reply.rl_header.rh_flags = 0;
5569 + reply.rl_header.rh_length = sizeof(reply);
5570 + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
5571 + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
5573 + status = midcomms_send_message(nodeid, &reply.rl_header,
5582 +static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply)
5584 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5585 + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5588 +static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req)
5590 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5591 + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5593 diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
5594 --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
5595 +++ linux-patched/cluster/dlm/lockqueue.h 2004-07-13 18:57:22.000000000 +0800
5597 +/******************************************************************************
5598 +*******************************************************************************
5600 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5601 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5603 +** This copyrighted material is made available to anyone wishing to use,
5604 +** modify, copy, or redistribute it subject to the terms and conditions
5605 +** of the GNU General Public License v.2.
5607 +*******************************************************************************
5608 +******************************************************************************/
5610 +#ifndef __LOCKQUEUE_DOT_H__
5611 +#define __LOCKQUEUE_DOT_H__
5613 +void remote_grant(struct dlm_lkb * lkb);
5614 +void reply_and_grant(struct dlm_lkb * lkb);
5615 +int remote_stage(struct dlm_lkb * lkb, int state);
5616 +int process_cluster_request(int csid, struct dlm_header *req, int recovery);
5617 +int send_cluster_request(struct dlm_lkb * lkb, int state);
5618 +void purge_requestqueue(struct dlm_ls * ls);
5619 +int process_requestqueue(struct dlm_ls * ls);
5620 +int reply_in_requestqueue(struct dlm_ls * ls, int lkid);
5621 +void remote_remove_resdata(struct dlm_ls * ls, int nodeid, char *name, int namelen);
5622 +void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src);
5624 +#endif /* __LOCKQUEUE_DOT_H__ */
5625 diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
5626 --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
5627 +++ linux-patched/cluster/dlm/lockspace.c 2004-07-13 18:57:22.000000000 +0800
5629 +/******************************************************************************
5630 +*******************************************************************************
5632 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5633 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5635 +** This copyrighted material is made available to anyone wishing to use,
5636 +** modify, copy, or redistribute it subject to the terms and conditions
5637 +** of the GNU General Public License v.2.
5639 +*******************************************************************************
5640 +******************************************************************************/
5642 +#include <linux/module.h>
5644 +#include "dlm_internal.h"
5645 +#include "recoverd.h"
5650 +#include "lowcomms.h"
5651 +#include "config.h"
5652 +#include "memory.h"
5653 +#include "lockspace.h"
5654 +#include "device.h"
5656 +#define GDST_NONE (0)
5657 +#define GDST_RUNNING (1)
5659 +static int dlmstate;
5660 +static int dlmcount;
5661 +static struct semaphore dlmstate_lock;
5662 +struct list_head lslist;
5663 +spinlock_t lslist_lock;
5664 +struct kcl_service_ops ls_ops;
5666 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
5669 +void dlm_lockspace_init(void)
5671 + dlmstate = GDST_NONE;
5673 + init_MUTEX(&dlmstate_lock);
5674 + INIT_LIST_HEAD(&lslist);
5675 + spin_lock_init(&lslist_lock);
5678 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id)
5680 + struct dlm_ls *ls;
5682 + spin_lock(&lslist_lock);
5684 + list_for_each_entry(ls, &lslist, ls_list) {
5685 + if (ls->ls_global_id == id)
5690 + spin_unlock(&lslist_lock);
5694 +/* TODO: make this more efficient */
5695 +struct dlm_ls *find_lockspace_by_local_id(void *id)
5697 + struct dlm_ls *ls;
5699 + spin_lock(&lslist_lock);
5701 + list_for_each_entry(ls, &lslist, ls_list) {
5702 + if (ls->ls_local_id == (uint32_t)(long)id)
5707 + spin_unlock(&lslist_lock);
5711 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen)
5713 + struct dlm_ls *ls;
5715 + spin_lock(&lslist_lock);
5717 + list_for_each_entry(ls, &lslist, ls_list) {
5718 + if (ls->ls_namelen == namelen &&
5719 + memcmp(ls->ls_name, name, namelen) == 0)
5724 + spin_unlock(&lslist_lock);
5729 + * Called from dlm_init. These are the general threads which are not
5730 + * lockspace-specific and work for all dlm lockspaces.
5733 +static int threads_start(void)
5737 + /* Thread which interacts with cman for all ls's */
5738 + error = dlm_recoverd_start();
5740 + log_print("cannot start recovery thread %d", error);
5744 + /* Thread which processes lock requests for all ls's */
5745 + error = astd_start();
5747 + log_print("cannot start ast thread %d", error);
5748 + goto recoverd_fail;
5751 + /* Thread for sending/receiving messages for all ls's */
5752 + error = lowcomms_start();
5754 + log_print("cannot start lowcomms %d", error);
5764 + dlm_recoverd_stop();
5770 +static void threads_stop(void)
5774 + dlm_recoverd_stop();
5777 +static int init_internal(void)
5781 + if (dlmstate == GDST_RUNNING)
5784 + error = threads_start();
5788 + dlmstate = GDST_RUNNING;
5798 + * Called after dlm module is loaded and before any lockspaces are created.
5799 + * Starts and initializes global threads and structures. These global entities
5800 + * are shared by and independent of all lockspaces.
5802 + * There should be a dlm-specific user command which a person can run which
5803 + * calls this function. If a user hasn't run that command and something
5804 + * creates a new lockspace, this is called first.
5806 + * This also starts the default lockspace.
5813 + down(&dlmstate_lock);
5814 + error = init_internal();
5815 + up(&dlmstate_lock);
5820 +int dlm_release(void)
5824 + down(&dlmstate_lock);
5826 + if (dlmstate == GDST_NONE)
5835 + spin_lock(&lslist_lock);
5836 + if (!list_empty(&lslist)) {
5837 + spin_unlock(&lslist_lock);
5838 + log_print("cannot stop threads, lockspaces still exist");
5841 + spin_unlock(&lslist_lock);
5844 + dlmstate = GDST_NONE;
5847 + up(&dlmstate_lock);
5852 +struct dlm_ls *allocate_ls(int namelen)
5854 + struct dlm_ls *ls;
5856 + /* FIXME: use appropriate malloc type */
5858 + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
5860 + memset(ls, 0, sizeof(struct dlm_ls) + namelen);
5865 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
5867 + struct dlm_ls *ls;
5868 + int i, size, error = -ENOMEM;
5869 + uint32_t local_id = 0;
5871 + if (!try_module_get(THIS_MODULE))
5874 + if (namelen > MAX_SERVICE_NAME_LEN)
5877 + if ((ls = find_lockspace_by_name(name, namelen))) {
5878 + *lockspace = (void *)(long)ls->ls_local_id;
5883 + * Initialize ls fields
5886 + ls = allocate_ls(namelen);
5890 + memcpy(ls->ls_name, name, namelen);
5891 + ls->ls_namelen = namelen;
5893 + ls->ls_allocation = GFP_KERNEL;
5896 + size = dlm_config.rsbtbl_size;
5897 + ls->ls_rsbtbl_size = size;
5899 + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
5900 + if (!ls->ls_rsbtbl)
5902 + for (i = 0; i < size; i++) {
5903 + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
5904 + rwlock_init(&ls->ls_rsbtbl[i].lock);
5907 + size = dlm_config.lkbtbl_size;
5908 + ls->ls_lkbtbl_size = size;
5910 + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
5911 + if (!ls->ls_lkbtbl)
5913 + for (i = 0; i < size; i++) {
5914 + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
5915 + rwlock_init(&ls->ls_lkbtbl[i].lock);
5916 + ls->ls_lkbtbl[i].counter = 1;
5919 + size = dlm_config.dirtbl_size;
5920 + ls->ls_dirtbl_size = size;
5922 + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
5923 + if (!ls->ls_dirtbl)
5925 + for (i = 0; i < size; i++) {
5926 + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
5927 + rwlock_init(&ls->ls_dirtbl[i].lock);
5930 + INIT_LIST_HEAD(&ls->ls_nodes);
5931 + INIT_LIST_HEAD(&ls->ls_nodes_gone);
5932 + ls->ls_num_nodes = 0;
5933 + INIT_LIST_HEAD(&ls->ls_recover);
5934 + spin_lock_init(&ls->ls_recover_lock);
5935 + INIT_LIST_HEAD(&ls->ls_recover_list);
5936 + ls->ls_recover_list_count = 0;
5937 + spin_lock_init(&ls->ls_recover_list_lock);
5938 + init_waitqueue_head(&ls->ls_wait_general);
5939 + INIT_LIST_HEAD(&ls->ls_rootres);
5940 + INIT_LIST_HEAD(&ls->ls_requestqueue);
5941 + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
5942 + ls->ls_last_stop = 0;
5943 + ls->ls_last_start = 0;
5944 + ls->ls_last_finish = 0;
5945 + ls->ls_rcom_msgid = 0;
5946 + init_MUTEX(&ls->ls_rcom_lock);
5947 + init_rwsem(&ls->ls_in_recovery);
5948 + init_rwsem(&ls->ls_unlock_sem);
5949 + init_rwsem(&ls->ls_rec_rsblist);
5950 + init_rwsem(&ls->ls_gap_rsblist);
5951 + down_write(&ls->ls_in_recovery);
5953 + if (flags & DLM_LSF_NOTIMERS)
5954 + set_bit(LSFL_NOTIMERS, &ls->ls_flags);
5955 + if (flags & DLM_LSF_NOCONVGRANT)
5956 + set_bit(LSFL_NOCONVGRANT, &ls->ls_flags);
5959 + * Connect this lockspace with the cluster manager
5962 + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
5963 + &ls_ops, TRUE, (void *) ls, &local_id);
5967 + ls->ls_state = LSST_INIT;
5968 + ls->ls_local_id = local_id;
5970 + spin_lock(&lslist_lock);
5971 + list_add(&ls->ls_list, &lslist);
5972 + spin_unlock(&lslist_lock);
5974 + error = kcl_join_service(local_id);
5976 + log_error(ls, "service manager join error %d", error);
5980 + /* The ls isn't actually running until it receives a start() from CMAN.
5981 + Neither does it have a global ls id until started. */
5983 + /* Return the local ID as the lockspace handle. I've left this
5984 + cast to a void* as it allows us to replace it with pretty much
5985 + anything at a future date without breaking clients. But returning
5986 + the address of the lockspace is a bad idea as it could get
5987 + forcibly removed, leaving client with a dangling pointer */
5988 + *lockspace = (void *)(long)local_id;
5993 + kcl_unregister_service(ls->ls_local_id);
5995 + kfree(ls->ls_dirtbl);
5997 + kfree(ls->ls_lkbtbl);
5999 + kfree(ls->ls_rsbtbl);
6007 + * Called by a system like GFS which wants independent lock spaces.
6010 +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
6012 + int error = -ENOSYS;
6014 + down(&dlmstate_lock);
6015 + error = init_internal();
6019 + error = new_lockspace(name, namelen, lockspace, flags);
6021 + up(&dlmstate_lock);
6025 +/* Return 1 if the lockspace still has active remote locks,
6026 + * 2 if the lockspace still has active local locks.
6028 +static int lockspace_busy(struct dlm_ls *ls)
6030 + int i, lkb_found = 0;
6031 + struct dlm_lkb *lkb;
6033 + /* NOTE: We check the lockidtbl here rather than the resource table.
6034 + This is because there may be LKBs queued as ASTs that have been
6035 + unlinked from their RSBs and are pending deletion once the AST has
6038 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6039 + read_lock(&ls->ls_lkbtbl[i].lock);
6040 + if (!list_empty(&ls->ls_lkbtbl[i].list)) {
6042 + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
6044 + if (!lkb->lkb_nodeid) {
6045 + read_unlock(&ls->ls_lkbtbl[i].lock);
6050 + read_unlock(&ls->ls_lkbtbl[i].lock);
6055 +static int release_lockspace(struct dlm_ls *ls, int force)
6057 + struct dlm_lkb *lkb;
6058 + struct dlm_rsb *rsb;
6059 + struct dlm_recover *rv;
6060 + struct dlm_csb *csb;
6061 + struct list_head *head;
6063 + int busy = lockspace_busy(ls);
6065 + /* Don't destroy a busy lockspace */
6070 + kcl_leave_service(ls->ls_local_id);
6071 + kcl_unregister_service(ls->ls_local_id);
6074 + spin_lock(&lslist_lock);
6075 + list_del(&ls->ls_list);
6076 + spin_unlock(&lslist_lock);
6079 + * Free resdata structs.
6082 + dlm_dir_clear(ls);
6083 + kfree(ls->ls_dirtbl);
6086 + * Free all lkb's on lkbtbl[] lists.
6089 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6090 + head = &ls->ls_lkbtbl[i].list;
6091 + while (!list_empty(head)) {
6092 + lkb = list_entry(head->next, struct dlm_lkb,
6094 + list_del(&lkb->lkb_idtbl_list);
6096 + if (lkb->lkb_lockqueue_state)
6097 + remove_from_lockqueue(lkb);
6099 + if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
6100 + list_del(&lkb->lkb_astqueue);
6102 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
6103 + free_lvb(lkb->lkb_lvbptr);
6109 + kfree(ls->ls_lkbtbl);
6112 + * Free all rsb's on rsbtbl[] lists
6115 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
6116 + head = &ls->ls_rsbtbl[i].list;
6117 + while (!list_empty(head)) {
6118 + rsb = list_entry(head->next, struct dlm_rsb,
6120 + list_del(&rsb->res_hashchain);
6122 + if (rsb->res_lvbptr)
6123 + free_lvb(rsb->res_lvbptr);
6129 + kfree(ls->ls_rsbtbl);
6132 + * Free structures on any other lists
6135 + head = &ls->ls_recover;
6136 + while (!list_empty(head)) {
6137 + rv = list_entry(head->next, struct dlm_recover, list);
6138 + list_del(&rv->list);
6142 + head = &ls->ls_nodes;
6143 + while (!list_empty(head)) {
6144 + csb = list_entry(head->next, struct dlm_csb, list);
6145 + list_del(&csb->list);
6149 + head = &ls->ls_nodes_gone;
6150 + while (!list_empty(head)) {
6151 + csb = list_entry(head->next, struct dlm_csb, list);
6152 + list_del(&csb->list);
6160 + module_put(THIS_MODULE);
6166 + * Called when a system has released all its locks and is not going to use the
6167 + * lockspace any longer. We blindly free everything we're managing for this
6168 + * lockspace. Remaining nodes will go through the recovery process as if we'd
6169 + * died. The lockspace must continue to function as usual, participating in
6170 + * recoveries, until kcl_leave_service returns.
6172 + * Force has 4 possible values:
6173 + * 0 - don't destroy lockspace if it has any LKBs
6174 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6175 + * 2 - destroy lockspace regardless of LKBs
6176 + * 3 - destroy lockspace as part of a forced shutdown
6179 +int dlm_release_lockspace(void *lockspace, int force)
6181 + struct dlm_ls *ls;
6183 + ls = find_lockspace_by_local_id(lockspace);
6187 + return release_lockspace(ls, force);
6191 +/* Called when the cluster is being shut down dirtily */
6192 +void dlm_emergency_shutdown()
6194 + struct dlm_ls *ls;
6195 + struct dlm_ls *tmp;
6197 + /* Shut lowcomms down to prevent any socket activity */
6198 + lowcomms_stop_accept();
6200 + /* Delete the devices that belong to the userland
6201 + lockspaces to be deleted. */
6202 + dlm_device_free_devices();
6204 + /* Now try to clean the lockspaces */
6205 + spin_lock(&lslist_lock);
6207 + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6208 + spin_unlock(&lslist_lock);
6209 + release_lockspace(ls, 3);
6210 + spin_lock(&lslist_lock);
6213 + spin_unlock(&lslist_lock);
6216 +struct dlm_recover *allocate_dlm_recover(void)
6218 + struct dlm_recover *rv;
6220 + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
6222 + memset(rv, 0, sizeof(struct dlm_recover));
6227 + * Called by CMAN on a specific ls. "stop" means set flag which while set
6228 + * causes all new requests to ls to be queued and not submitted until flag is
6229 + * cleared. stop on a ls also needs to cancel any prior starts on the ls.
6230 + * The recoverd thread carries out any work called for by this event.
6233 +static int dlm_ls_stop(void *servicedata)
6235 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6238 + spin_lock(&ls->ls_recover_lock);
6239 + ls->ls_last_stop = ls->ls_last_start;
6240 + set_bit(LSFL_LS_STOP, &ls->ls_flags);
6241 + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6242 + spin_unlock(&ls->ls_recover_lock);
6245 + * This in_recovery lock does two things:
6247 + * 1) Keeps this function from returning until all threads are out
6248 + * of locking routines and locking is truly stopped.
6249 + * 2) Keeps any new requests from being processed until it's unlocked
6250 + * when recovery is complete.
6254 + down_write(&ls->ls_in_recovery);
6256 + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6257 + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6258 + clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6259 + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6261 + dlm_recoverd_kick(ls);
6267 + * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6268 + * request processing which first requires that the recovery procedure be
6269 + * stepped through with all nodes sharing the lockspace (nodeids). The first
6270 + * start on the ls after it's created is a special case and requires some extra
6271 + * work like figuring out our own local nodeid. We can't do all this in the
6272 + * calling CMAN context, so we must pass this work off to the recoverd thread
6273 + * which was created in dlm_init(). The recoverd thread carries out any work
6274 + * called for by this event.
6277 +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6278 + int event_id, int type)
6280 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6281 + struct dlm_recover *rv;
6282 + int error = -ENOMEM;
6284 + rv = allocate_dlm_recover();
6288 + rv->nodeids = nodeids;
6289 + rv->node_count = count;
6290 + rv->event_id = event_id;
6292 + spin_lock(&ls->ls_recover_lock);
6293 + ls->ls_last_start = event_id;
6294 + list_add_tail(&rv->list, &ls->ls_recover);
6295 + set_bit(LSFL_LS_START, &ls->ls_flags);
6296 + spin_unlock(&ls->ls_recover_lock);
6298 + dlm_recoverd_kick(ls);
6306 + * Called by CMAN on a specific ls. "finish" means that all nodes which
6307 + * received a "start" have completed the start and called kcl_start_done.
6308 + * The recoverd thread carries out any work called for by this event.
6311 +static void dlm_ls_finish(void *servicedata, int event_id)
6313 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6315 + spin_lock(&ls->ls_recover_lock);
6316 + ls->ls_last_finish = event_id;
6317 + set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6318 + spin_unlock(&ls->ls_recover_lock);
6320 + dlm_recoverd_kick(ls);
6323 +struct kcl_service_ops ls_ops = {
6324 + .stop = dlm_ls_stop,
6325 + .start = dlm_ls_start,
6326 + .finish = dlm_ls_finish
6328 diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6329 --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
6330 +++ linux-patched/cluster/dlm/lockspace.h 2004-07-13 18:57:22.000000000 +0800
6332 +/******************************************************************************
6333 +*******************************************************************************
6335 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6336 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6338 +** This copyrighted material is made available to anyone wishing to use,
6339 +** modify, copy, or redistribute it subject to the terms and conditions
6340 +** of the GNU General Public License v.2.
6342 +*******************************************************************************
6343 +******************************************************************************/
6345 +#ifndef __LOCKSPACE_DOT_H__
6346 +#define __LOCKSPACE_DOT_H__
6348 +void dlm_lockspace_init(void);
6349 +int dlm_init(void);
6350 +int dlm_release(void);
6351 +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6352 +int dlm_release_lockspace(void *ls, int force);
6353 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id);
6354 +struct dlm_ls *find_lockspace_by_local_id(void *id);
6355 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen);
6356 +void dlm_emergency_shutdown(void);
6358 +#endif /* __LOCKSPACE_DOT_H__ */
6359 diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6360 --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
6361 +++ linux-patched/cluster/dlm/lowcomms.c 2004-07-13 18:57:22.000000000 +0800
6363 +/******************************************************************************
6364 +*******************************************************************************
6366 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6367 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6369 +** This copyrighted material is made available to anyone wishing to use,
6370 +** modify, copy, or redistribute it subject to the terms and conditions
6371 +** of the GNU General Public License v.2.
6373 +*******************************************************************************
6374 +******************************************************************************/
6379 + * This is the "low-level" comms layer.
6381 + * It is responsible for sending/receiving messages
6382 + * from other nodes in the cluster.
6384 + * Cluster nodes are referred to by their nodeids. nodeids are
6385 + * simply 32 bit numbers to the locking module - if they need to
6386 + * be expanded for the cluster infrastructure then that is its
6387 + * responsibility. It is this layer's
6388 + * responsibility to resolve these into IP address or
6389 + * whatever it needs for inter-node communication.
6391 + * The comms level is two kernel threads that deal mainly with
6392 + * the receiving of messages from other nodes and passing them
6393 + * up to the mid-level comms layer (which understands the
6394 + * message format) for execution by the locking core, and
6395 + * a send thread which does all the setting up of connections
6396 + * to remote nodes and the sending of data. Threads are not allowed
6397 + * to send their own data because it may cause them to wait in times
6398 + * of high load. Also, this way, the sending thread can collect together
6399 + * messages bound for one node and send them in one block.
6401 + * I don't see any problem with the recv thread executing the locking
6402 + * code on behalf of remote processes as the locking code is
6403 + * short, efficient and never waits.
6408 +#include <asm/ioctls.h>
6409 +#include <net/sock.h>
6410 +#include <net/tcp.h>
6411 +#include <linux/pagemap.h>
6412 +#include <cluster/cnxman.h>
6414 +#include "dlm_internal.h"
6415 +#include "lowcomms.h"
6416 +#include "midcomms.h"
6417 +#include "config.h"
6425 +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6426 +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6427 +#define CBUF_EMPTY(cb) ((cb)->len == 0)
6428 +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6429 +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6430 + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6431 +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6433 +struct connection {
6434 + struct socket *sock; /* NULL if not connected */
6435 + uint32_t nodeid; /* So we know who we are in the list */
6436 + struct rw_semaphore sock_sem; /* Stop connect races */
6437 + struct list_head read_list; /* On this list when ready for reading */
6438 + struct list_head write_list; /* On this list when ready for writing */
6439 + struct list_head state_list; /* On this list when ready to connect */
6440 + unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6441 +#define CF_READ_PENDING 1
6442 +#define CF_WRITE_PENDING 2
6443 +#define CF_CONNECT_PENDING 3
6444 +#define CF_IS_OTHERSOCK 4
6445 + struct list_head writequeue; /* List of outgoing writequeue_entries */
6446 + struct list_head listenlist; /* List of allocated listening sockets */
6447 + spinlock_t writequeue_lock;
6448 + int (*rx_action) (struct connection *); /* What to do when active */
6449 + struct page *rx_page;
6452 +#define MAX_CONNECT_RETRIES 3
6453 + struct connection *othersock;
6455 +#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6456 +#define nodeid2con(x) (&connections[(x)])
6458 +/* An entry waiting to be sent */
6459 +struct writequeue_entry {
6460 + struct list_head list;
6461 + struct page *page;
6466 + struct connection *con;
6469 +/* "Template" structure for IPv4 and IPv6 used to fill
6470 + * in the missing bits when converting between cman (which knows
6471 + * nothing about sockaddr structs) and real life where we actually
6472 + * have to connect to these addresses. Also one of these structs
6473 + * will hold the cached "us" address.
6475 + * It's an in6 sockaddr just so there's enough space for anything
6476 + * we're likely to see here.
6478 +static struct sockaddr_in6 local_addr;
6480 +/* Manage daemons */
6481 +static struct semaphore thread_lock;
6482 +static struct completion thread_completion;
6483 +static atomic_t send_run;
6484 +static atomic_t recv_run;
6486 +/* An array of connections, indexed by NODEID */
6487 +static struct connection *connections;
6488 +static int conn_array_size;
6489 +static atomic_t writequeue_length;
6490 +static atomic_t accepting;
6492 +static wait_queue_t lowcomms_send_waitq_head;
6493 +static wait_queue_head_t lowcomms_send_waitq;
6495 +static wait_queue_t lowcomms_recv_waitq_head;
6496 +static wait_queue_head_t lowcomms_recv_waitq;
6498 +/* List of sockets that have reads pending */
6499 +static struct list_head read_sockets;
6500 +static spinlock_t read_sockets_lock;
6502 +/* List of sockets which have writes pending */
6503 +static struct list_head write_sockets;
6504 +static spinlock_t write_sockets_lock;
6506 +/* List of sockets which have connects pending */
6507 +static struct list_head state_sockets;
6508 +static spinlock_t state_sockets_lock;
6510 +/* List of allocated listen sockets */
6511 +static struct list_head listen_sockets;
6513 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6514 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6517 +/* Data available on socket or listen socket received a connect */
6518 +static void lowcomms_data_ready(struct sock *sk, int count_unused)
6520 + struct connection *con = sock2con(sk);
6522 + if (test_and_set_bit(CF_READ_PENDING, &con->flags))
6525 + spin_lock_bh(&read_sockets_lock);
6526 + list_add_tail(&con->read_list, &read_sockets);
6527 + spin_unlock_bh(&read_sockets_lock);
6529 + wake_up_interruptible(&lowcomms_recv_waitq);
6532 +static void lowcomms_write_space(struct sock *sk)
6534 + struct connection *con = sock2con(sk);
6536 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
6539 + spin_lock_bh(&write_sockets_lock);
6540 + list_add_tail(&con->write_list, &write_sockets);
6541 + spin_unlock_bh(&write_sockets_lock);
6543 + wake_up_interruptible(&lowcomms_send_waitq);
6546 +static inline void lowcomms_connect_sock(struct connection *con)
6548 + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
6550 + if (!atomic_read(&accepting))
6553 + spin_lock_bh(&state_sockets_lock);
6554 + list_add_tail(&con->state_list, &state_sockets);
6555 + spin_unlock_bh(&state_sockets_lock);
6557 + wake_up_interruptible(&lowcomms_send_waitq);
6560 +static void lowcomms_state_change(struct sock *sk)
6562 +/* struct connection *con = sock2con(sk); */
6564 + switch (sk->sk_state) {
6565 + case TCP_ESTABLISHED:
6566 + lowcomms_write_space(sk);
6569 + case TCP_FIN_WAIT1:
6570 + case TCP_FIN_WAIT2:
6571 + case TCP_TIME_WAIT:
6573 + case TCP_CLOSE_WAIT:
6574 + case TCP_LAST_ACK:
6576 + /* FIXME: I think this causes more trouble than it solves.
6577 + lowcomms will reconnect anyway when there is something to
6578 + send. This just attempts reconnection if a node goes down!
6580 + /* lowcomms_connect_sock(con); */
6584 + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
6589 +/* Make a socket active */
6590 +static int add_sock(struct socket *sock, struct connection *con)
6594 + /* Install a data_ready callback */
6595 + con->sock->sk->sk_data_ready = lowcomms_data_ready;
6596 + con->sock->sk->sk_write_space = lowcomms_write_space;
6597 + con->sock->sk->sk_state_change = lowcomms_state_change;
6602 +/* Add the port number to an IP6 or 4 sockaddr and return the address
6604 +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
6607 + saddr->sin6_family = local_addr.sin6_family;
6608 + if (local_addr.sin6_family == AF_INET) {
6609 + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
6610 + in4_addr->sin_port = cpu_to_be16(port);
6611 + *addr_len = sizeof(struct sockaddr_in);
6614 + saddr->sin6_port = cpu_to_be16(port);
6615 + *addr_len = sizeof(struct sockaddr_in6);
6619 +/* Close a remote connection and tidy up */
6620 +static void close_connection(struct connection *con)
6622 + if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6625 + down_write(&con->sock_sem);
6628 + sock_release(con->sock);
6630 + if (con->othersock) {
6631 + down_write(&con->othersock->sock_sem);
6632 + sock_release(con->othersock->sock);
6633 + con->othersock->sock = NULL;
6634 + up_write(&con->othersock->sock_sem);
6635 + kfree(con->othersock);
6636 + con->othersock = NULL;
6639 + if (con->rx_page) {
6640 + __free_page(con->rx_page);
6641 + con->rx_page = NULL;
6643 + up_write(&con->sock_sem);
6646 +/* Data received from remote end */
6647 +static int receive_from_sock(struct connection *con)
6650 + struct msghdr msg;
6651 + struct iovec iov[2];
6655 + int call_again_soon = 0;
6657 + down_read(&con->sock_sem);
6659 + if (con->sock == NULL)
6661 + if (con->rx_page == NULL) {
6663 + * This doesn't need to be atomic, but I think it should
6664 + * improve performance if it is.
6666 + con->rx_page = alloc_page(GFP_ATOMIC);
6667 + if (con->rx_page == NULL)
6669 + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
6672 + * To avoid doing too many short reads, we will reschedule for another
6673 + * time if there are fewer than 32 bytes left in the buffer.
6675 + if (!CBUF_MAY_ADD(&con->cb, 32))
6678 + msg.msg_control = NULL;
6679 + msg.msg_controllen = 0;
6680 + msg.msg_iovlen = 1;
6681 + msg.msg_iov = iov;
6682 + msg.msg_name = NULL;
6683 + msg.msg_namelen = 0;
6684 + msg.msg_flags = 0;
6687 + * iov[0] is the bit of the circular buffer between the current end
6688 + * point (cb.base + cb.len) and the end of the buffer.
6690 + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
6691 + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
6692 + iov[1].iov_len = 0;
6695 + * iov[1] is the bit of the circular buffer between the start of the
6696 + * buffer and the start of the currently used section (cb.base)
6698 + if (CBUF_DATA(&con->cb) >= con->cb.base) {
6699 + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
6700 + iov[1].iov_len = con->cb.base;
6701 + iov[1].iov_base = page_address(con->rx_page);
6702 + msg.msg_iovlen = 2;
6704 + len = iov[0].iov_len + iov[1].iov_len;
6708 + r = ret = sock_recvmsg(con->sock, &msg, len,
6709 + MSG_DONTWAIT | MSG_NOSIGNAL);
6715 + call_again_soon = 1;
6716 + CBUF_ADD(&con->cb, ret);
6717 + ret = midcomms_process_incoming_buffer(con->nodeid,
6718 + page_address(con->rx_page),
6719 + con->cb.base, con->cb.len,
6721 + if (ret == -EBADMSG) {
6722 + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
6723 + "iov_len=%u, iov_base[0]=%p, read=%d\n",
6724 + page_address(con->rx_page), con->cb.base, con->cb.len,
6725 + len, iov[0].iov_base, r);
6729 + CBUF_EAT(&con->cb, ret);
6731 + if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
6732 + __free_page(con->rx_page);
6733 + con->rx_page = NULL;
6736 + if (call_again_soon)
6738 + up_read(&con->sock_sem);
6743 + lowcomms_data_ready(con->sock->sk, 0);
6744 + up_read(&con->sock_sem);
6749 + up_read(&con->sock_sem);
6750 + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6751 + close_connection(con);
6752 + lowcomms_connect_sock(con);
6759 +/* Listening socket is busy, accept a connection */
6760 +static int accept_from_sock(struct connection *con)
6763 + struct sockaddr_in6 peeraddr;
6764 + struct socket *newsock;
6767 + struct connection *newcon;
6769 + memset(&peeraddr, 0, sizeof(peeraddr));
6770 + newsock = sock_alloc();
6774 + down_read(&con->sock_sem);
6776 + result = -ENOTCONN;
6777 + if (con->sock == NULL)
6780 + newsock->type = con->sock->type;
6781 + newsock->ops = con->sock->ops;
6783 + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
6787 + /* Get the connected socket's peer */
6788 + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
6790 + result = -ECONNABORTED;
6794 + /* Get the new node's NODEID */
6795 + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
6796 + if (nodeid == 0) {
6797 + printk("dlm: connect from non cluster node\n");
6798 + sock_release(newsock);
6799 + up_read(&con->sock_sem);
6803 + log_print("got connection from %d", nodeid);
6805 + /* Check to see if we already have a connection to this node. This
6806 + * could happen if the two nodes initiate a connection at roughly
6807 + * the same time and the connections cross on the wire.
6809 + * In this case we store the incoming one in "othersock"
6811 + newcon = nodeid2con(nodeid);
6812 + down_write(&newcon->sock_sem);
6813 + if (newcon->sock) {
6814 + struct connection *othercon;
6816 + othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
6818 + printk("dlm: failed to allocate incoming socket\n");
6819 + sock_release(newsock);
6820 + up_write(&newcon->sock_sem);
6821 + up_read(&con->sock_sem);
6824 + memset(othercon, 0, sizeof(*othercon));
6825 + newcon->othersock = othercon;
6826 + othercon->nodeid = nodeid;
6827 + othercon->sock = newsock;
6828 + othercon->rx_action = receive_from_sock;
6829 + add_sock(newsock, othercon);
6830 + init_rwsem(&othercon->sock_sem);
6831 + set_bit(CF_IS_OTHERSOCK, &othercon->flags);
6832 + newsock->sk->sk_user_data = othercon;
6834 + up_write(&newcon->sock_sem);
6835 + lowcomms_data_ready(newsock->sk, 0);
6836 + up_read(&con->sock_sem);
6840 + newsock->sk->sk_user_data = newcon;
6841 + newcon->rx_action = receive_from_sock;
6842 + add_sock(newsock, newcon);
6843 + up_write(&newcon->sock_sem);
6846 + * Add it to the active queue in case we got data
6847 + * between processing the accept and adding the socket
6848 + * to the read_sockets list
6850 + lowcomms_data_ready(newsock->sk, 0);
6852 + up_read(&con->sock_sem);
6858 + up_read(&con->sock_sem);
6859 + sock_release(newsock);
6861 + printk("dlm: error accepting connection from node: %d\n", result);
6865 +/* Connect a new socket to its peer */
6866 +static int connect_to_sock(struct connection *con)
6868 + int result = -EHOSTUNREACH;
6869 + struct sockaddr_in6 saddr;
6871 + struct socket *sock;
6873 + if (con->nodeid == 0) {
6874 + log_print("attempt to connect sock 0 foiled");
6878 + down_write(&con->sock_sem);
6879 + if (con->retries++ > MAX_CONNECT_RETRIES)
6882 + // FIXME not sure this should happen, let alone like this.
6884 + sock_release(con->sock);
6888 + /* Create a socket to communicate with */
6889 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6893 + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
6896 + sock->sk->sk_user_data = con;
6897 + con->rx_action = receive_from_sock;
6899 + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
6901 + add_sock(sock, con);
6903 + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
6905 + if (result == -EINPROGRESS)
6911 + up_write(&con->sock_sem);
6913 + * Returning an error here means we've given up trying to connect to
6914 + * a remote node, otherwise we return 0 and reschedule the connection
6921 + sock_release(con->sock);
6925 + * Some errors are fatal and this list might need adjusting. For other
6926 + * errors we try again until the max number of retries is reached.
6928 + if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
6929 + result != -ENETDOWN && result != -EINVAL
6930 + && result != -EPROTONOSUPPORT) {
6931 + lowcomms_connect_sock(con);
6937 +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
6939 + struct socket *sock = NULL;
6943 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
6945 + /* Create a socket to communicate with */
6946 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6948 + printk("dlm: Can't create listening comms socket\n");
6954 + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
6957 + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
6959 + sock->sk->sk_user_data = con;
6960 + con->rx_action = accept_from_sock;
6963 + /* Bind to our port */
6964 + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
6965 + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
6967 + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
6968 + sock_release(sock);
6976 + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
6979 + printk("dlm: Set keepalive failed: %d\n", result);
6982 + result = sock->ops->listen(sock, 5);
6984 + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
6985 + sock_release(sock);
6995 +/* Listen on all interfaces */
6996 +static int listen_for_all(void)
7000 + struct socket *sock = NULL;
7001 + struct list_head *addr_list;
7002 + struct connection *con = nodeid2con(0);
7003 + struct cluster_node_addr *node_addr;
7004 + char local_addr[sizeof(struct sockaddr_in6)];
7006 + /* This will also fill in local_addr */
7007 + nodeid = lowcomms_our_nodeid();
7009 + addr_list = kcl_get_node_addresses(nodeid);
7011 + printk("dlm: cannot initialise comms layer\n");
7012 + result = -ENOTCONN;
7016 + list_for_each_entry(node_addr, addr_list, list) {
7019 + con = kmalloc(sizeof(struct connection), GFP_KERNEL);
7021 + printk("dlm: failed to allocate listen socket\n");
7024 + memset(con, 0, sizeof(*con));
7025 + init_rwsem(&con->sock_sem);
7026 + spin_lock_init(&con->writequeue_lock);
7027 + INIT_LIST_HEAD(&con->writequeue);
7028 + set_bit(CF_IS_OTHERSOCK, &con->flags);
7031 + memcpy(local_addr, node_addr->addr, node_addr->addr_len);
7032 + sock = create_listen_sock(con, local_addr,
7033 + node_addr->addr_len);
7035 + add_sock(sock, con);
7041 + /* Keep a list of dynamically allocated listening sockets
7042 + so we can free them at shutdown */
7043 + if (test_bit(CF_IS_OTHERSOCK, &con->flags)) {
7044 + list_add_tail(&con->listenlist, &listen_sockets);
7055 +static struct writequeue_entry *new_writequeue_entry(struct connection *con,
7058 + struct writequeue_entry *entry;
7060 + entry = kmalloc(sizeof(struct writequeue_entry), allocation);
7064 + entry->page = alloc_page(allocation);
7065 + if (!entry->page) {
7070 + entry->offset = 0;
7079 +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7080 + int allocation, char **ppc)
7082 + struct connection *con = nodeid2con(nodeid);
7083 + struct writequeue_entry *e;
7087 + if (!atomic_read(&accepting))
7090 + spin_lock(&con->writequeue_lock);
7091 + e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
7092 + if (((struct list_head *) e == &con->writequeue) ||
7093 + (PAGE_CACHE_SIZE - e->end < len)) {
7098 + users = e->users++;
7100 + spin_unlock(&con->writequeue_lock);
7106 + *ppc = page_address(e->page) + offset;
7110 + e = new_writequeue_entry(con, allocation);
7112 + spin_lock(&con->writequeue_lock);
7115 + users = e->users++;
7116 + list_add_tail(&e->list, &con->writequeue);
7117 + spin_unlock(&con->writequeue_lock);
7118 + atomic_inc(&writequeue_length);
7124 +void lowcomms_commit_buffer(struct writequeue_entry *e)
7126 + struct connection *con = e->con;
7129 + if (!atomic_read(&accepting))
7132 + spin_lock(&con->writequeue_lock);
7133 + users = --e->users;
7136 + e->len = e->end - e->offset;
7138 + spin_unlock(&con->writequeue_lock);
7140 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7141 + spin_lock_bh(&write_sockets_lock);
7142 + list_add_tail(&con->write_list, &write_sockets);
7143 + spin_unlock_bh(&write_sockets_lock);
7145 + wake_up_interruptible(&lowcomms_send_waitq);
7150 + spin_unlock(&con->writequeue_lock);
7154 +static void free_entry(struct writequeue_entry *e)
7156 + __free_page(e->page);
7158 + atomic_dec(&writequeue_length);
7161 +/* Send a message */
7162 +static int send_to_sock(struct connection *con)
7165 + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7166 + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7167 + struct writequeue_entry *e;
7170 + down_read(&con->sock_sem);
7171 + if (con->sock == NULL)
7174 + sendpage = con->sock->ops->sendpage;
7176 + spin_lock(&con->writequeue_lock);
7178 + e = list_entry(con->writequeue.next, struct writequeue_entry,
7180 + if ((struct list_head *) e == &con->writequeue)
7184 + offset = e->offset;
7185 + BUG_ON(len == 0 && e->users == 0);
7186 + spin_unlock(&con->writequeue_lock);
7190 + ret = sendpage(con->sock, e->page, offset, len,
7192 + if (ret == -EAGAIN || ret == 0)
7198 + spin_lock(&con->writequeue_lock);
7202 + if (e->len == 0 && e->users == 0) {
7203 + list_del(&e->list);
7208 + spin_unlock(&con->writequeue_lock);
7210 + up_read(&con->sock_sem);
7214 + up_read(&con->sock_sem);
7215 + close_connection(con);
7216 + lowcomms_connect_sock(con);
7220 + up_read(&con->sock_sem);
7221 + lowcomms_connect_sock(con);
7225 +/* Called from recoverd when it knows that a node has
7226 + left the cluster */
7227 +int lowcomms_close(int nodeid)
7229 + struct connection *con;
7234 + con = nodeid2con(nodeid);
7236 + close_connection(con);
7244 +/* API send message call, may queue the request */
7245 +/* N.B. This is the old interface - use the new one for new calls */
7246 +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7248 + struct writequeue_entry *e;
7251 + DLM_ASSERT(nodeid < dlm_config.max_connections,
7252 + printk("nodeid=%u\n", nodeid););
7254 + e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7256 + memcpy(b, buf, len);
7257 + lowcomms_commit_buffer(e);
7263 +/* Look for activity on active sockets */
7264 +static void process_sockets(void)
7266 + struct list_head *list;
7267 + struct list_head *temp;
7269 + spin_lock_bh(&read_sockets_lock);
7270 + list_for_each_safe(list, temp, &read_sockets) {
7271 + struct connection *con =
7272 + list_entry(list, struct connection, read_list);
7273 + list_del(&con->read_list);
7274 + clear_bit(CF_READ_PENDING, &con->flags);
7276 + spin_unlock_bh(&read_sockets_lock);
7278 + con->rx_action(con);
7280 + /* Don't starve out everyone else */
7282 + spin_lock_bh(&read_sockets_lock);
7284 + spin_unlock_bh(&read_sockets_lock);
7287 +/* Try to send any messages that are pending
7289 +static void process_output_queue(void)
7291 + struct list_head *list;
7292 + struct list_head *temp;
7295 + spin_lock_bh(&write_sockets_lock);
7296 + list_for_each_safe(list, temp, &write_sockets) {
7297 + struct connection *con =
7298 + list_entry(list, struct connection, write_list);
7299 + list_del(&con->write_list);
7300 + clear_bit(CF_WRITE_PENDING, &con->flags);
7302 + spin_unlock_bh(&write_sockets_lock);
7304 + ret = send_to_sock(con);
7307 + spin_lock_bh(&write_sockets_lock);
7309 + spin_unlock_bh(&write_sockets_lock);
7312 +static void process_state_queue(void)
7314 + struct list_head *list;
7315 + struct list_head *temp;
7318 + spin_lock_bh(&state_sockets_lock);
7319 + list_for_each_safe(list, temp, &state_sockets) {
7320 + struct connection *con =
7321 + list_entry(list, struct connection, state_list);
7322 + list_del(&con->state_list);
7323 + clear_bit(CF_CONNECT_PENDING, &con->flags);
7324 + spin_unlock_bh(&state_sockets_lock);
7326 + ret = connect_to_sock(con);
7329 + spin_lock_bh(&state_sockets_lock);
7331 + spin_unlock_bh(&state_sockets_lock);
7334 +/* Discard all entries on the write queues */
7335 +static void clean_writequeues(void)
7337 + struct list_head *list;
7338 + struct list_head *temp;
7341 + for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
7342 + struct connection *con = nodeid2con(nodeid);
7344 + spin_lock(&con->writequeue_lock);
7345 + list_for_each_safe(list, temp, &con->writequeue) {
7346 + struct writequeue_entry *e =
7347 + list_entry(list, struct writequeue_entry, list);
7348 + list_del(&e->list);
7351 + spin_unlock(&con->writequeue_lock);
7355 +static int read_list_empty(void)
7359 + spin_lock_bh(&read_sockets_lock);
7360 + status = list_empty(&read_sockets);
7361 + spin_unlock_bh(&read_sockets_lock);
7366 +/* DLM Transport comms receive daemon */
7367 +static int dlm_recvd(void *data)
7369 + daemonize("dlm_recvd");
7370 + atomic_set(&recv_run, 1);
7372 + init_waitqueue_head(&lowcomms_recv_waitq);
7373 + init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7374 + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7376 + complete(&thread_completion);
7378 + while (atomic_read(&recv_run)) {
7380 + set_task_state(current, TASK_INTERRUPTIBLE);
7382 + if (read_list_empty())
7385 + set_task_state(current, TASK_RUNNING);
7387 + process_sockets();
7390 + down(&thread_lock);
7393 + complete(&thread_completion);
7398 +static int write_and_state_lists_empty(void)
7402 + spin_lock_bh(&write_sockets_lock);
7403 + status = list_empty(&write_sockets);
7404 + spin_unlock_bh(&write_sockets_lock);
7406 + spin_lock_bh(&state_sockets_lock);
7407 + if (list_empty(&state_sockets) == 0)
7409 + spin_unlock_bh(&state_sockets_lock);
7414 +/* DLM Transport send daemon */
7415 +static int dlm_sendd(void *data)
7417 + daemonize("dlm_sendd");
7418 + atomic_set(&send_run, 1);
7420 + init_waitqueue_head(&lowcomms_send_waitq);
7421 + init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7422 + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7424 + complete(&thread_completion);
7426 + while (atomic_read(&send_run)) {
7428 + set_task_state(current, TASK_INTERRUPTIBLE);
7430 + if (write_and_state_lists_empty())
7433 + set_task_state(current, TASK_RUNNING);
7435 + process_state_queue();
7436 + process_output_queue();
7439 + down(&thread_lock);
7442 + complete(&thread_completion);
7447 +static void daemons_stop(void)
7449 + if (atomic_read(&recv_run)) {
7450 + down(&thread_lock);
7451 + atomic_set(&recv_run, 0);
7452 + wake_up_interruptible(&lowcomms_recv_waitq);
7454 + wait_for_completion(&thread_completion);
7457 + if (atomic_read(&send_run)) {
7458 + down(&thread_lock);
7459 + atomic_set(&send_run, 0);
7460 + wake_up_interruptible(&lowcomms_send_waitq);
7462 + wait_for_completion(&thread_completion);
7466 +static int daemons_start(void)
7470 + error = kernel_thread(dlm_recvd, NULL, 0);
7472 + log_print("can't start recvd thread: %d", error);
7475 + wait_for_completion(&thread_completion);
7477 + error = kernel_thread(dlm_sendd, NULL, 0);
7479 + log_print("can't start sendd thread: %d", error);
7483 + wait_for_completion(&thread_completion);
7491 + * Return the largest buffer size we can cope with.
7493 +int lowcomms_max_buffer_size(void)
7495 + return PAGE_CACHE_SIZE;
7498 +void lowcomms_stop(void)
7501 + struct connection *temp;
7502 + struct connection *lcon;
7504 + atomic_set(&accepting, 0);
7506 + /* Set all the activity flags to prevent any
7509 + for (i = 0; i < conn_array_size; i++) {
7510 + connections[i].flags = 0x7;
7513 + clean_writequeues();
7515 + for (i = 0; i < conn_array_size; i++) {
7516 + close_connection(nodeid2con(i));
7519 + kfree(connections);
7520 + connections = NULL;
7522 + /* Free up any dynamically allocated listening sockets */
7523 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
7524 + sock_release(lcon->sock);
7528 + kcl_releaseref_cluster();
7531 +/* This is quite likely to sleep... */
7532 +int lowcomms_start(void)
7537 + INIT_LIST_HEAD(&read_sockets);
7538 + INIT_LIST_HEAD(&write_sockets);
7539 + INIT_LIST_HEAD(&state_sockets);
7540 + INIT_LIST_HEAD(&listen_sockets);
7542 + spin_lock_init(&read_sockets_lock);
7543 + spin_lock_init(&write_sockets_lock);
7544 + spin_lock_init(&state_sockets_lock);
7546 + init_completion(&thread_completion);
7547 + init_MUTEX(&thread_lock);
7548 + atomic_set(&send_run, 0);
7549 + atomic_set(&recv_run, 0);
7551 + error = -ENOTCONN;
7552 + if (kcl_addref_cluster())
7556 + * Temporarily initialise the waitq head so that lowcomms_send_message
7557 + * doesn't crash if it gets called before the thread is fully
7560 + init_waitqueue_head(&lowcomms_send_waitq);
7564 + connections = kmalloc(sizeof(struct connection) *
7565 + dlm_config.max_connections, GFP_KERNEL);
7569 + memset(connections, 0,
7570 + sizeof(struct connection) * dlm_config.max_connections);
7571 + for (i = 0; i < dlm_config.max_connections; i++) {
7572 + connections[i].nodeid = i;
7573 + init_rwsem(&connections[i].sock_sem);
7574 + INIT_LIST_HEAD(&connections[i].writequeue);
7575 + spin_lock_init(&connections[i].writequeue_lock);
7577 + conn_array_size = dlm_config.max_connections;
7579 + /* Start listening */
7580 + error = listen_for_all();
7582 + goto fail_free_conn;
7584 + error = daemons_start();
7586 + goto fail_free_conn;
7588 + atomic_set(&accepting, 1);
7593 + kfree(connections);
7599 +/* Don't accept any more outgoing work */
7600 +void lowcomms_stop_accept()
7602 + atomic_set(&accepting, 0);
7605 +/* Cluster Manager interface functions for looking up
7606 + nodeids and IP addresses by each other
7609 +/* Return the IP address of a node given its NODEID */
7610 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
7612 + struct list_head *addrs;
7613 + struct cluster_node_addr *node_addr;
7614 + struct cluster_node_addr *current_addr = NULL;
7615 + struct sockaddr_in6 *saddr;
7619 + addrs = kcl_get_node_addresses(nodeid);
7623 + interface = kcl_get_current_interface();
7625 + /* Look for address number <interface> */
7626 + i=0; /* i/f numbers start at 1 */
7627 + list_for_each_entry(node_addr, addrs, list) {
7628 + if (interface == ++i) {
7629 + current_addr = node_addr;
7634 + /* If that failed then just use the first one */
7635 + if (!current_addr)
7636 + current_addr = (struct cluster_node_addr *)addrs->next;
7638 + saddr = (struct sockaddr_in6 *)current_addr->addr;
7640 + /* Extract the IP address */
7641 + if (saddr->sin6_family == AF_INET) {
7642 + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
7643 + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
7644 + ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
7647 + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
7648 + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
7654 +/* Return the NODEID for a node given its sockaddr */
7655 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
7657 + struct kcl_cluster_node node;
7658 + struct sockaddr_in6 ipv6_addr;
7659 + struct sockaddr_in ipv4_addr;
7661 + if (addr->sa_family == AF_INET) {
7662 + struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
7663 + memcpy(&ipv4_addr, &local_addr, addr_len);
7664 + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
7666 + addr = (struct sockaddr *)&ipv4_addr;
7669 + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
7670 + memcpy(&ipv6_addr, &local_addr, addr_len);
7671 + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
7673 + addr = (struct sockaddr *)&ipv6_addr;
7676 + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
7677 + return node.node_id;
7682 +int lowcomms_our_nodeid(void)
7684 + struct kcl_cluster_node node;
7685 + struct list_head *addrs;
7686 + struct cluster_node_addr *first_addr;
7687 + static int our_nodeid = 0;
7690 + return our_nodeid;
7692 + if (kcl_get_node_by_nodeid(0, &node) == -1)
7695 + our_nodeid = node.node_id;
7697 + /* Fill in the "template" structure */
7698 + addrs = kcl_get_node_addresses(our_nodeid);
7702 + first_addr = (struct cluster_node_addr *) addrs->next;
7703 + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
7705 + return node.node_id;
7708 + * Overrides for Emacs so that we follow Linus's tabbing style.
7709 + * Emacs will notice this stuff at the end of the file and automatically
7710 + * adjust the settings for this buffer only. This must remain at the end
7712 + * ---------------------------------------------------------------------------
7713 + * Local variables:
7714 + * c-file-style: "linux"
7717 diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
7718 --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
7719 +++ linux-patched/cluster/dlm/lowcomms.h 2004-07-13 18:57:22.000000000 +0800
7721 +/******************************************************************************
7722 +*******************************************************************************
7724 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7725 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7727 +** This copyrighted material is made available to anyone wishing to use,
7728 +** modify, copy, or redistribute it subject to the terms and conditions
7729 +** of the GNU General Public License v.2.
7731 +*******************************************************************************
7732 +******************************************************************************/
7734 +#ifndef __LOWCOMMS_DOT_H__
7735 +#define __LOWCOMMS_DOT_H__
7737 +/* The old interface */
7738 +int lowcomms_send_message(int csid, char *buf, int len, int allocation);
7740 +/* The new interface */
7741 +struct writequeue_entry;
7742 +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7743 + int allocation, char **ppc);
7744 +extern void lowcomms_commit_buffer(struct writequeue_entry *e);
7746 +int lowcomms_start(void);
7747 +void lowcomms_stop(void);
7748 +void lowcomms_stop_accept(void);
7749 +int lowcomms_close(int nodeid);
7750 +int lowcomms_max_buffer_size(void);
7752 +int lowcomms_our_nodeid(void);
7754 +#endif /* __LOWCOMMS_DOT_H__ */
7755 diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
7756 --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
7757 +++ linux-patched/cluster/dlm/main.c 2004-07-13 18:57:22.000000000 +0800
7759 +/******************************************************************************
7760 +*******************************************************************************
7762 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7763 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7765 +** This copyrighted material is made available to anyone wishing to use,
7766 +** modify, copy, or redistribute it subject to the terms and conditions
7767 +** of the GNU General Public License v.2.
7769 +*******************************************************************************
7770 +******************************************************************************/
7772 +#define EXPORT_SYMTAB
7774 +#include <linux/init.h>
7775 +#include <linux/proc_fs.h>
7776 +#include <linux/ctype.h>
7777 +#include <linux/module.h>
7778 +#include <net/sock.h>
7780 +#include <cluster/cnxman.h>
7782 +#include "dlm_internal.h"
7783 +#include "lockspace.h"
7784 +#include "recoverd.h"
7788 +#include "locking.h"
7789 +#include "config.h"
7790 +#include "memory.h"
7791 +#include "recover.h"
7792 +#include "lowcomms.h"
7794 +int dlm_device_init(void);
7795 +void dlm_device_exit(void);
7796 +void dlm_proc_init(void);
7797 +void dlm_proc_exit(void);
7800 +/* Cluster manager callbacks, we want to know if a node dies
7801 + N.B. this is independent of lockspace-specific event callbacks from SM */
7803 +static void cman_callback(kcl_callback_reason reason, long arg)
7805 + if (reason == DIED) {
7806 + lowcomms_close((int) arg);
7809 + /* This is unconditional, so do what we can to tidy up */
7810 + if (reason == LEAVING) {
7811 + dlm_emergency_shutdown();
7815 +int __init init_dlm(void)
7818 + dlm_lockspace_init();
7819 + dlm_recoverd_init();
7821 + dlm_device_init();
7822 + dlm_memory_init();
7823 + dlm_config_init();
7825 + kcl_add_callback(cman_callback);
7827 + printk("DLM %s (built %s %s) installed\n",
7828 + DLM_RELEASE_NAME, __DATE__, __TIME__);
7833 +void __exit exit_dlm(void)
7835 + kcl_remove_callback(cman_callback);
7837 + dlm_device_exit();
7838 + dlm_memory_exit();
7839 + dlm_config_exit();
7843 +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
7844 +MODULE_AUTHOR("Red Hat, Inc.");
7845 +MODULE_LICENSE("GPL");
7847 +module_init(init_dlm);
7848 +module_exit(exit_dlm);
7850 +EXPORT_SYMBOL(dlm_init);
7851 +EXPORT_SYMBOL(dlm_release);
7852 +EXPORT_SYMBOL(dlm_new_lockspace);
7853 +EXPORT_SYMBOL(dlm_release_lockspace);
7854 +EXPORT_SYMBOL(dlm_lock);
7855 +EXPORT_SYMBOL(dlm_unlock);
7856 +EXPORT_SYMBOL(dlm_debug_dump);
7857 diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
7858 --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
7859 +++ linux-patched/cluster/dlm/memory.c 2004-07-13 18:57:22.000000000 +0800
7861 +/******************************************************************************
7862 +*******************************************************************************
7864 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7865 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7867 +** This copyrighted material is made available to anyone wishing to use,
7868 +** modify, copy, or redistribute it subject to the terms and conditions
7869 +** of the GNU General Public License v.2.
7871 +*******************************************************************************
7872 +******************************************************************************/
7876 + * memory allocation routines
7880 +#include "dlm_internal.h"
7881 +#include "memory.h"
7882 +#include "config.h"
7884 +/* as the man says...Shouldn't this be in a header file somewhere? */
7885 +#define BYTES_PER_WORD sizeof(void *)
7887 +static kmem_cache_t *rsb_cache_small;
7888 +static kmem_cache_t *rsb_cache_large;
7889 +static kmem_cache_t *lkb_cache;
7890 +static kmem_cache_t *lvb_cache;
7891 +static kmem_cache_t *resdir_cache_large;
7892 +static kmem_cache_t *resdir_cache_small;
7894 +/* The thresholds above which we allocate large RSBs/resdatas rather than small
7895 + * ones. This must make the resultant structure end on a word boundary */
7896 +#define LARGE_RSB_NAME 28
7897 +#define LARGE_RES_NAME 28
7899 +int dlm_memory_init()
7901 + int ret = -ENOMEM;
7905 + kmem_cache_create("dlm_rsb(small)",
7906 + (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7907 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
7908 + if (!rsb_cache_small)
7912 + kmem_cache_create("dlm_rsb(large)",
7913 + sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN,
7914 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
7915 + if (!rsb_cache_large)
7916 + goto out_free_rsbs;
7918 + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
7919 + __alignof__(struct dlm_lkb), 0, NULL, NULL);
7921 + goto out_free_rsbl;
7923 + resdir_cache_large =
7924 + kmem_cache_create("dlm_resdir(l)",
7925 + sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN,
7926 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
7927 + if (!resdir_cache_large)
7928 + goto out_free_lkb;
7930 + resdir_cache_small =
7931 + kmem_cache_create("dlm_resdir(s)",
7932 + (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7933 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
7934 + if (!resdir_cache_small)
7935 + goto out_free_resl;
7937 + /* LVB cache also holds ranges, so should be 64bit aligned */
7938 + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
7939 + __alignof__(uint64_t), 0, NULL, NULL);
7941 + goto out_free_ress;
7947 + kmem_cache_destroy(resdir_cache_small);
7950 + kmem_cache_destroy(resdir_cache_large);
7953 + kmem_cache_destroy(lkb_cache);
7956 + kmem_cache_destroy(rsb_cache_large);
7959 + kmem_cache_destroy(rsb_cache_small);
7965 +void dlm_memory_exit()
7967 + kmem_cache_destroy(rsb_cache_large);
7968 + kmem_cache_destroy(rsb_cache_small);
7969 + kmem_cache_destroy(lkb_cache);
7970 + kmem_cache_destroy(resdir_cache_small);
7971 + kmem_cache_destroy(resdir_cache_large);
7972 + kmem_cache_destroy(lvb_cache);
7975 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
7977 + struct dlm_rsb *r;
7979 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7981 + if (namelen >= LARGE_RSB_NAME)
7982 + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
7984 + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
7987 + memset(r, 0, sizeof(struct dlm_rsb) + namelen);
7992 +void free_rsb(struct dlm_rsb *r)
7994 + int length = r->res_length;
7997 + memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length);
8000 + if (length >= LARGE_RSB_NAME)
8001 + kmem_cache_free(rsb_cache_large, r);
8003 + kmem_cache_free(rsb_cache_small, r);
8006 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
8008 + struct dlm_lkb *l;
8010 + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
8012 + memset(l, 0, sizeof(struct dlm_lkb));
8017 +void free_lkb(struct dlm_lkb *l)
8020 + memset(l, 0xAA, sizeof(struct dlm_lkb));
8022 + kmem_cache_free(lkb_cache, l);
8025 +struct dlm_direntry *allocate_resdata(struct dlm_ls *ls, int namelen)
8027 + struct dlm_direntry *rd;
8029 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
8031 + if (namelen >= LARGE_RES_NAME)
8032 + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
8034 + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
8037 + memset(rd, 0, sizeof(struct dlm_direntry));
8042 +void free_resdata(struct dlm_direntry *de)
8044 + if (de->length >= LARGE_RES_NAME)
8045 + kmem_cache_free(resdir_cache_large, de);
8047 + kmem_cache_free(resdir_cache_small, de);
8050 +char *allocate_lvb(struct dlm_ls *ls)
8054 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8056 + memset(l, 0, DLM_LVB_LEN);
8061 +void free_lvb(char *l)
8063 + kmem_cache_free(lvb_cache, l);
8066 +/* Ranges are allocated from the LVB cache as they are the same size (4x64
8068 +uint64_t *allocate_range(struct dlm_ls * ls)
8072 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8074 + memset(l, 0, DLM_LVB_LEN);
8079 +void free_range(uint64_t *l)
8081 + kmem_cache_free(lvb_cache, l);
8084 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls)
8086 + struct dlm_rcom *rc;
8088 + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
8090 + memset(rc, 0, dlm_config.buffer_size);
8095 +void free_rcom_buffer(struct dlm_rcom *rc)
8099 diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
8100 --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
8101 +++ linux-patched/cluster/dlm/memory.h 2004-07-13 18:57:22.000000000 +0800
8103 +/******************************************************************************
8104 +*******************************************************************************
8106 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8107 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8109 +** This copyrighted material is made available to anyone wishing to use,
8110 +** modify, copy, or redistribute it subject to the terms and conditions
8111 +** of the GNU General Public License v.2.
8113 +*******************************************************************************
8114 +******************************************************************************/
8116 +#ifndef __MEMORY_DOT_H__
8117 +#define __MEMORY_DOT_H__
8119 +int dlm_memory_init(void);
8120 +void dlm_memory_exit(void);
8121 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
8122 +void free_rsb(struct dlm_rsb *r);
8123 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
8124 +void free_lkb(struct dlm_lkb *l);
8125 +struct dlm_direntry *allocate_resdata(struct dlm_ls *ls, int namelen);
8126 +void free_resdata(struct dlm_direntry *de);
8127 +char *allocate_lvb(struct dlm_ls *ls);
8128 +void free_lvb(char *l);
8129 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls);
8130 +void free_rcom_buffer(struct dlm_rcom *rc);
8131 +uint64_t *allocate_range(struct dlm_ls *ls);
8132 +void free_range(uint64_t *l);
8134 +#endif /* __MEMORY_DOT_H__ */
8135 diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8136 --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
8137 +++ linux-patched/cluster/dlm/midcomms.c 2004-07-13 18:57:22.000000000 +0800
8139 +/******************************************************************************
8140 +*******************************************************************************
8142 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8143 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8145 +** This copyrighted material is made available to anyone wishing to use,
8146 +** modify, copy, or redistribute it subject to the terms and conditions
8147 +** of the GNU General Public License v.2.
8149 +*******************************************************************************
8150 +******************************************************************************/
8155 + * This is the appallingly named "mid-level" comms layer.
8157 + * Its purpose is to take packets from the "real" comms layer,
8158 + * split them up into messages and pass them to the interested
8159 + * part of the locking mechanism.
8161 + * It also takes messages from the locking layer, formats them
8162 + * into packets and sends them to the comms layer.
8164 + * It knows the format of the mid-level messages used and nodeids
8165 + * but it does not know how to resolve a nodeid into an IP address
8166 + * or any of the comms channel details
8170 +#include "dlm_internal.h"
8171 +#include "lowcomms.h"
8172 +#include "midcomms.h"
8173 +#include "lockqueue.h"
8175 +#include "reccomms.h"
8176 +#include "config.h"
8178 +/* Byteorder routines */
8180 +static void host_to_network(void *msg)
8182 + struct dlm_header *head = msg;
8183 + struct dlm_request *req = msg;
8184 + struct dlm_reply *rep = msg;
8185 + struct dlm_query_request *qreq = msg;
8186 + struct dlm_query_reply *qrep= msg;
8187 + struct dlm_rcom *rc = msg;
8189 + /* Force into network byte order */
8192 + * Do the common header first
8195 + head->rh_length = cpu_to_le16(head->rh_length);
8196 + head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8197 + /* Leave the lkid alone as it is transparent at the remote end */
8200 + * Do the fields in the remlockrequest or remlockreply structs
8203 + switch (req->rr_header.rh_cmd) {
8205 + case GDLM_REMCMD_LOCKREQUEST:
8206 + case GDLM_REMCMD_CONVREQUEST:
8207 + req->rr_range_start = cpu_to_le64(req->rr_range_start);
8208 + req->rr_range_end = cpu_to_le64(req->rr_range_end);
8209 + /* Deliberate fall through */
8210 + case GDLM_REMCMD_UNLOCKREQUEST:
8211 + case GDLM_REMCMD_LOOKUP:
8212 + case GDLM_REMCMD_LOCKGRANT:
8213 + case GDLM_REMCMD_SENDBAST:
8214 + case GDLM_REMCMD_SENDCAST:
8215 + case GDLM_REMCMD_REM_RESDATA:
8216 + req->rr_flags = cpu_to_le32(req->rr_flags);
8217 + req->rr_status = cpu_to_le32(req->rr_status);
8220 + case GDLM_REMCMD_LOCKREPLY:
8221 + rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate);
8222 + rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid);
8223 + rep->rl_status = cpu_to_le32(rep->rl_status);
8226 + case GDLM_REMCMD_RECOVERMESSAGE:
8227 + case GDLM_REMCMD_RECOVERREPLY:
8228 + rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8229 + rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8232 + case GDLM_REMCMD_QUERY:
8233 + qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid);
8234 + qreq->rq_query = cpu_to_le32(qreq->rq_query);
8235 + qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks);
8238 + case GDLM_REMCMD_QUERYREPLY:
8239 + qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks);
8240 + qrep->rq_status = cpu_to_le32(qrep->rq_status);
8241 + qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount);
8242 + qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount);
8243 + qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount);
8247 + printk("dlm: warning, unknown REMCMD type %u\n",
8248 + req->rr_header.rh_cmd);
8252 +static void network_to_host(void *msg)
8254 + struct dlm_header *head = msg;
8255 + struct dlm_request *req = msg;
8256 + struct dlm_reply *rep = msg;
8257 + struct dlm_query_request *qreq = msg;
8258 + struct dlm_query_reply *qrep = msg;
8259 + struct dlm_rcom *rc = msg;
8261 + /* Force into host byte order */
8264 + * Do the common header first
8267 + head->rh_length = le16_to_cpu(head->rh_length);
8268 + head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8269 + /* Leave the lkid alone as it is transparent at the remote end */
8272 + * Do the fields in the remlockrequest or remlockreply structs
8275 + switch (req->rr_header.rh_cmd) {
8277 + case GDLM_REMCMD_LOCKREQUEST:
8278 + case GDLM_REMCMD_CONVREQUEST:
8279 + req->rr_range_start = le64_to_cpu(req->rr_range_start);
8280 + req->rr_range_end = le64_to_cpu(req->rr_range_end);
8281 + case GDLM_REMCMD_LOOKUP:
8282 + case GDLM_REMCMD_UNLOCKREQUEST:
8283 + case GDLM_REMCMD_LOCKGRANT:
8284 + case GDLM_REMCMD_SENDBAST:
8285 + case GDLM_REMCMD_SENDCAST:
8286 + case GDLM_REMCMD_REM_RESDATA:
8287 + /* Actually, not much to do here as the remote lock IDs are
8288 + * transparent too */
8289 + req->rr_flags = le32_to_cpu(req->rr_flags);
8290 + req->rr_status = le32_to_cpu(req->rr_status);
8293 + case GDLM_REMCMD_LOCKREPLY:
8294 + rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate);
8295 + rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid);
8296 + rep->rl_status = le32_to_cpu(rep->rl_status);
8299 + case GDLM_REMCMD_RECOVERMESSAGE:
8300 + case GDLM_REMCMD_RECOVERREPLY:
8301 + rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8302 + rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8306 + case GDLM_REMCMD_QUERY:
8307 + qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid);
8308 + qreq->rq_query = le32_to_cpu(qreq->rq_query);
8309 + qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks);
8312 + case GDLM_REMCMD_QUERYREPLY:
8313 + qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks);
8314 + qrep->rq_status = le32_to_cpu(qrep->rq_status);
8315 + qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount);
8316 + qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount);
8317 + qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount);
8321 + printk("dlm: warning, unknown REMCMD type %u\n",
8322 + req->rr_header.rh_cmd);
8326 +static void copy_from_cb(void *dst, const void *base, unsigned offset,
8327 + unsigned len, unsigned limit)
8329 + unsigned copy = len;
8331 + if ((copy + offset) > limit)
8332 + copy = limit - offset;
8333 + memcpy(dst, base + offset, copy);
8336 + memcpy(dst + copy, base, len);
8339 +static void khexdump(const unsigned char *c, int len)
8341 + while (len > 16) {
8343 + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8344 + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8345 + c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8349 + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8354 + printk(KERN_INFO "%02x\n", c[0]);
8360 + * Called from the low-level comms layer to process a buffer of
8363 + * Only complete messages are processed here, any "spare" bytes from
8364 + * the end of a buffer are saved and tacked onto the front of the next
8365 + * message that comes in. I doubt this will happen very often but we
8366 + * need to be able to cope with it and I don't want the task to be waiting
8367 + * for packets to come in when there is useful work to be done.
8370 +int midcomms_process_incoming_buffer(int nodeid, const void *base,
8371 + unsigned offset, unsigned len,
8374 + unsigned char __tmp[sizeof(struct dlm_header) + 64];
8375 + struct dlm_header *msg = (struct dlm_header *) __tmp;
8381 + while (len > sizeof(struct dlm_header)) {
8382 + /* Get message header and check it over */
8383 + copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
8385 + msglen = le16_to_cpu(msg->rh_length);
8386 + id = msg->rh_lkid;
8387 + space = msg->rh_lockspace;
8389 + /* Check message size */
8391 + if (msglen < sizeof(struct dlm_header))
8394 + if (msglen > dlm_config.buffer_size) {
8395 + printk("dlm: message size too big %d\n", msglen);
8400 + /* Not enough in buffer yet? wait for some more */
8404 + /* Make sure our temp buffer is large enough */
8405 + if (msglen > sizeof(__tmp) &&
8406 + msg == (struct dlm_header *) __tmp) {
8407 + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8412 + copy_from_cb(msg, base, offset, msglen, limit);
8413 + BUG_ON(id != msg->rh_lkid);
8414 + BUG_ON(space != msg->rh_lockspace);
8417 + offset &= (limit - 1);
8419 + network_to_host(msg);
8421 + if ((msg->rh_cmd > 32) ||
8422 + (msg->rh_cmd == 0) ||
8423 + (msg->rh_length < sizeof(struct dlm_header)) ||
8424 + (msg->rh_length > dlm_config.buffer_size)) {
8426 + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8427 + "lkid=%u, lockspace=%u\n",
8428 + msg->rh_cmd, msg->rh_flags, msg->rh_length,
8429 + msg->rh_lkid, msg->rh_lockspace);
8431 + printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8432 + "ret=%u, limit=%08x newbuf=%d\n",
8433 + base, offset, len, ret, limit,
8434 + ((struct dlm_header *) __tmp == msg));
8436 + khexdump((const unsigned char *) msg, msg->rh_length);
8441 + switch (msg->rh_cmd) {
8442 + case GDLM_REMCMD_RECOVERMESSAGE:
8443 + case GDLM_REMCMD_RECOVERREPLY:
8444 + process_recovery_comm(nodeid, msg);
8447 + process_cluster_request(nodeid, msg, FALSE);
8451 + if (msg != (struct dlm_header *) __tmp)
8454 + return err ? err : ret;
8458 + * Send a lowcomms buffer
8461 +void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e)
8463 + host_to_network(msg);
8464 + lowcomms_commit_buffer(e);
8468 + * Make the message into network byte order and send it
8471 +int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg,
8474 + int len = msg->rh_length;
8476 + host_to_network(msg);
8479 + * Loopback. In fact, the locking code pretty much prevents this from
8480 + * being needed but it can happen when the directory node is also the
8484 + if (nodeid == our_nodeid())
8485 + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
8488 + return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
8490 diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
8491 --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
8492 +++ linux-patched/cluster/dlm/midcomms.h 2004-07-13 18:57:22.000000000 +0800
8494 +/******************************************************************************
8495 +*******************************************************************************
8497 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8498 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8500 +** This copyrighted material is made available to anyone wishing to use,
8501 +** modify, copy, or redistribute it subject to the terms and conditions
8502 +** of the GNU General Public License v.2.
8504 +*******************************************************************************
8505 +******************************************************************************/
8507 +#ifndef __MIDCOMMS_DOT_H__
8508 +#define __MIDCOMMS_DOT_H__
8510 +int midcomms_send_message(uint32_t csid, struct dlm_header *msg,
8512 +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
8513 + unsigned len, unsigned limit);
8514 +void midcomms_send_buffer(struct dlm_header *msg,
8515 + struct writequeue_entry *e);
8517 +#endif /* __MIDCOMMS_DOT_H__ */
8518 diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
8519 --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
8520 +++ linux-patched/cluster/dlm/nodes.c 2004-07-13 18:57:22.000000000 +0800
8522 +/******************************************************************************
8523 +*******************************************************************************
8525 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8526 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8528 +** This copyrighted material is made available to anyone wishing to use,
8529 +** modify, copy, or redistribute it subject to the terms and conditions
8530 +** of the GNU General Public License v.2.
8532 +*******************************************************************************
8533 +******************************************************************************/
8535 +#include <net/sock.h>
8536 +#include <cluster/cnxman.h>
8538 +#include "dlm_internal.h"
8539 +#include "lowcomms.h"
8541 +#include "recover.h"
8542 +#include "reccomms.h"
8545 +static struct list_head cluster_nodes;
8546 +static spinlock_t node_lock;
8547 +static uint32_t local_nodeid;
8548 +static struct semaphore local_init_lock;
8551 +void dlm_nodes_init(void)
8553 + INIT_LIST_HEAD(&cluster_nodes);
8554 + spin_lock_init(&node_lock);
8556 + init_MUTEX(&local_init_lock);
8559 +static struct dlm_node *search_node(uint32_t nodeid)
8561 + struct dlm_node *node;
8563 + list_for_each_entry(node, &cluster_nodes, list) {
8564 + if (node->nodeid == nodeid)
8572 +static void put_node(struct dlm_node *node)
8574 + spin_lock(&node_lock);
8576 + if (node->refcount == 0) {
8577 + list_del(&node->list);
8578 + spin_unlock(&node_lock);
8582 + spin_unlock(&node_lock);
8585 +static int get_node(uint32_t nodeid, struct dlm_node **ndp)
8587 + struct dlm_node *node, *node2;
8588 + int error = -ENOMEM;
8590 + spin_lock(&node_lock);
8591 + node = search_node(nodeid);
8594 + spin_unlock(&node_lock);
8599 + node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL);
8603 + memset(node, 0, sizeof(struct dlm_node));
8604 + node->nodeid = nodeid;
8606 + spin_lock(&node_lock);
8607 + node2 = search_node(nodeid);
8609 + node2->refcount++;
8610 + spin_unlock(&node_lock);
8616 + node->refcount = 1;
8617 + list_add_tail(&node->list, &cluster_nodes);
8618 + spin_unlock(&node_lock);
8628 +int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb)
8630 + struct dlm_csb *csb;
8631 + struct dlm_node *node;
8632 + int error = -ENOMEM;
8634 + csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL);
8638 + memset(csb, 0, sizeof(struct dlm_csb));
8640 + error = get_node(nodeid, &node);
8646 + down(&local_init_lock);
8648 + if (!local_nodeid) {
8649 + if (nodeid == our_nodeid()) {
8650 + local_nodeid = node->nodeid;
8653 + up(&local_init_lock);
8664 +void release_csb(struct dlm_csb *csb)
8666 + put_node(csb->node);
8670 +uint32_t our_nodeid(void)
8672 + return lowcomms_our_nodeid();
8675 +int nodes_reconfig_wait(struct dlm_ls *ls)
8679 + if (ls->ls_low_nodeid == our_nodeid()) {
8680 + error = dlm_wait_status_all(ls, NODES_VALID);
8682 + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
8684 + /* Experimental: this delay should allow any final messages
8685 + * from the previous node to be received before beginning
8688 + if (ls->ls_num_nodes == 1) {
8689 + current->state = TASK_UNINTERRUPTIBLE;
8690 + schedule_timeout((2) * HZ);
8694 + error = dlm_wait_status_low(ls, NODES_ALL_VALID);
8699 +static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new)
8701 + struct dlm_csb *csb = NULL;
8702 + struct list_head *tmp;
8703 + struct list_head *newlist = &new->list;
8704 + struct list_head *head = &ls->ls_nodes;
8706 + list_for_each(tmp, head) {
8707 + csb = list_entry(tmp, struct dlm_csb, list);
8709 + if (new->node->nodeid < csb->node->nodeid)
8714 + list_add_tail(newlist, head);
8716 + /* FIXME: can use list macro here */
8717 + newlist->prev = tmp->prev;
8718 + newlist->next = tmp;
8719 + tmp->prev->next = newlist;
8720 + tmp->prev = newlist;
8724 +int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
8726 + struct dlm_csb *csb, *safe;
8727 + int error, i, found, pos = 0, neg = 0;
8728 + uint32_t low = (uint32_t) (-1);
8731 + * Remove (and save) departed nodes from lockspace's nodes list
8734 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) {
8736 + for (i = 0; i < rv->node_count; i++) {
8737 + if (csb->node->nodeid == rv->nodeids[i]) {
8745 + csb->gone_event = rv->event_id;
8746 + list_del(&csb->list);
8747 + list_add_tail(&csb->list, &ls->ls_nodes_gone);
8748 + ls->ls_num_nodes--;
8749 + log_all(ls, "remove node %u", csb->node->nodeid);
8754 + * Add new nodes to lockspace's nodes list
8757 + for (i = 0; i < rv->node_count; i++) {
8759 + list_for_each_entry(csb, &ls->ls_nodes, list) {
8760 + if (csb->node->nodeid == rv->nodeids[i]) {
8769 + error = init_new_csb(rv->nodeids[i], &csb);
8770 + DLM_ASSERT(!error,);
8772 + add_ordered_node(ls, csb);
8773 + ls->ls_num_nodes++;
8774 + log_all(ls, "add node %u", csb->node->nodeid);
8778 + list_for_each_entry(csb, &ls->ls_nodes, list) {
8779 + if (csb->node->nodeid < low)
8780 + low = csb->node->nodeid;
8783 + rcom_log_clear(ls);
8784 + ls->ls_low_nodeid = low;
8785 + ls->ls_nodes_mask = dlm_next_power2(ls->ls_num_nodes) - 1;
8786 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8789 + error = nodes_reconfig_wait(ls);
8791 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
8796 +int ls_nodes_init(struct dlm_ls *ls, struct dlm_recover *rv)
8798 + struct dlm_csb *csb;
8800 + uint32_t low = (uint32_t) (-1);
8802 + log_all(ls, "add nodes");
8804 + for (i = 0; i < rv->node_count; i++) {
8805 + error = init_new_csb(rv->nodeids[i], &csb);
8809 + add_ordered_node(ls, csb);
8810 + ls->ls_num_nodes++;
8812 + if (csb->node->nodeid < low)
8813 + low = csb->node->nodeid;
8816 + ls->ls_low_nodeid = low;
8817 + ls->ls_nodes_mask = dlm_next_power2(ls->ls_num_nodes) - 1;
8818 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8820 + error = nodes_reconfig_wait(ls);
8822 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
8827 + while (!list_empty(&ls->ls_nodes)) {
8828 + csb = list_entry(ls->ls_nodes.next, struct dlm_csb, list);
8829 + list_del(&csb->list);
8832 + ls->ls_num_nodes = 0;
8837 +int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid)
8839 + struct dlm_csb *csb;
8841 + list_for_each_entry(csb, &ls->ls_nodes_gone, list) {
8842 + if (csb->node->nodeid == nodeid)
8847 diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
8848 --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
8849 +++ linux-patched/cluster/dlm/nodes.h 2004-07-13 18:57:22.000000000 +0800
8851 +/******************************************************************************
8852 +*******************************************************************************
8854 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8855 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8857 +** This copyrighted material is made available to anyone wishing to use,
8858 +** modify, copy, or redistribute it subject to the terms and conditions
8859 +** of the GNU General Public License v.2.
8861 +*******************************************************************************
8862 +******************************************************************************/
8864 +#ifndef __NODES_DOT_H__
8865 +#define __NODES_DOT_H__
8867 +void dlm_nodes_init(void);
8868 +int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb);
8869 +void release_csb(struct dlm_csb * csb);
8870 +uint32_t our_nodeid(void);
8871 +int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg);
8872 +int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr);
8873 +int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid);
8875 +#endif /* __NODES_DOT_H__ */
8876 diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
8877 --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
8878 +++ linux-patched/cluster/dlm/proc.c 2004-07-13 18:57:22.000000000 +0800
8880 +/******************************************************************************
8881 +*******************************************************************************
8883 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8884 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8886 +** This copyrighted material is made available to anyone wishing to use,
8887 +** modify, copy, or redistribute it subject to the terms and conditions
8888 +** of the GNU General Public License v.2.
8890 +*******************************************************************************
8891 +******************************************************************************/
8893 +#include <linux/init.h>
8894 +#include <linux/proc_fs.h>
8895 +#include <linux/ctype.h>
8896 +#include <linux/seq_file.h>
8897 +#include <linux/module.h>
8899 +#include "dlm_internal.h"
8900 +#include "lockspace.h"
8902 +#if defined(DLM_DEBUG)
8903 +#define DLM_DEBUG_SIZE (1024)
8904 +#define MAX_DEBUG_MSG_LEN (64)
8906 +#define DLM_DEBUG_SIZE (0)
8907 +#define MAX_DEBUG_MSG_LEN (0)
8910 +static char * debug_buf;
8911 +static unsigned int debug_size;
8912 +static unsigned int debug_point;
8913 +static int debug_wrap;
8914 +static spinlock_t debug_lock;
8915 +static struct proc_dir_entry * debug_proc_entry = NULL;
8916 +static struct proc_dir_entry * rcom_proc_entry = NULL;
8917 +static char proc_ls_name[255] = "";
8919 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
8920 +static struct proc_dir_entry * locks_proc_entry = NULL;
8921 +static struct seq_operations locks_info_op;
8924 +static int locks_open(struct inode *inode, struct file *file)
8926 + return seq_open(file, &locks_info_op);
8929 +/* Write simply sets the lockspace to use */
8930 +static ssize_t locks_write(struct file *file, const char *buf,
8931 + size_t count, loff_t * ppos)
8933 + if (count < sizeof(proc_ls_name)) {
8934 + copy_from_user(proc_ls_name, buf, count);
8935 + proc_ls_name[count] = '\0';
8937 + /* Remove any trailing LF so that lazy users
8938 + can just echo "lsname" > /proc/cluster/dlm_locks */
8939 + if (proc_ls_name[count - 1] == '\n')
8940 + proc_ls_name[count - 1] = '\0';
8947 +static struct file_operations locks_fops = {
8949 + write:locks_write,
8952 + release:seq_release,
8955 +struct ls_dumpinfo {
8957 + struct list_head *next;
8958 + struct dlm_ls *ls;
8959 + struct dlm_rsb *rsb;
8962 +static int print_resource(struct dlm_rsb * res, struct seq_file *s);
8964 +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
8969 + /* Find the next non-empty hash bucket */
8970 + for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) {
8971 + read_lock(&di->ls->ls_rsbtbl[i].lock);
8972 + if (!list_empty(&di->ls->ls_rsbtbl[i].list)) {
8973 + di->next = di->ls->ls_rsbtbl[i].list.next;
8974 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8977 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8981 + if (di->entry >= di->ls->ls_rsbtbl_size)
8982 + return NULL; /* End of hash list */
8983 + } else { /* Find the next entry in the list */
8985 + read_lock(&di->ls->ls_rsbtbl[i].lock);
8986 + di->next = di->next->next;
8987 + if (di->next->next == di->ls->ls_rsbtbl[i].list.next) {
8988 + /* End of list - move to next bucket */
8991 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8992 + return next_rsb(di); /* do the top half of this conditional */
8994 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8996 + di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain);
9001 +static void *s_start(struct seq_file *m, loff_t * pos)
9003 + struct ls_dumpinfo *di;
9004 + struct dlm_ls *ls;
9007 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9011 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9016 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9022 + for (i = 0; i < *pos; i++)
9023 + if (next_rsb(di) == NULL)
9026 + return next_rsb(di);
9029 +static void *s_next(struct seq_file *m, void *p, loff_t * pos)
9031 + struct ls_dumpinfo *di = p;
9035 + return next_rsb(di);
9038 +static int s_show(struct seq_file *m, void *p)
9040 + struct ls_dumpinfo *di = p;
9041 + return print_resource(di->rsb, m);
9044 +static void s_stop(struct seq_file *m, void *p)
9049 +static struct seq_operations locks_info_op = {
9056 +static char *print_lockmode(int mode)
9078 +static void print_lock(struct seq_file *s, struct dlm_lkb * lkb, struct dlm_rsb * res)
9081 + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
9083 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9084 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9085 + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
9087 + if (lkb->lkb_range) {
9088 + /* This warns on Alpha. Tough. Only I see it */
9089 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9090 + || lkb->lkb_status == GDLM_LKSTS_GRANTED)
9091 + seq_printf(s, " %" PRIx64 "-%" PRIx64,
9092 + lkb->lkb_range[GR_RANGE_START],
9093 + lkb->lkb_range[GR_RANGE_END]);
9094 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9095 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9096 + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
9097 + lkb->lkb_range[RQ_RANGE_START],
9098 + lkb->lkb_range[RQ_RANGE_END]);
9101 + if (lkb->lkb_nodeid) {
9102 + if (lkb->lkb_nodeid != res->res_nodeid)
9103 + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
9106 + seq_printf(s, " Master: %08x", lkb->lkb_remid);
9109 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9110 + seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
9112 + seq_printf(s, "\n");
9115 +static int print_resource(struct dlm_rsb *res, struct seq_file *s)
9118 + struct list_head *locklist;
9120 + seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9121 + res->res_parent, res->res_length);
9122 + for (i = 0; i < res->res_length; i++) {
9123 + if (isprint(res->res_name[i]))
9124 + seq_printf(s, "%c", res->res_name[i]);
9126 + seq_printf(s, "%c", '.');
9128 + if (res->res_nodeid)
9129 + seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
9132 + seq_printf(s, "\" \nMaster Copy\n");
9134 + /* Print the LVB: */
9135 + if (res->res_lvbptr) {
9136 + seq_printf(s, "LVB: ");
9137 + for (i = 0; i < DLM_LVB_LEN; i++) {
9138 + if (i == DLM_LVB_LEN / 2)
9139 + seq_printf(s, "\n ");
9140 + seq_printf(s, "%02x ",
9141 + (unsigned char) res->res_lvbptr[i]);
9143 + seq_printf(s, "\n");
9146 + /* Print the locks attached to this resource */
9147 + seq_printf(s, "Granted Queue\n");
9148 + list_for_each(locklist, &res->res_grantqueue) {
9149 + struct dlm_lkb *this_lkb =
9150 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9151 + print_lock(s, this_lkb, res);
9154 + seq_printf(s, "Conversion Queue\n");
9155 + list_for_each(locklist, &res->res_convertqueue) {
9156 + struct dlm_lkb *this_lkb =
9157 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9158 + print_lock(s, this_lkb, res);
9161 + seq_printf(s, "Waiting Queue\n");
9162 + list_for_each(locklist, &res->res_waitqueue) {
9163 + struct dlm_lkb *this_lkb =
9164 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9165 + print_lock(s, this_lkb, res);
9169 +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9171 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...)
9174 + int i, n, size, len;
9175 + char buf[MAX_DEBUG_MSG_LEN+1];
9177 + spin_lock(&debug_lock);
9182 + size = MAX_DEBUG_MSG_LEN;
9183 + memset(buf, 0, size+1);
9185 + n = snprintf(buf, size, "%s ", ls->ls_name);
9188 + va_start(va, fmt);
9189 + vsnprintf(buf+n, size, fmt, va);
9192 + len = strlen(buf);
9193 + if (len > MAX_DEBUG_MSG_LEN-1)
9194 + len = MAX_DEBUG_MSG_LEN-1;
9196 + buf[len+1] = '\0';
9198 + for (i = 0; i < strlen(buf); i++) {
9199 + debug_buf[debug_point++] = buf[i];
9201 + if (debug_point == debug_size) {
9207 + spin_unlock(&debug_lock);
9210 +void dlm_debug_dump(void)
9214 + spin_lock(&debug_lock);
9216 + for (i = debug_point; i < debug_size; i++)
9217 + printk("%c", debug_buf[i]);
9219 + for (i = 0; i < debug_point; i++)
9220 + printk("%c", debug_buf[i]);
9221 + spin_unlock(&debug_lock);
9224 +void dlm_debug_setup(int size)
9228 + if (size > PAGE_SIZE)
9231 + b = kmalloc(size, GFP_KERNEL);
9233 + spin_lock(&debug_lock);
9238 + debug_size = size;
9242 + memset(debug_buf, 0, debug_size);
9244 + spin_unlock(&debug_lock);
9247 +static void dlm_debug_init(void)
9253 + spin_lock_init(&debug_lock);
9255 + dlm_debug_setup(DLM_DEBUG_SIZE);
9258 +#ifdef CONFIG_PROC_FS
9259 +int dlm_debug_info(char *b, char **start, off_t offset, int length)
9263 + spin_lock(&debug_lock);
9266 + for (i = debug_point; i < debug_size; i++)
9267 + n += sprintf(b + n, "%c", debug_buf[i]);
9269 + for (i = 0; i < debug_point; i++)
9270 + n += sprintf(b + n, "%c", debug_buf[i]);
9272 + spin_unlock(&debug_lock);
9277 +int dlm_rcom_info(char *b, char **start, off_t offset, int length)
9279 + struct dlm_ls *ls;
9280 + struct dlm_csb *csb;
9283 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9287 + n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
9288 + "names_recv_count names_recv_msgid "
9289 + "locks_send_count locks_send_msgid "
9290 + "locks_recv_count locks_recv_msgid\n");
9292 + list_for_each_entry(csb, &ls->ls_nodes, list) {
9293 + n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
9294 + csb->node->nodeid,
9295 + csb->names_send_count,
9296 + csb->names_send_msgid,
9297 + csb->names_recv_count,
9298 + csb->names_recv_msgid,
9299 + csb->locks_send_count,
9300 + csb->locks_send_msgid,
9301 + csb->locks_recv_count,
9302 + csb->locks_recv_msgid);
9308 +void dlm_proc_init(void)
9310 +#ifdef CONFIG_PROC_FS
9311 + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
9313 + if (!debug_proc_entry)
9316 + debug_proc_entry->get_info = &dlm_debug_info;
9318 + rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
9319 + if (!rcom_proc_entry)
9322 + rcom_proc_entry->get_info = &dlm_rcom_info;
9326 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9327 + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
9329 + NULL, NULL, NULL);
9330 + if (!locks_proc_entry)
9332 + locks_proc_entry->proc_fops = &locks_fops;
9336 +void dlm_proc_exit(void)
9338 +#ifdef CONFIG_PROC_FS
9339 + if (debug_proc_entry) {
9340 + remove_proc_entry("cluster/dlm_debug", NULL);
9341 + dlm_debug_setup(0);
9344 + if (rcom_proc_entry)
9345 + remove_proc_entry("cluster/dlm_rcom", NULL);
9348 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9349 + if (locks_proc_entry)
9350 + remove_proc_entry("cluster/dlm_locks", NULL);
9353 diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
9354 --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
9355 +++ linux-patched/cluster/dlm/queries.c 2004-07-13 18:57:22.000000000 +0800
9357 +/******************************************************************************
9358 +*******************************************************************************
9360 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9361 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9363 +** This copyrighted material is made available to anyone wishing to use,
9364 +** modify, copy, or redistribute it subject to the terms and conditions
9365 +** of the GNU General Public License v.2.
9367 +*******************************************************************************
9368 +******************************************************************************/
9373 + * This file provides the kernel query interface to the DLM.
9377 +#define EXPORT_SYMTAB
9378 +#include <linux/module.h>
9380 +#include "dlm_internal.h"
9381 +#include "lockspace.h"
9382 +#include "lockqueue.h"
9383 +#include "locking.h"
9388 +#include "memory.h"
9389 +#include "lowcomms.h"
9390 +#include "midcomms.h"
9393 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo);
9394 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo);
9397 + * API entry point.
9399 +int dlm_query(void *lockspace,
9400 + struct dlm_lksb *lksb,
9402 + struct dlm_queryinfo *qinfo,
9403 + void (ast_routine(void *)),
9406 + int status = -EINVAL;
9407 + struct dlm_lkb *target_lkb;
9408 + struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */
9409 + struct dlm_ls *ls = (struct dlm_ls *) find_lockspace_by_local_id(lockspace);
9421 + if (!qinfo->gqi_lockinfo)
9422 + qinfo->gqi_locksize = 0;
9424 + /* Find the lkid */
9425 + target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
9429 + /* If the user wants a list of locks that are blocking or
9430 + not blocking this lock, then it must be waiting
9433 + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
9434 + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
9435 + target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
9438 + /* We now allocate an LKB for our own use (so we can hang
9439 + * things like the AST routine and the lksb from it) */
9440 + lksb->sb_status = -EBUSY;
9441 + query_lkb = create_lkb(ls);
9446 + query_lkb->lkb_astaddr = ast_routine;
9447 + query_lkb->lkb_astparam = (long)astarg;
9448 + query_lkb->lkb_resource = target_lkb->lkb_resource;
9449 + query_lkb->lkb_lksb = lksb;
9451 + /* Don't free the resource while we are querying it. This ref
9452 + * will be dropped when the LKB is freed */
9453 + hold_rsb(query_lkb->lkb_resource);
9455 + /* Fill in the stuff that's always local */
9456 + if (qinfo->gqi_resinfo) {
9457 + if (target_lkb->lkb_resource->res_nodeid)
9458 + qinfo->gqi_resinfo->rsi_masternode =
9459 + target_lkb->lkb_resource->res_nodeid;
9461 + qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
9462 + qinfo->gqi_resinfo->rsi_length =
9463 + target_lkb->lkb_resource->res_length;
9464 + memcpy(qinfo->gqi_resinfo->rsi_name,
9465 + target_lkb->lkb_resource->res_name,
9466 + qinfo->gqi_resinfo->rsi_length);
9469 + /* If the master is local (or the user doesn't want the overhead of a
9470 + * remote call) - fill in the details here */
9471 + if (target_lkb->lkb_resource->res_nodeid == 0 ||
9472 + (query & DLM_QUERY_LOCAL)) {
9475 + /* Resource info */
9476 + if (qinfo->gqi_resinfo) {
9477 + query_resource(target_lkb->lkb_resource,
9478 + qinfo->gqi_resinfo);
9482 + if (qinfo->gqi_lockinfo) {
9483 + status = query_locks(query, target_lkb, qinfo);
9486 + query_lkb->lkb_retstatus = status;
9487 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
9490 + /* An AST will be delivered so we must return success here */
9495 + /* Remote master */
9496 + if (target_lkb->lkb_resource->res_nodeid != 0)
9498 + struct dlm_query_request *remquery;
9499 + struct writequeue_entry *e;
9501 + /* Clear this cos the receiving end adds to it with
9502 + each incoming packet */
9503 + qinfo->gqi_lockcount = 0;
9505 + /* Squirrel a pointer to the query info struct
9506 + somewhere illegal */
9507 + query_lkb->lkb_request = (struct dlm_request *) qinfo;
9509 + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
9510 + sizeof(struct dlm_query_request),
9511 + ls->ls_allocation,
9512 + (char **) &remquery);
9514 + status = -ENOBUFS;
9518 + /* Build remote packet */
9519 + memset(remquery, 0, sizeof(struct dlm_query_request));
9521 + remquery->rq_maxlocks = qinfo->gqi_locksize;
9522 + remquery->rq_query = query;
9523 + remquery->rq_mstlkid = target_lkb->lkb_remid;
9524 + if (qinfo->gqi_lockinfo)
9525 + remquery->rq_maxlocks = qinfo->gqi_locksize;
9527 + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
9528 + remquery->rq_header.rh_flags = 0;
9529 + remquery->rq_header.rh_length = sizeof(struct dlm_query_request);
9530 + remquery->rq_header.rh_lkid = query_lkb->lkb_id;
9531 + remquery->rq_header.rh_lockspace = ls->ls_global_id;
9533 + midcomms_send_buffer(&remquery->rq_header, e);
9542 +static inline int valid_range(struct dlm_range *r)
9544 + if (r->ra_start != 0ULL ||
9545 + r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
9551 +static void put_int(int x, char *buf, int *offp)
9553 + x = cpu_to_le32(x);
9554 + memcpy(buf + *offp, &x, sizeof(int));
9555 + *offp += sizeof(int);
9558 +static void put_int64(uint64_t x, char *buf, int *offp)
9560 + x = cpu_to_le64(x);
9561 + memcpy(buf + *offp, &x, sizeof(uint64_t));
9562 + *offp += sizeof(uint64_t);
9565 +static int get_int(char *buf, int *offp)
9568 + memcpy(&value, buf + *offp, sizeof(int));
9569 + *offp += sizeof(int);
9570 + return le32_to_cpu(value);
9573 +static uint64_t get_int64(char *buf, int *offp)
9577 + memcpy(&value, buf + *offp, sizeof(uint64_t));
9578 + *offp += sizeof(uint64_t);
9579 + return le64_to_cpu(value);
9582 +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
9584 +/* Called from recvd to get lock info for a remote node */
9585 +int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
9587 + struct dlm_query_request *query = (struct dlm_query_request *) msg;
9588 + struct dlm_query_reply *reply;
9589 + struct dlm_resinfo resinfo;
9590 + struct dlm_queryinfo qinfo;
9591 + struct writequeue_entry *e;
9593 + struct dlm_lkb *lkb;
9598 + int start_lock = 0;
9600 + lkb = find_lock_by_id(ls, query->rq_mstlkid);
9606 + qinfo.gqi_resinfo = &resinfo;
9607 + qinfo.gqi_locksize = query->rq_maxlocks;
9609 + /* Get the resource bits */
9610 + query_resource(lkb->lkb_resource, &resinfo);
9612 + /* Now get the locks if wanted */
9613 + if (query->rq_maxlocks) {
9614 + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
9616 + if (!qinfo.gqi_lockinfo) {
9621 + status = query_locks(query->rq_query, lkb, &qinfo);
9622 + if (status && status != -E2BIG) {
9623 + kfree(qinfo.gqi_lockinfo);
9628 + qinfo.gqi_lockinfo = NULL;
9629 + qinfo.gqi_lockcount = 0;
9632 + /* Send as many blocks as needed for all the locks */
9635 + int msg_len = sizeof(struct dlm_query_reply);
9636 + int last_msg_len = msg_len; /* keeps compiler quiet */
9639 + /* First work out how many locks we can fit into a block */
9640 + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
9642 + last_msg_len = msg_len;
9644 + msg_len += LOCK_LEN;
9645 + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
9646 + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
9648 + msg_len += sizeof(uint64_t) * 4;
9652 + /* There must be a neater way of doing this... */
9653 + if (msg_len > PAGE_SIZE) {
9655 + msg_len = last_msg_len;
9661 + e = lowcomms_get_buffer(nodeid,
9663 + ls->ls_allocation,
9664 + (char **) &reply);
9666 + kfree(qinfo.gqi_lockinfo);
9667 + status = -ENOBUFS;
9671 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9672 + reply->rq_header.rh_length = msg_len;
9673 + reply->rq_header.rh_lkid = msg->rh_lkid;
9674 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
9676 + reply->rq_status = status;
9677 + reply->rq_startlock = cur_lock;
9678 + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
9679 + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
9680 + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
9681 + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
9683 + buf = (char *)reply;
9684 + bufidx = sizeof(struct dlm_query_reply);
9686 + for (; cur_lock < last_lock; cur_lock++) {
9688 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
9689 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
9690 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
9691 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
9692 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
9693 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
9694 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
9696 + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
9697 + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
9699 + buf[bufidx++] = 1;
9700 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
9701 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
9702 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
9703 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
9706 + buf[bufidx++] = 0;
9710 + if (cur_lock == qinfo.gqi_lockcount) {
9711 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
9715 + reply->rq_header.rh_flags = 0;
9718 + reply->rq_numlocks = cur_lock - start_lock;
9719 + start_lock = cur_lock;
9721 + midcomms_send_buffer(&reply->rq_header, e);
9722 + } while (!finished);
9724 + kfree(qinfo.gqi_lockinfo);
9729 + e = lowcomms_get_buffer(nodeid,
9730 + sizeof(struct dlm_query_reply),
9731 + ls->ls_allocation,
9732 + (char **) &reply);
9734 + status = -ENOBUFS;
9737 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9738 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
9739 + reply->rq_header.rh_length = sizeof(struct dlm_query_reply);
9740 + reply->rq_header.rh_lkid = msg->rh_lkid;
9741 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
9742 + reply->rq_status = status;
9743 + reply->rq_numlocks = 0;
9744 + reply->rq_startlock = 0;
9745 + reply->rq_grantcount = 0;
9746 + reply->rq_convcount = 0;
9747 + reply->rq_waitcount = 0;
9749 + midcomms_send_buffer(&reply->rq_header, e);
9754 +/* Reply to a remote query */
9755 +int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
9757 + struct dlm_lkb *query_lkb;
9758 + struct dlm_queryinfo *qinfo;
9759 + struct dlm_query_reply *reply;
9764 + query_lkb = find_lock_by_id(ls, msg->rh_lkid);
9768 + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
9769 + reply = (struct dlm_query_reply *) msg;
9771 + /* Copy the easy bits first */
9772 + qinfo->gqi_lockcount += reply->rq_numlocks;
9773 + if (qinfo->gqi_resinfo) {
9774 + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
9775 + qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
9776 + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
9777 + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
9781 + /* Now unpack the locks */
9782 + bufidx = sizeof(struct dlm_query_reply);
9783 + buf = (char *) msg;
9785 + DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
9786 + printk("start = %d, num + %d. Max= %d\n",
9787 + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
9789 + for (i = reply->rq_startlock;
9790 + i < reply->rq_startlock + reply->rq_numlocks; i++) {
9791 + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
9792 + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
9793 + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
9794 + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
9795 + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
9796 + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
9797 + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
9798 + if (buf[bufidx++]) {
9799 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
9800 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
9801 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
9802 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
9805 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
9806 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9807 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
9808 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9812 + /* If this was the last block then now tell the user */
9813 + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
9814 + query_lkb->lkb_retstatus = reply->rq_status;
9815 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
9822 +/* Aggregate resource information */
9823 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo)
9825 + struct list_head *tmp;
9828 + if (rsb->res_lvbptr)
9829 + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
9831 + resinfo->rsi_grantcount = 0;
9832 + list_for_each(tmp, &rsb->res_grantqueue) {
9833 + resinfo->rsi_grantcount++;
9836 + resinfo->rsi_waitcount = 0;
9837 + list_for_each(tmp, &rsb->res_waitqueue) {
9838 + resinfo->rsi_waitcount++;
9841 + resinfo->rsi_convcount = 0;
9842 + list_for_each(tmp, &rsb->res_convertqueue) {
9843 + resinfo->rsi_convcount++;
9849 +static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
9853 + /* Don't fill it in if the buffer is full */
9854 + if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
9857 + /* gqi_lockcount contains the number of locks we have returned */
9858 + entry = qinfo->gqi_lockcount++;
9860 + /* Fun with master copies */
9861 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
9862 + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
9863 + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
9866 + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
9867 + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
9870 + /* Also make sure we always have a valid nodeid in there, the
9871 + calling end may not know which node "0" is */
9872 + if (lkb->lkb_nodeid)
9873 + qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
9875 + qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
9877 + if (lkb->lkb_parent)
9878 + qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
9880 + qinfo->gqi_lockinfo[entry].lki_parent = 0;
9882 + qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
9883 + qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
9884 + qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
9886 + if (lkb->lkb_range) {
9887 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
9888 + lkb->lkb_range[GR_RANGE_START];
9889 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
9890 + lkb->lkb_range[GR_RANGE_END];
9891 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
9892 + lkb->lkb_range[RQ_RANGE_START];
9893 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
9894 + lkb->lkb_range[RQ_RANGE_END];
9896 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
9897 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
9898 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
9899 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
9904 +static int query_lkb_queue(struct list_head *queue, int query,
9905 + struct dlm_queryinfo *qinfo)
9907 + struct list_head *tmp;
9909 + int mode = query & DLM_QUERY_MODE_MASK;
9911 + list_for_each(tmp, queue) {
9912 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
9915 + if (query & DLM_QUERY_RQMODE)
9916 + lkmode = lkb->lkb_rqmode;
9918 + lkmode = lkb->lkb_grmode;
9920 + /* Add the LKB info to the list if it matches the criteria in
9921 + * the query bitmap */
9922 + switch (query & DLM_QUERY_MASK) {
9923 + case DLM_QUERY_LOCKS_ALL:
9924 + status = add_lock(lkb, qinfo);
9927 + case DLM_QUERY_LOCKS_HIGHER:
9928 + if (lkmode > mode)
9929 + status = add_lock(lkb, qinfo);
9932 + case DLM_QUERY_LOCKS_EQUAL:
9933 + if (lkmode == mode)
9934 + status = add_lock(lkb, qinfo);
9937 + case DLM_QUERY_LOCKS_LOWER:
9938 + if (lkmode < mode)
9939 + status = add_lock(lkb, qinfo);
9947 + * Return 1 if the locks' ranges overlap
9948 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
9950 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
9952 + if (!lkb1->lkb_range || !lkb2->lkb_range)
9955 + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
9956 + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
9961 +extern const int __dlm_compat_matrix[8][8];
9964 +static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
9966 + struct list_head *tmp;
9969 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9970 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
9972 + if (ranges_overlap(lkb, qlkb) &&
9973 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
9974 + status = add_lock(lkb, qinfo);
9980 +static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
9982 + struct list_head *tmp;
9985 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9986 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
9988 + if (!(ranges_overlap(lkb, qlkb) &&
9989 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
9990 + status = add_lock(lkb, qinfo);
9996 +/* Gather a list of appropriate locks */
9997 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10002 + /* Mask in the actual granted/requested mode of the lock if LOCK_THIS
10003 + * was requested as the mode
10005 + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
10006 + query &= ~DLM_QUERY_MODE_MASK;
10007 + if (query & DLM_QUERY_RQMODE)
10008 + query |= lkb->lkb_rqmode;
10010 + query |= lkb->lkb_grmode;
10013 + qinfo->gqi_lockcount = 0;
10015 + /* BLOCKING/NOTBLOCK only look at the granted queue */
10016 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
10017 + return get_blocking_locks(lkb, qinfo);
10019 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
10020 + return get_nonblocking_locks(lkb, qinfo);
10022 + /* Do the lock queues that were requested */
10023 + if (query & DLM_QUERY_QUEUE_GRANT) {
10024 + status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
10028 + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
10029 + status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
10033 + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
10034 + status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
10042 +EXPORT_SYMBOL(dlm_query);
10044 + * Overrides for Emacs so that we follow Linus's tabbing style.
10045 + * Emacs will notice this stuff at the end of the file and automatically
10046 + * adjust the settings for this buffer only. This must remain at the end
10048 + * ---------------------------------------------------------------------------
10049 + * Local variables:
10050 + * c-file-style: "linux"
10053 diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
10054 --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
10055 +++ linux-patched/cluster/dlm/queries.h 2004-07-13 18:57:22.000000000 +0800
10057 +/******************************************************************************
10058 +*******************************************************************************
10060 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10061 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10063 +** This copyrighted material is made available to anyone wishing to use,
10064 +** modify, copy, or redistribute it subject to the terms and conditions
10065 +** of the GNU General Public License v.2.
10067 +*******************************************************************************
10068 +******************************************************************************/
10070 +#ifndef __QUERIES_DOT_H__
10071 +#define __QUERIES_DOT_H__
10073 +extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10074 +extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10076 +#endif /* __QUERIES_DOT_H__ */
10077 diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
10078 --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
10079 +++ linux-patched/cluster/dlm/rebuild.c 2004-07-13 18:57:22.000000000 +0800
10081 +/******************************************************************************
10082 +*******************************************************************************
10084 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10085 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10087 +** This copyrighted material is made available to anyone wishing to use,
10088 +** modify, copy, or redistribute it subject to the terms and conditions
10089 +** of the GNU General Public License v.2.
10091 +*******************************************************************************
10092 +******************************************************************************/
10095 + * Rebuild RSB's on new masters. Functions for transferring locks and
10096 + * subresources to new RSB masters during recovery.
10099 +#include "dlm_internal.h"
10100 +#include "reccomms.h"
10103 +#include "nodes.h"
10104 +#include "config.h"
10105 +#include "memory.h"
10106 +#include "recover.h"
10109 +/* Types of entity serialised in remastering messages */
10110 +#define REMASTER_ROOTRSB 1
10111 +#define REMASTER_RSB 2
10112 +#define REMASTER_LKB 3
10114 +struct rcom_fill {
10115 + char * outbuf; /* Beginning of data */
10116 + int offset; /* Current offset into outbuf */
10117 + int maxlen; /* Max value of offset */
10120 + struct dlm_rsb * rsb;
10121 + struct dlm_rsb * subrsb;
10122 + struct dlm_lkb * lkb;
10123 + struct list_head * lkbqueue;
10126 +typedef struct rcom_fill rcom_fill_t;
10129 +struct rebuild_node {
10130 + struct list_head list;
10132 + struct dlm_rsb * rootrsb;
10134 +typedef struct rebuild_node rebuild_node_t;
10138 + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10139 + * master. The rsb will be "done" with recovery when the new master has
10140 + * replied with all the new remote lockid's for this rsb's lkb's.
10143 +void expect_new_lkids(struct dlm_rsb *rsb)
10145 + rsb->res_newlkid_expect = 0;
10146 + recover_list_add(rsb);
10150 + * This function is called on root rsb or subrsb when another lkb is being sent
10151 + * to the new master for which we expect to receive a corresponding remote lkid
10154 +void need_new_lkid(struct dlm_rsb *rsb)
10156 + struct dlm_rsb *root = rsb;
10158 + if (rsb->res_parent)
10159 + root = rsb->res_root;
10161 + if (!root->res_newlkid_expect)
10162 + recover_list_add(root);
10164 + DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10166 + root->res_newlkid_expect++;
10170 + * This function is called for each lkb for which a new remote lkid is
10171 + * received. Decrement the expected number of remote lkids expected for the
10175 +void have_new_lkid(struct dlm_lkb *lkb)
10177 + struct dlm_rsb *root = lkb->lkb_resource;
10179 + if (root->res_parent)
10180 + root = root->res_root;
10182 + down_write(&root->res_lock);
10184 + DLM_ASSERT(root->res_newlkid_expect,
10185 + printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10187 + root->res_newlkid_expect--;
10189 + if (!root->res_newlkid_expect) {
10190 + clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10191 + recover_list_del(root);
10193 + up_write(&root->res_lock);
10197 + * Return the rebuild struct for a node - will create an entry on the rootrsb
10198 + * list if necessary.
10200 + * Currently no locking is needed here as it all happens in the dlm_recvd
10204 +static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid)
10206 + rebuild_node_t *node = NULL;
10208 + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10209 + if (node->nodeid == nodeid)
10213 + /* Not found, add one */
10214 + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10218 + node->nodeid = nodeid;
10219 + node->rootrsb = NULL;
10220 + list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10226 + * Tidy up after a rebuild run. Called when all recovery has finished
10229 +void rebuild_freemem(struct dlm_ls *ls)
10231 + rebuild_node_t *node = NULL, *s;
10233 + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10234 + list_del(&node->list);
10239 +static void put_int(int x, char *buf, int *offp)
10241 + x = cpu_to_le32(x);
10242 + memcpy(buf + *offp, &x, sizeof(int));
10243 + *offp += sizeof(int);
10246 +static void put_int64(uint64_t x, char *buf, int *offp)
10248 + x = cpu_to_le64(x);
10249 + memcpy(buf + *offp, &x, sizeof(uint64_t));
10250 + *offp += sizeof(uint64_t);
10253 +static void put_bytes(char *x, int len, char *buf, int *offp)
10255 + put_int(len, buf, offp);
10256 + memcpy(buf + *offp, x, len);
10260 +static void put_char(char x, char *buf, int *offp)
10266 +static int get_int(char *buf, int *offp)
10269 + memcpy(&value, buf + *offp, sizeof(int));
10270 + *offp += sizeof(int);
10271 + return le32_to_cpu(value);
10274 +static uint64_t get_int64(char *buf, int *offp)
10278 + memcpy(&value, buf + *offp, sizeof(uint64_t));
10279 + *offp += sizeof(uint64_t);
10280 + return le64_to_cpu(value);
10283 +static char get_char(char *buf, int *offp)
10285 + char x = buf[*offp];
10291 +static void get_bytes(char *bytes, int *len, char *buf, int *offp)
10293 + *len = get_int(buf, offp);
10294 + memcpy(bytes, buf + *offp, *len);
10298 +static int lkb_length(struct dlm_lkb *lkb)
10302 + len += sizeof(int); /* lkb_id */
10303 + len += sizeof(int); /* lkb_resource->res_remasterid */
10304 + len += sizeof(int); /* lkb_flags */
10305 + len += sizeof(int); /* lkb_status */
10306 + len += sizeof(char); /* lkb_rqmode */
10307 + len += sizeof(char); /* lkb_grmode */
10308 + len += sizeof(int); /* lkb_childcnt */
10309 + len += sizeof(int); /* lkb_parent->lkb_id */
10310 + len += sizeof(int); /* lkb_bastaddr */
10312 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10313 + len += sizeof(int); /* number of lvb bytes */
10314 + len += DLM_LVB_LEN;
10317 + if (lkb->lkb_range) {
10318 + len += sizeof(uint64_t);
10319 + len += sizeof(uint64_t);
10320 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
10321 + len += sizeof(uint64_t);
10322 + len += sizeof(uint64_t);
10330 + * It's up to the caller to be sure there's enough space in the buffer.
10333 +static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp)
10337 + /* Need to tell the remote end if we have a range */
10338 + flags = lkb->lkb_flags;
10339 + if (lkb->lkb_range)
10340 + flags |= GDLM_LKFLG_RANGE;
10343 + * See lkb_length()
10344 + * Total: 30 (no lvb) or 66 (with lvb) bytes
10347 + put_int(lkb->lkb_id, buf, offp);
10348 + put_int(lkb->lkb_resource->res_remasterid, buf, offp);
10349 + put_int(flags, buf, offp);
10350 + put_int(lkb->lkb_status, buf, offp);
10351 + put_char(lkb->lkb_rqmode, buf, offp);
10352 + put_char(lkb->lkb_grmode, buf, offp);
10353 + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
10355 + if (lkb->lkb_parent)
10356 + put_int(lkb->lkb_parent->lkb_id, buf, offp);
10358 + put_int(0, buf, offp);
10360 + if (lkb->lkb_bastaddr)
10361 + put_int(1, buf, offp);
10363 + put_int(0, buf, offp);
10365 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10366 + DLM_ASSERT(lkb->lkb_lvbptr,);
10367 + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
10370 + /* Only send the range we actually need */
10371 + if (lkb->lkb_range) {
10372 + switch (lkb->lkb_status) {
10373 + case GDLM_LKSTS_CONVERT:
10374 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10375 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10376 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10377 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10379 + case GDLM_LKSTS_WAITING:
10380 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10381 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10383 + case GDLM_LKSTS_GRANTED:
10384 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10385 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10393 +static int rsb_length(struct dlm_rsb *rsb)
10397 + len += sizeof(int); /* number of res_name bytes */
10398 + len += rsb->res_length; /* res_name */
10399 + len += sizeof(int); /* res_remasterid */
10400 + len += sizeof(int); /* res_parent->res_remasterid */
10405 +static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb)
10407 + struct list_head *tmp;
10408 + struct dlm_rsb *r;
10410 + tmp = subrsb->res_subreslist.next;
10411 + r = list_entry(tmp, struct dlm_rsb, res_subreslist);
10416 +static inline int last_in_list(struct dlm_rsb *r, struct list_head *head)
10418 + struct dlm_rsb *last;
10419 + last = list_entry(head->prev, struct dlm_rsb, res_subreslist);
10426 + * Used to decide if an rsb should be rebuilt on a new master. An rsb only
10427 + * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's on the
10428 + * wait queue are not rebuilt.
10431 +static int lkbs_to_remaster(struct dlm_rsb *r)
10433 + struct dlm_lkb *lkb;
10434 + struct dlm_rsb *sub;
10436 + if (!list_empty(&r->res_grantqueue) ||
10437 + !list_empty(&r->res_convertqueue))
10440 + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
10441 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10446 + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
10447 + if (!list_empty(&sub->res_grantqueue) ||
10448 + !list_empty(&sub->res_convertqueue))
10451 + list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
10452 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10461 +static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp)
10464 + * See rsb_length()
10465 + * Total: 36 bytes (4 + 24 + 4 + 4)
10468 + put_bytes(rsb->res_name, rsb->res_length, buf, offp);
10469 + put_int(rsb->res_remasterid, buf, offp);
10471 + if (rsb->res_parent)
10472 + put_int(rsb->res_parent->res_remasterid, buf, offp);
10474 + put_int(0, buf, offp);
10476 + DLM_ASSERT(!rsb->res_lvbptr,);
10480 + * Flatten an LKB into a buffer for sending to the new RSB master. As a
10481 + * side-effect the nodeid of the lock is set to the nodeid of the new RSB
10485 +static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb,
10486 + rcom_fill_t *fill)
10488 + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
10491 + lkb->lkb_nodeid = r->res_nodeid;
10493 + put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
10494 + serialise_lkb(lkb, fill->outbuf, &fill->offset);
10497 + need_new_lkid(r);
10505 + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
10508 +static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue,
10509 + rcom_fill_t *fill)
10511 + struct dlm_lkb *lkb;
10514 + list_for_each_entry(lkb, queue, lkb_statequeue) {
10515 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10518 + error = pack_one_lkb(r, lkb, fill);
10527 + fill->lkbqueue = queue;
10532 +static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill)
10536 + error = pack_lkb_queue(r, &r->res_grantqueue, fill);
10540 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10544 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10551 + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
10552 + * queue and full lkb queues.
10555 +static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill)
10557 + struct list_head *tmp, *start, *end;
10558 + struct dlm_lkb *lkb;
10562 + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
10565 + error = pack_one_lkb(r, fill->lkb, fill);
10569 + start = fill->lkb->lkb_statequeue.next;
10570 + end = fill->lkbqueue;
10572 + for (tmp = start; tmp != end; tmp = tmp->next) {
10573 + lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10575 + error = pack_one_lkb(r, lkb, fill);
10583 + * Pack all lkb's on r's queues following fill->lkbqueue.
10586 + if (fill->lkbqueue == &r->res_waitqueue)
10588 + if (fill->lkbqueue == &r->res_convertqueue)
10591 + DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
10593 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10597 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10603 +static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb,
10604 + rcom_fill_t *fill)
10608 + down_write(&subrsb->res_lock);
10610 + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
10613 + subrsb->res_nodeid = rsb->res_nodeid;
10614 + subrsb->res_remasterid = ++fill->remasterid;
10616 + put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
10617 + serialise_rsb(subrsb, fill->outbuf, &fill->offset);
10619 + error = pack_lkb_queues(subrsb, fill);
10623 + up_write(&subrsb->res_lock);
10628 + up_write(&subrsb->res_lock);
10629 + fill->subrsb = subrsb;
10634 +static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb,
10635 + rcom_fill_t *fill)
10637 + struct dlm_rsb *subrsb;
10641 + * When an initial subrsb is given, we know it needs to be packed.
10642 + * When no initial subrsb is given, begin with the first (if any exist).
10645 + if (!in_subrsb) {
10646 + if (list_empty(&rsb->res_subreslist))
10649 + subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb,
10652 + subrsb = in_subrsb;
10655 + error = pack_one_subrsb(rsb, subrsb, fill);
10659 + if (last_in_list(subrsb, &rsb->res_subreslist))
10662 + subrsb = next_subrsb(subrsb);
10670 + * Finish packing whatever is left in an rsb tree. If space runs out while
10671 + * finishing, save subrsb/lkb and this will be called again for the same rsb.
10673 + * !subrsb && lkb, we left off part way through root rsb's lkbs.
10674 + * subrsb && !lkb, we left off just before starting a new subrsb.
10675 + * subrsb && lkb, we left off part way through a subrsb's lkbs.
10676 + * !subrsb && !lkb, we shouldn't be in this function, but starting
10677 + * a new rsb in pack_rsb_tree().
10680 +static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb,
10681 + rcom_fill_t *fill)
10683 + struct dlm_rsb *subrsb = NULL;
10686 + if (!fill->subrsb && fill->lkb) {
10687 + error = pack_lkb_remaining(rsb, fill);
10691 + error = pack_subrsbs(rsb, NULL, fill);
10696 + else if (fill->subrsb && !fill->lkb) {
10697 + error = pack_subrsbs(rsb, fill->subrsb, fill);
10702 + else if (fill->subrsb && fill->lkb) {
10703 + error = pack_lkb_remaining(fill->subrsb, fill);
10707 + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
10710 + subrsb = next_subrsb(fill->subrsb);
10712 + error = pack_subrsbs(rsb, subrsb, fill);
10717 + fill->subrsb = NULL;
10718 + fill->lkb = NULL;
10725 + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
10726 + * buffer. When the buffer runs out of space, save the place to restart (the
10727 + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
10730 +static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb,
10731 + rcom_fill_t *fill)
10733 + int error = -ENOSPC;
10735 + fill->remasterid = 0;
10738 + * Pack the root rsb itself. A 1 byte type precedes the serialised
10739 + * rsb. Then pack the lkb's for the root rsb.
10742 + down_write(&rsb->res_lock);
10744 + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
10747 + rsb->res_remasterid = ++fill->remasterid;
10748 + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
10749 + serialise_rsb(rsb, fill->outbuf, &fill->offset);
10751 + error = pack_lkb_queues(rsb, fill);
10755 + up_write(&rsb->res_lock);
10758 + * Pack subrsb/lkb's under the root rsb.
10761 + error = pack_subrsbs(rsb, NULL, fill);
10766 + up_write(&rsb->res_lock);
10771 + * Given an RSB, return the next RSB that should be sent to a new master.
10774 +static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls,
10775 + struct dlm_rsb *rsb)
10777 + struct list_head *tmp, *start, *end;
10778 + struct dlm_rsb *r;
10781 + start = ls->ls_rootres.next;
10783 + start = rsb->res_rootlist.next;
10785 + end = &ls->ls_rootres;
10787 + for (tmp = start; tmp != end; tmp = tmp->next) {
10788 + r = list_entry(tmp, struct dlm_rsb, res_rootlist);
10790 + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
10791 + if (r->res_nodeid && lkbs_to_remaster(r)) {
10792 + expect_new_lkids(r);
10795 + clear_bit(RESFL_NEW_MASTER, &r->res_flags);
10803 + * Given an rcom buffer, fill it with RSB's that need to be sent to a single
10804 + * new master node. In the case where all the data to send to one node
10805 + * requires multiple messages, this function needs to resume filling each
10806 + * successive buffer from the point where it left off when the previous buffer
10810 +static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill,
10811 + uint32_t *nodeid)
10813 + struct dlm_rsb *rsb, *prev_rsb = fill->rsb;
10816 + fill->offset = 0;
10821 + * The first time this function is called.
10824 + rsb = next_remastered_rsb(ls, NULL);
10828 + } else if (fill->subrsb || fill->lkb) {
10831 + * Continue packing an rsb tree that was partially packed last
10832 + * time (fill->subrsb/lkb indicates where packing of last block
10837 + *nodeid = rsb->res_nodeid;
10839 + error = pack_rsb_tree_remaining(ls, rsb, fill);
10840 + if (error == -ENOSPC)
10843 + rsb = next_remastered_rsb(ls, prev_rsb);
10847 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
10854 + * Pack rsb trees into the buffer until we run out of space, run out of
10855 + * new rsb's or hit a new nodeid.
10858 + *nodeid = rsb->res_nodeid;
10861 + error = pack_rsb_tree(ls, rsb, fill);
10862 + if (error == -ENOSPC)
10867 + rsb = next_remastered_rsb(ls, prev_rsb);
10871 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
10885 + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
10888 +int rebuild_rsbs_send(struct dlm_ls *ls)
10890 + struct dlm_rcom *rc;
10891 + rcom_fill_t fill;
10895 + DLM_ASSERT(recover_list_empty(ls),);
10897 + log_all(ls, "rebuild locks");
10900 + rc = allocate_rcom_buffer(ls);
10905 + memset(&fill, 0, sizeof(rcom_fill_t));
10906 + fill.outbuf = rc->rc_buf;
10907 + fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
10910 + fill_rcom_buffer(ls, &fill, &nodeid);
10911 + if (!fill.offset)
10914 + rc->rc_datalen = fill.offset;
10915 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
10920 + error = dlm_recovery_stopped(ls);
10924 + while (fill.more);
10926 + error = dlm_wait_function(ls, &recover_list_empty);
10928 + log_all(ls, "rebuilt %d locks", fill.count);
10931 + rebuild_freemem(ls);
10932 + free_rcom_buffer(rc);
10938 +static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid,
10939 + struct dlm_rsb *rootrsb)
10941 + struct dlm_rsb *rsb;
10943 + DLM_ASSERT(rootrsb,);
10945 + if (rootrsb->res_remasterid == remasterid) {
10950 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10951 + if (rsb->res_remasterid == remasterid)
10961 + * Search a queue for the given remote lock id (remlkid).
10964 +static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid,
10967 + struct dlm_lkb *lkb;
10969 + list_for_each_entry(lkb, statequeue, lkb_statequeue) {
10970 + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
10979 + * Given a remote lock ID (and a parent resource), return the local LKB for it
10980 + * Hopefully we don't need to do this too often on deep lock trees. This is
10981 + * VERY suboptimal for anything but the smallest lock trees. It searches the
10982 + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
10983 + * returns the LKB address. OPTIMISATION: we should keep a list of these while
10984 + * we are building up the remastered LKBs
10987 +static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid,
10990 + struct dlm_lkb *lkb;
10991 + struct dlm_rsb *rsb;
10993 + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
10997 + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
11001 + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
11005 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11006 + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
11010 + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
11014 + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
11025 + * Unpack an LKB from a remaster operation
11028 +static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid,
11029 + struct dlm_rsb *rootrsb, char *buf, int *ptr,
11030 + char *outbuf, int *outoffp)
11032 + struct dlm_lkb *lkb;
11033 + struct dlm_rsb *rsb;
11034 + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
11036 + remote_lkid = get_int(buf, ptr);
11038 + rsb_rmid = get_int(buf, ptr);
11039 + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
11040 + DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
11043 + * We could have received this lkb already from a previous recovery
11044 + * that was interrupted. If so, just return the lkid to the remote
11047 + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
11051 + lkb = create_lkb(rsb->res_ls);
11055 + lkb->lkb_remid = remote_lkid;
11056 + lkb->lkb_flags = get_int(buf, ptr);
11057 + status = get_int(buf, ptr);
11058 + lkb->lkb_rqmode = get_char(buf, ptr);
11059 + lkb->lkb_grmode = get_char(buf, ptr);
11060 + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
11062 + parentid = get_int(buf, ptr);
11063 + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
11065 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11066 + lkb->lkb_lvbptr = allocate_lvb(ls);
11067 + if (!lkb->lkb_lvbptr)
11069 + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
11072 + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
11073 + uint64_t start, end;
11075 + /* Don't need to keep the range flag, for comms use only */
11076 + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
11077 + start = get_int64(buf, ptr);
11078 + end = get_int64(buf, ptr);
11080 + lkb->lkb_range = allocate_range(rsb->res_ls);
11081 + if (!lkb->lkb_range)
11084 + switch (status) {
11085 + case GDLM_LKSTS_CONVERT:
11086 + lkb->lkb_range[RQ_RANGE_START] = start;
11087 + lkb->lkb_range[RQ_RANGE_END] = end;
11088 + start = get_int64(buf, ptr);
11089 + end = get_int64(buf, ptr);
11090 + lkb->lkb_range[GR_RANGE_START] = start;
11091 + lkb->lkb_range[GR_RANGE_END] = end;
11093 + case GDLM_LKSTS_WAITING:
11094 + lkb->lkb_range[RQ_RANGE_START] = start;
11095 + lkb->lkb_range[RQ_RANGE_END] = end;
11098 + case GDLM_LKSTS_GRANTED:
11099 + lkb->lkb_range[GR_RANGE_START] = start;
11100 + lkb->lkb_range[GR_RANGE_END] = end;
11107 + /* Resolve local lock LKB address from parent ID */
11109 + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
11112 + atomic_inc(&rsb->res_ref);
11113 + lkb->lkb_resource = rsb;
11115 + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11116 + lkb->lkb_nodeid = rem_nodeid;
11119 + * Put the lkb on an RSB queue. An lkb that's in the midst of a
11120 + * conversion request (on the requesting node's lockqueue and has
11121 + * LQCONVERT set) should be put on the granted queue. The convert
11122 + * request will be resent by the requesting node.
11125 + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11126 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
11127 + DLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11128 + printk("status=%d\n", status););
11129 + lkb->lkb_rqmode = DLM_LOCK_IV;
11130 + status = GDLM_LKSTS_GRANTED;
11133 + lkb_enqueue(rsb, lkb, status);
11136 + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11139 + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11140 + && lkb->lkb_grmode > DLM_LOCK_NL) {
11141 + if (!rsb->res_lvbptr)
11142 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
11143 + if (!rsb->res_lvbptr)
11145 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11149 + * Clear flags that may have been sent over that are only relevant in
11150 + * the context of the sender.
11153 + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11154 + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
11157 + /* Return the new LKID to the caller's buffer */
11158 + put_int(lkb->lkb_id, outbuf, outoffp);
11159 + put_int(lkb->lkb_remid, outbuf, outoffp);
11166 +static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid,
11167 + struct dlm_rsb *rootrsb, char *buf,
11172 + int parent_remasterid;
11173 + char name[DLM_RESNAME_MAXLEN];
11175 + struct dlm_rsb *parent = NULL;
11176 + struct dlm_rsb *rsb;
11178 + get_bytes(name, &length, buf, ptr);
11179 + remasterid = get_int(buf, ptr);
11180 + parent_remasterid = get_int(buf, ptr);
11182 + if (parent_remasterid)
11183 + parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11186 + * The rsb reference from this find_or_create_rsb() will keep the rsb
11187 + * around while we add new lkb's to it from deserialise_lkb. Each of
11188 + * the lkb's will add an rsb reference. The reference added here is
11189 + * removed by release_rsb() after all lkb's are added.
11192 + error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
11193 + DLM_ASSERT(!error,);
11195 + /* There is a case where the above needs to create the RSB. */
11196 + if (rsb->res_nodeid == -1)
11197 + rsb->res_nodeid = our_nodeid();
11199 + rsb->res_remasterid = remasterid;
11205 + * Processing at the receiving end of a NEWLOCKS message from a node in
11206 + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11207 + * node whose locks we are now mastering. For a reply we need to send back the
11208 + * new lockids of the remastered locks so that remote ops can find them.
11211 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
11213 + struct dlm_rcom *rc;
11214 + struct dlm_rsb *rsb = NULL;
11215 + rebuild_node_t *rnode;
11217 + int outptr, ptr = 0, error = -ENOMEM;
11219 + rnode = find_rebuild_root(ls, nodeid);
11224 + * Allocate a buffer for the reply message which is a list of remote
11225 + * lock IDs and their (new) local lock ids. It will always be big
11226 + * enough to fit <n> ID pairs if it already fit <n> LKBs.
11229 + rc = allocate_rcom_buffer(ls);
11232 + outbuf = rc->rc_buf;
11236 + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
11237 + * created. Each deserialise_rsb adds an rsb reference that must be
11238 + * removed with release_rsb once all new lkb's for an rsb have been
11242 + while (ptr < len) {
11245 + type = get_char(buf, &ptr);
11248 + case REMASTER_ROOTRSB:
11250 + release_rsb(rsb);
11251 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11253 + rnode->rootrsb = rsb;
11256 + case REMASTER_RSB:
11258 + release_rsb(rsb);
11259 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11263 + case REMASTER_LKB:
11264 + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
11265 + outbuf, &outptr);
11269 + DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
11270 + "len=%d\n", type, nodeid, ptr,
11276 + release_rsb(rsb);
11279 + * Reply with the new lock IDs.
11282 + rc->rc_datalen = outptr;
11283 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
11285 + free_rcom_buffer(rc);
11292 + * Processing for a NEWLOCKIDS message. Called when we get the reply from the
11293 + * new master telling us what the new remote lock IDs are for the remastered
11297 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
11304 + while (offset < len) {
11307 + struct dlm_lkb *lkb;
11309 + if (offset + 8 > len) {
11310 + log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
11311 + "length nodeid=%d offset=%d len=%d",
11312 + nodeid, offset, len);
11316 + remote_id = get_int(buf, &offset);
11317 + local_id = get_int(buf, &offset);
11319 + lkb = find_lock_by_id(ls, local_id);
11321 + lkb->lkb_remid = remote_id;
11322 + have_new_lkid(lkb);
11324 + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
11325 + "nodeid=%d id=%x remid=%x offset=%d len=%d",
11326 + nodeid, local_id, remote_id, offset, len);
11330 + if (recover_list_empty(ls))
11331 + wake_up(&ls->ls_wait_general);
11335 diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
11336 --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
11337 +++ linux-patched/cluster/dlm/rebuild.h 2004-07-13 18:57:22.000000000 +0800
11339 +/******************************************************************************
11340 +*******************************************************************************
11342 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11343 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11345 +** This copyrighted material is made available to anyone wishing to use,
11346 +** modify, copy, or redistribute it subject to the terms and conditions
11347 +** of the GNU General Public License v.2.
11349 +*******************************************************************************
11350 +******************************************************************************/
11352 +#ifndef __REBUILD_DOT_H__
11353 +#define __REBUILD_DOT_H__
11355 +int rebuild_rsbs_send(struct dlm_ls *ls);
11356 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
11357 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
11358 +void rebuild_freemem(struct dlm_ls *ls);
11360 +#endif /* __REBUILD_DOT_H__ */
11361 diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
11362 --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
11363 +++ linux-patched/cluster/dlm/reccomms.c 2004-07-13 18:57:22.000000000 +0800
11365 +/******************************************************************************
11366 +*******************************************************************************
11368 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11369 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11371 +** This copyrighted material is made available to anyone wishing to use,
11372 +** modify, copy, or redistribute it subject to the terms and conditions
11373 +** of the GNU General Public License v.2.
11375 +*******************************************************************************
11376 +******************************************************************************/
11378 +#include "dlm_internal.h"
11379 +#include "lowcomms.h"
11380 +#include "midcomms.h"
11381 +#include "reccomms.h"
11382 +#include "nodes.h"
11383 +#include "lockspace.h"
11384 +#include "recover.h"
11386 +#include "config.h"
11387 +#include "rebuild.h"
11388 +#include "memory.h"
11390 +/* Running on the basis that only a single recovery communication will be done
11391 + * at a time per lockspace */
11393 +static void rcom_process_message(struct dlm_ls * ls, uint32_t nodeid, struct dlm_rcom * rc);
11396 + * Track per-node progress/stats during recovery to help debugging.
11399 +void rcom_log(struct dlm_ls *ls, int nodeid, struct dlm_rcom *rc, int send)
11401 + struct dlm_csb *csb;
11404 + list_for_each_entry(csb, &ls->ls_nodes, list) {
11405 + if (csb->node->nodeid == nodeid) {
11414 + if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
11416 + csb->names_send_count++;
11417 + csb->names_send_msgid = rc->rc_msgid;
11419 + csb->names_recv_count++;
11420 + csb->names_recv_msgid = rc->rc_msgid;
11422 + } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
11424 + csb->locks_send_count++;
11425 + csb->locks_send_msgid = rc->rc_msgid;
11427 + csb->locks_recv_count++;
11428 + csb->locks_recv_msgid = rc->rc_msgid;
11433 +void rcom_log_clear(struct dlm_ls *ls)
11435 + struct dlm_csb *csb;
11437 + list_for_each_entry(csb, &ls->ls_nodes, list) {
11438 + csb->names_send_count = 0;
11439 + csb->names_send_msgid = 0;
11440 + csb->names_recv_count = 0;
11441 + csb->names_recv_msgid = 0;
11442 + csb->locks_send_count = 0;
11443 + csb->locks_send_msgid = 0;
11444 + csb->locks_recv_count = 0;
11445 + csb->locks_recv_msgid = 0;
11449 +static int rcom_response(struct dlm_ls *ls)
11451 + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11455 + * rcom_send_message - send or request recovery data
11456 + * @ls: the lockspace
11457 + * @nodeid: node to which the message is sent
11458 + * @type: type of recovery message
11459 + * @rc: the rc buffer to send
11460 + * @need_reply: wait for reply if this is set
11462 + * Using this interface
11463 + * i) Allocate an rc buffer:
11464 + * rc = allocate_rcom_buffer(ls);
11465 + * ii) Copy data to send beginning at rc->rc_buf:
11466 + * memcpy(rc->rc_buf, mybuf, mylen);
11467 + * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
11468 + * rc->rc_datalen = mylen
11469 + * iv) Submit the rc to this function:
11470 + * rcom_send_message(rc);
11472 + * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct
11473 + * dlm_rcom). If more data must be passed in one send, use
11474 + * rcom_expand_buffer() which incrementally increases the size of the rc buffer
11475 + * by dlm_config.buffer_size bytes.
11477 + * Any data returned for the message (when need_reply is set) will be saved in
11478 + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
11479 + * number of bytes copied into rc->rc_buf.
11481 + * Returns: 0 on success, -EXXX on failure
11484 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
11485 + struct dlm_rcom *rc, int need_reply)
11489 + if (!rc->rc_datalen)
11490 + rc->rc_datalen = 1;
11493 + * Fill in the header.
11496 + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
11497 + rc->rc_header.rh_lockspace = ls->ls_global_id;
11498 + rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1;
11499 + rc->rc_subcmd = type;
11500 + rc->rc_msgid = ++ls->ls_rcom_msgid;
11502 + rcom_log(ls, nodeid, rc, 1);
11505 + * When a reply is received, the reply data goes back into this buffer.
11506 + * Synchronous rcom requests (need_reply=1) are serialised because of
11507 + * the single ls_rcom.
11510 + if (need_reply) {
11511 + down(&ls->ls_rcom_lock);
11512 + ls->ls_rcom = rc;
11516 + * After sending the message we'll wait at the end of this function to
11517 + * get a reply. The READY flag will be set when the reply has been
11518 + * received and requested data has been copied into
11519 + * ls->ls_rcom->rc_buf;
11522 + DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
11525 + * The WAIT bit indicates that we're waiting for and willing to accept a
11526 + * reply. Any replies are ignored unless this bit is set.
11529 + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11532 + * Process the message locally.
11535 + if (nodeid == our_nodeid()) {
11536 + rcom_process_message(ls, nodeid, rc);
11541 + * Send the message.
11544 + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
11546 + error = midcomms_send_message(nodeid, (struct dlm_header *) rc,
11548 + DLM_ASSERT(error >= 0, printk("error = %d\n", error););
11552 + * Wait for a reply. Once a reply is processed from midcomms, the
11553 + * READY bit will be set and we'll be awoken (dlm_wait_function will
11557 + if (need_reply) {
11558 + error = dlm_wait_function(ls, &rcom_response);
11560 + log_debug(ls, "rcom wait error %d", error);
11564 + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11565 + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11568 + up(&ls->ls_rcom_lock);
11574 + * Runs in same context as midcomms.
11577 +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc)
11579 + struct dlm_rcom rc_stack;
11580 + struct dlm_rcom *reply = NULL;
11581 + int status, datalen, maxlen;
11582 + uint32_t r_nodeid, be_nodeid;
11587 + rcom_log(ls, nodeid, rc, 0);
11589 + if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
11590 + log_error(ls, "ignoring recovery message %x from %u",
11591 + rc->rc_subcmd, nodeid);
11595 + switch (rc->rc_subcmd) {
11597 + case RECCOMM_STATUS:
11599 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
11600 + reply = &rc_stack;
11602 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11603 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11604 + reply->rc_subcmd = rc->rc_subcmd;
11605 + reply->rc_msgid = rc->rc_msgid;
11606 + reply->rc_buf[0] = 0;
11608 + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
11609 + reply->rc_buf[0] |= RESDIR_VALID;
11611 + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
11612 + reply->rc_buf[0] |= RESDIR_ALL_VALID;
11614 + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
11615 + reply->rc_buf[0] |= NODES_VALID;
11617 + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
11618 + reply->rc_buf[0] |= NODES_ALL_VALID;
11620 + reply->rc_datalen = 1;
11621 + reply->rc_header.rh_length =
11622 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11624 + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
11627 + case RECCOMM_RECOVERNAMES:
11629 + reply = allocate_rcom_buffer(ls);
11630 + DLM_ASSERT(reply,);
11631 + maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
11633 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11634 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11635 + reply->rc_subcmd = rc->rc_subcmd;
11636 + reply->rc_msgid = rc->rc_msgid;
11639 + * The other node wants a bunch of resource names. The name of
11640 + * the resource to begin with is in rc->rc_buf.
11643 + datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
11644 + reply->rc_buf, maxlen, nodeid);
11646 + reply->rc_datalen = datalen;
11647 + reply->rc_header.rh_length =
11648 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11650 + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
11651 + reply->rc_msgid);
11654 + case RECCOMM_GETMASTER:
11656 + reply = allocate_rcom_buffer(ls);
11657 + DLM_ASSERT(reply,);
11659 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11660 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11661 + reply->rc_subcmd = rc->rc_subcmd;
11662 + reply->rc_msgid = rc->rc_msgid;
11665 + * The other node wants to know the master of a named resource.
11668 + status = dlm_dir_lookup_recovery(ls, nodeid, rc->rc_buf,
11669 + rc->rc_datalen, &r_nodeid);
11670 + if (status != 0) {
11671 + free_rcom_buffer(reply);
11675 + be_nodeid = cpu_to_be32(r_nodeid);
11676 + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
11677 + reply->rc_datalen = sizeof(uint32_t);
11678 + reply->rc_header.rh_length =
11679 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11682 + case RECCOMM_BULKLOOKUP:
11684 + reply = allocate_rcom_buffer(ls);
11685 + DLM_ASSERT(reply,);
11687 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11688 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11689 + reply->rc_subcmd = rc->rc_subcmd;
11690 + reply->rc_msgid = rc->rc_msgid;
11693 + * This is a bulk version of the above and just returns a
11694 + * buffer full of node ids to match the resources
11697 + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
11698 + rc->rc_datalen, reply->rc_buf);
11699 + if (datalen < 0) {
11700 + free_rcom_buffer(reply);
11705 + reply->rc_datalen = datalen;
11706 + reply->rc_header.rh_length =
11707 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11711 + * These RECCOMM messages don't need replies.
11714 + case RECCOMM_NEWLOCKS:
11715 + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11718 + case RECCOMM_NEWLOCKIDS:
11719 + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11722 + case RECCOMM_REMRESDATA:
11723 + remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11727 + DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
11731 + if (nodeid == our_nodeid()) {
11732 + DLM_ASSERT(rc == ls->ls_rcom,);
11733 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11734 + rc->rc_datalen = reply->rc_datalen;
11736 + midcomms_send_message(nodeid,
11737 + (struct dlm_header *) reply,
11741 + if (reply != &rc_stack)
11742 + free_rcom_buffer(reply);
11746 +static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid,
11747 + struct dlm_rcom *reply)
11749 + struct dlm_rcom *rc = ls->ls_rcom;
11751 + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
11752 + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
11756 + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
11757 + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
11758 + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
11762 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11763 + rc->rc_datalen = reply->rc_datalen;
11766 + * Tell the thread waiting in rcom_send_message() that it can go ahead.
11769 + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11770 + wake_up(&ls->ls_wait_general);
11773 +static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid,
11774 + struct dlm_rcom *reply)
11776 + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
11777 + reply->rc_msgid);
11781 + * Runs in same context as midcomms.
11784 +static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid,
11785 + struct dlm_rcom *reply)
11787 + if (dlm_recovery_stopped(ls)) {
11788 + log_error(ls, "ignoring recovery reply %x from %u",
11789 + reply->rc_subcmd, nodeid);
11793 + switch (reply->rc_subcmd) {
11794 + case RECCOMM_GETMASTER:
11795 + process_reply_async(ls, nodeid, reply);
11797 + case RECCOMM_STATUS:
11798 + case RECCOMM_NEWLOCKS:
11799 + case RECCOMM_NEWLOCKIDS:
11800 + case RECCOMM_RECOVERNAMES:
11801 + process_reply_sync(ls, nodeid, reply);
11804 + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
11805 + reply->rc_subcmd, nodeid);
11810 +static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header)
11812 + struct writequeue_entry *wq;
11813 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
11814 + struct dlm_rcom *reply;
11816 + wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL,
11817 + (char **)&reply);
11821 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11822 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11823 + reply->rc_subcmd = rc->rc_subcmd;
11824 + reply->rc_msgid = rc->rc_msgid;
11825 + reply->rc_buf[0] = 0;
11827 + reply->rc_datalen = 1;
11828 + reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11830 + midcomms_send_buffer((struct dlm_header *)reply, wq);
11836 + * Runs in same context as midcomms. Both recovery requests and recovery
11837 + * replies come through this function.
11840 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header)
11842 + struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace);
11843 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
11845 + /* If the lockspace doesn't exist then still send a status message
11846 + back, it's possible that it just doesn't have its global_id
11849 + send_ls_not_ready(nodeid, header);
11853 + switch (header->rh_cmd) {
11854 + case GDLM_REMCMD_RECOVERMESSAGE:
11855 + down_read(&ls->ls_rec_rsblist);
11856 + rcom_process_message(ls, nodeid, rc);
11857 + up_read(&ls->ls_rec_rsblist);
11860 + case GDLM_REMCMD_RECOVERREPLY:
11861 + rcom_process_reply(ls, nodeid, rc);
11865 + DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
11869 diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
11870 --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
11871 +++ linux-patched/cluster/dlm/reccomms.h 2004-07-13 18:57:22.000000000 +0800
11873 +/******************************************************************************
11874 +*******************************************************************************
11876 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11877 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11879 +** This copyrighted material is made available to anyone wishing to use,
11880 +** modify, copy, or redistribute it subject to the terms and conditions
11881 +** of the GNU General Public License v.2.
11883 +*******************************************************************************
11884 +******************************************************************************/
11886 +#ifndef __RECCOMMS_DOT_H__
11887 +#define __RECCOMMS_DOT_H__
11891 +#define RESDIR_VALID (1)
11892 +#define RESDIR_ALL_VALID (2)
11893 +#define NODES_VALID (4)
11894 +#define NODES_ALL_VALID (8)
11896 +#define RECCOMM_STATUS (1)
11897 +#define RECCOMM_RECOVERNAMES (2)
11898 +#define RECCOMM_GETMASTER (3)
11899 +#define RECCOMM_BULKLOOKUP (4)
11900 +#define RECCOMM_NEWLOCKS (5)
11901 +#define RECCOMM_NEWLOCKIDS (6)
11902 +#define RECCOMM_REMRESDATA (7)
11904 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
11905 + struct dlm_rcom *rc, int need_reply);
11906 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header);
11907 +void rcom_log_clear(struct dlm_ls *ls);
11910 diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
11911 --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
11912 +++ linux-patched/cluster/dlm/recover.c 2004-07-13 18:57:22.000000000 +0800
11914 +/******************************************************************************
11915 +*******************************************************************************
11917 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11918 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11920 +** This copyrighted material is made available to anyone wishing to use,
11921 +** modify, copy, or redistribute it subject to the terms and conditions
11922 +** of the GNU General Public License v.2.
11924 +*******************************************************************************
11925 +******************************************************************************/
11927 +#include "dlm_internal.h"
11928 +#include "reccomms.h"
11930 +#include "locking.h"
11932 +#include "lockspace.h"
11934 +#include "nodes.h"
11935 +#include "config.h"
11937 +#include "memory.h"
11940 + * Called in recovery routines to check whether the recovery process has been
11941 + * interrupted/stopped by another transition. A recovery in-process will abort
11942 + * if the lockspace is "stopped" so that a new recovery process can start from
11943 + * the beginning when the lockspace is "started" again.
11946 +int dlm_recovery_stopped(struct dlm_ls *ls)
11948 + return test_bit(LSFL_LS_STOP, &ls->ls_flags);
11951 +static void dlm_wait_timer_fn(unsigned long data)
11953 + struct dlm_ls *ls = (struct dlm_ls *) data;
11955 + wake_up(&ls->ls_wait_general);
11959 + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
11960 + * set due to failure of a node in ls_nodes). When another function thinks it
11961 + * could have completed the waited-on task, they should wake up ls_wait_general
11962 + * to get an immediate response rather than waiting for the timer to detect the
11963 + * result. A timer wakes us up periodically while waiting to see if we should
11964 + * abort due to a node failure.
11967 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls))
11969 + struct timer_list timer;
11972 + init_timer(&timer);
11973 + timer.function = dlm_wait_timer_fn;
11974 + timer.data = (long) ls;
11977 + mod_timer(&timer, jiffies + (5 * HZ));
11979 + wchan_cond_sleep_intr(ls->ls_wait_general,
11981 + !test_bit(LSFL_LS_STOP, &ls->ls_flags));
11983 + if (timer_pending(&timer))
11984 + del_timer(&timer);
11989 + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
11998 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status)
12000 + struct dlm_rcom rc_stack, *rc;
12001 + struct dlm_csb *csb;
12005 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12007 + rc->rc_datalen = 0;
12009 + list_for_each_entry(csb, &ls->ls_nodes, list) {
12011 + error = dlm_recovery_stopped(ls);
12015 + error = rcom_send_message(ls, csb->node->nodeid,
12016 + RECCOMM_STATUS, rc, 1);
12020 + status = rc->rc_buf[0];
12021 + if (status & wait_status)
12024 + set_current_state(TASK_INTERRUPTIBLE);
12025 + schedule_timeout(HZ >> 1);
12034 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status)
12036 + struct dlm_rcom rc_stack, *rc;
12037 + uint32_t nodeid = ls->ls_low_nodeid;
12041 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12043 + rc->rc_datalen = 0;
12046 + error = dlm_recovery_stopped(ls);
12050 + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
12054 + status = rc->rc_buf[0];
12055 + if (status & wait_status)
12058 + set_current_state(TASK_INTERRUPTIBLE);
12059 + schedule_timeout(HZ >> 1);
12067 +static int purge_queue(struct dlm_ls *ls, struct list_head *queue)
12069 + struct dlm_lkb *lkb, *safe;
12070 + struct dlm_rsb *rsb;
12073 + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
12074 + if (!lkb->lkb_nodeid)
12077 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
12079 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
12080 + list_del(&lkb->lkb_statequeue);
12082 + rsb = lkb->lkb_resource;
12083 + lkb->lkb_status = 0;
12085 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
12086 + && &lkb->lkb_duetime)
12087 + remove_from_deadlockqueue(lkb);
12089 + release_lkb(ls, lkb);
12090 + release_rsb(rsb);
12099 + * Go through local restbl and for each rsb we're master of, clear out any
12100 + * lkb's held by departed nodes.
12103 +int restbl_lkb_purge(struct dlm_ls *ls)
12105 + struct list_head *tmp2, *safe2;
12107 + struct dlm_rsb *rootrsb, *safe, *rsb;
12109 + log_all(ls, "purge locks of departed nodes");
12111 + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
12113 + if (rootrsb->res_nodeid)
12116 + hold_rsb(rootrsb);
12117 + down_write(&rootrsb->res_lock);
12119 + /* This traverses the subreslist in reverse order so we purge
12120 + * the children before their parents. */
12122 + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12123 + tmp2 != &rootrsb->res_subreslist;
12124 + tmp2 = safe2, safe2 = safe2->prev) {
12125 + rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist);
12128 + purge_queue(ls, &rsb->res_grantqueue);
12129 + purge_queue(ls, &rsb->res_convertqueue);
12130 + purge_queue(ls, &rsb->res_waitqueue);
12131 + release_rsb(rsb);
12133 + count += purge_queue(ls, &rootrsb->res_grantqueue);
12134 + count += purge_queue(ls, &rootrsb->res_convertqueue);
12135 + count += purge_queue(ls, &rootrsb->res_waitqueue);
12137 + up_write(&rootrsb->res_lock);
12138 + release_rsb(rootrsb);
12141 + log_all(ls, "purged %d locks", count);
12147 + * Grant any locks that have become grantable after a purge
12150 +int restbl_grant_after_purge(struct dlm_ls *ls)
12152 + struct dlm_rsb *root, *rsb, *safe;
12155 + down_write(&ls->ls_gap_rsblist);
12157 + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12158 + /* only the rsb master grants locks */
12159 + if (root->res_nodeid)
12162 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12163 + log_debug(ls, "restbl_grant_after_purge aborted");
12165 + up_write(&ls->ls_gap_rsblist);
12169 + down_write(&root->res_lock);
12170 + grant_pending_locks(root);
12171 + up_write(&root->res_lock);
12173 + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12174 + down_write(&rsb->res_lock);
12175 + grant_pending_locks(rsb);
12176 + up_write(&rsb->res_lock);
12179 + up_write(&ls->ls_gap_rsblist);
12186 + * Set the lock master for all LKBs in a lock queue
12189 +static void set_lock_master(struct list_head *queue, int nodeid)
12191 + struct dlm_lkb *lkb;
12193 + list_for_each_entry(lkb, queue, lkb_statequeue) {
12194 + /* Don't muck around with pre-existing sublocks */
12195 + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12196 + lkb->lkb_nodeid = nodeid;
12200 +static void set_master_lkbs(struct dlm_rsb *rsb)
12202 + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12203 + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12204 + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12208 + * This rsb struct is now the master so it is responsible for keeping the
12209 + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to
12210 + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
12211 + * this rsb in deserialise_lkb.
12214 +static void set_rsb_lvb(struct dlm_rsb *rsb)
12216 + struct dlm_lkb *lkb;
12218 + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12220 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12221 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12222 + (lkb->lkb_grmode > DLM_LOCK_NL))
12224 + if (!rsb->res_lvbptr)
12225 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12227 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12232 + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12234 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12235 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12236 + (lkb->lkb_grmode > DLM_LOCK_NL))
12238 + if (!rsb->res_lvbptr)
12239 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12241 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12248 + * Propagate the new master nodeid to locks, subrsbs, sublocks.
12249 + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12252 +static void set_new_master(struct dlm_rsb *rsb)
12254 + struct dlm_rsb *subrsb;
12256 + down_write(&rsb->res_lock);
12258 + if (rsb->res_nodeid == our_nodeid()) {
12259 + rsb->res_nodeid = 0;
12260 + set_rsb_lvb(rsb);
12263 + set_master_lkbs(rsb);
12265 + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
12266 + subrsb->res_nodeid = rsb->res_nodeid;
12267 + set_master_lkbs(subrsb);
12270 + up_write(&rsb->res_lock);
12272 + set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
12276 + * The recover_list contains all the rsb's for which we've requested the new
12277 + * master nodeid. As replies are returned from the resource directories the
12278 + * rsb's are removed from the list. When the list is empty we're done.
12280 + * The recover_list is later similarly used for all rsb's for which we've sent
12281 + * new lkb's and need to receive new corresponding lkid's.
12284 +int recover_list_empty(struct dlm_ls *ls)
12288 + spin_lock(&ls->ls_recover_list_lock);
12289 + empty = list_empty(&ls->ls_recover_list);
12290 + spin_unlock(&ls->ls_recover_list_lock);
12295 +int recover_list_count(struct dlm_ls *ls)
12299 + spin_lock(&ls->ls_recover_list_lock);
12300 + count = ls->ls_recover_list_count;
12301 + spin_unlock(&ls->ls_recover_list_lock);
12306 +void recover_list_add(struct dlm_rsb *rsb)
12308 + struct dlm_ls *ls = rsb->res_ls;
12310 + spin_lock(&ls->ls_recover_list_lock);
12311 + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
12312 + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
12313 + ls->ls_recover_list_count++;
12316 + spin_unlock(&ls->ls_recover_list_lock);
12319 +void recover_list_del(struct dlm_rsb *rsb)
12321 + struct dlm_ls *ls = rsb->res_ls;
12323 + spin_lock(&ls->ls_recover_list_lock);
12324 + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
12325 + list_del(&rsb->res_recover_list);
12326 + ls->ls_recover_list_count--;
12327 + spin_unlock(&ls->ls_recover_list_lock);
12329 + release_rsb(rsb);
12332 +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid)
12334 + struct dlm_rsb *rsb = NULL;
12336 + spin_lock(&ls->ls_recover_list_lock);
12338 + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
12339 + if (rsb->res_recover_msgid == msgid)
12345 + spin_unlock(&ls->ls_recover_list_lock);
12350 +static void recover_list_clear(struct dlm_ls *ls)
12352 + struct dlm_rsb *rsb;
12355 + spin_lock(&ls->ls_recover_list_lock);
12357 + while (!list_empty(&ls->ls_recover_list)) {
12358 + rsb = list_entry(ls->ls_recover_list.next, struct dlm_rsb,
12359 + res_recover_list);
12360 + list_del(&rsb->res_recover_list);
12361 + ls->ls_recover_list_count--;
12363 + spin_unlock(&ls->ls_recover_list_lock);
12368 +static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc)
12370 + struct dlm_ls *ls = rsb->res_ls;
12371 + uint32_t dir_nodeid, r_nodeid;
12374 + dir_nodeid = get_directory_nodeid(rsb);
12376 + if (dir_nodeid == our_nodeid()) {
12377 + error = dlm_dir_lookup_recovery(ls, dir_nodeid, rsb->res_name,
12378 + rsb->res_length, &r_nodeid);
12382 + rsb->res_nodeid = r_nodeid;
12383 + set_new_master(rsb);
12385 + /* As we are the only thread doing recovery this
12386 + should be safe. if not then we need to use a different
12387 + ID somehow. We must set it in the RSB before rcom_send_msg
12388 + completes cos we may get a reply quite quickly.
12390 + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
12392 + recover_list_add(rsb);
12394 + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
12395 + rc->rc_datalen = rsb->res_length;
12397 + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
12408 + * Go through local root resources and for each rsb which has a master which
12409 + * has departed, get the new master nodeid from the resdir. The resdir will
12410 + * assign mastery to the first node to look up the new master. That means
12411 + * we'll discover in this lookup if we're the new master of any rsb's.
12413 + * We fire off all the resdir requests individually and asynchronously to the
12414 + * correct resdir node. The replies are processed in rsb_master_recv().
12417 +int restbl_rsb_update(struct dlm_ls *ls)
12419 + struct dlm_rsb *rsb, *safe;
12420 + struct dlm_rcom *rc;
12421 + int error = -ENOMEM;
12424 + log_all(ls, "update remastered resources");
12426 + rc = allocate_rcom_buffer(ls);
12430 + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
12431 + if (!rsb->res_nodeid)
12434 + error = dlm_recovery_stopped(ls);
12438 + if (in_nodes_gone(ls, rsb->res_nodeid)) {
12439 + error = rsb_master_lookup(rsb, rc);
12446 + error = dlm_wait_function(ls, &recover_list_empty);
12448 + log_all(ls, "updated %d resources", count);
12451 + free_rcom_buffer(rc);
12457 +int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf,
12458 + int length, int msgid)
12460 + struct dlm_rsb *rsb;
12461 + uint32_t be_nodeid;
12463 + rsb = recover_list_find(ls, msgid);
12465 + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
12469 + memcpy(&be_nodeid, buf, sizeof(uint32_t));
12470 + rsb->res_nodeid = be32_to_cpu(be_nodeid);
12471 + set_new_master(rsb);
12472 + recover_list_del(rsb);
12474 + if (recover_list_empty(ls))
12475 + wake_up(&ls->ls_wait_general);
12482 + * This function not used any longer.
12485 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
12488 + char *inbufptr, *outbufptr;
12491 + * The other node wants nodeids matching the resource names in inbuf.
12492 + * The resource names are packed into inbuf as
12493 + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
12494 + * lenX bytes. Matching nodeids are packed into outbuf in order
12495 + * [nodeid1][nodeid2]...
12498 + inbufptr = inbuf;
12499 + outbufptr = outbuf;
12501 + while (inbufptr < inbuf + inlen) {
12502 + uint32_t r_nodeid, be_nodeid;
12505 + status = dlm_dir_lookup_recovery(ls, nodeid, inbufptr + 1,
12506 + *inbufptr, &r_nodeid);
12510 + inbufptr += *inbufptr + 1;
12512 + be_nodeid = cpu_to_be32(r_nodeid);
12513 + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
12514 + outbufptr += sizeof(uint32_t);
12516 + /* add assertion that outbufptr - outbuf is not > than ... */
12519 + return (outbufptr - outbuf);
12524 diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
12525 --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
12526 +++ linux-patched/cluster/dlm/recover.h 2004-07-13 18:57:22.000000000 +0800
12528 +/******************************************************************************
12529 +*******************************************************************************
12531 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12532 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12534 +** This copyrighted material is made available to anyone wishing to use,
12535 +** modify, copy, or redistribute it subject to the terms and conditions
12536 +** of the GNU General Public License v.2.
12538 +*******************************************************************************
12539 +******************************************************************************/
12541 +#ifndef __RECOVER_DOT_H__
12542 +#define __RECOVER_DOT_H__
12544 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls));
12545 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status);
12546 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status);
12547 +int dlm_recovery_stopped(struct dlm_ls *ls);
12548 +int recover_list_empty(struct dlm_ls *ls);
12549 +int recover_list_count(struct dlm_ls *ls);
12550 +void recover_list_add(struct dlm_rsb *rsb);
12551 +void recover_list_del(struct dlm_rsb *rsb);
12552 +int restbl_lkb_purge(struct dlm_ls *ls);
12553 +void restbl_grant_after_purge(struct dlm_ls *ls);
12554 +int restbl_rsb_update(struct dlm_ls *ls);
12555 +int restbl_rsb_update_recv(struct dlm_ls *ls, int nodeid, char *buf, int len,
12557 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
12560 +#endif /* __RECOVER_DOT_H__ */
12561 diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
12562 --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
12563 +++ linux-patched/cluster/dlm/recoverd.c 2004-07-13 18:57:22.000000000 +0800
12565 +/******************************************************************************
12566 +*******************************************************************************
12568 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12569 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12571 +** This copyrighted material is made available to anyone wishing to use,
12572 +** modify, copy, or redistribute it subject to the terms and conditions
12573 +** of the GNU General Public License v.2.
12575 +*******************************************************************************
12576 +******************************************************************************/
12578 +#include "dlm_internal.h"
12579 +#include "nodes.h"
12582 +#include "recover.h"
12583 +#include "lockspace.h"
12584 +#include "lowcomms.h"
12585 +#include "lockqueue.h"
12587 +#include "rebuild.h"
12590 + * next_move actions
12593 +#define DO_STOP (1)
12594 +#define DO_START (2)
12595 +#define DO_FINISH (3)
12596 +#define DO_FINISH_STOP (4)
12597 +#define DO_FINISH_START (5)
12600 + * recoverd_flags for thread
12603 +#define THREAD_STOP (0)
12606 + * local thread variables
12609 +static unsigned long recoverd_flags;
12610 +static struct completion recoverd_run;
12611 +static wait_queue_head_t recoverd_wait;
12612 +static struct task_struct *recoverd_task;
12615 + * Queue of lockspaces (dlm_recover structs) which need to be
12616 + * started/recovered
12619 +static struct list_head recoverd_start_queue;
12620 +static atomic_t recoverd_start_count;
12622 +extern struct list_head lslist;
12623 +extern spinlock_t lslist_lock;
12625 +void dlm_recoverd_init(void)
12627 + INIT_LIST_HEAD(&recoverd_start_queue);
12628 + atomic_set(&recoverd_start_count, 0);
12630 + init_completion(&recoverd_run);
12631 + init_waitqueue_head(&recoverd_wait);
12632 + memset(&recoverd_flags, 0, sizeof(unsigned long));
12635 +static int enable_locking(struct dlm_ls *ls, int event_id)
12639 + spin_lock(&ls->ls_recover_lock);
12640 + if (ls->ls_last_stop < event_id) {
12641 + set_bit(LSFL_LS_RUN, &ls->ls_flags);
12642 + up_write(&ls->ls_in_recovery);
12645 + log_debug(ls, "enable_locking: abort %d", event_id);
12647 + spin_unlock(&ls->ls_recover_lock);
12651 +static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv)
12655 + log_all(ls, "recover event %u (first)", rv->event_id);
12657 + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
12659 + error = ls_nodes_init(ls, rv);
12661 + log_error(ls, "nodes_init failed %d", error);
12665 + error = dlm_dir_rebuild_local(ls);
12667 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
12671 + error = dlm_dir_rebuild_wait(ls);
12673 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
12677 + log_all(ls, "recover event %u done", rv->event_id);
12678 + kcl_start_done(ls->ls_local_id, rv->event_id);
12685 + * We are given here a new group of nodes which are in the lockspace. We first
12686 + * figure out the differences in ls membership from when we were last running.
12687 + * If nodes from before are gone, then there will be some lock recovery to do.
12688 + * If there are only nodes which have joined, then there's no lock recovery.
12690 + * note: cman requires an rc to finish starting on an revent (where nodes die)
12691 + * before it allows an sevent (where nodes join) to be processed. This means
12692 + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
12696 +static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv)
12698 + int error, neg = 0;
12700 + log_all(ls, "recover event %u", rv->event_id);
12703 + * Add or remove nodes from the lockspace's ls_nodes list.
12706 + error = ls_nodes_reconfig(ls, rv, &neg);
12708 + log_error(ls, "nodes_reconfig failed %d", error);
12713 + * Rebuild our own share of the resdir by collecting from all other
12714 + * nodes rsb name/master pairs for which the name hashes to us.
12717 + error = dlm_dir_rebuild_local(ls);
12719 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
12724 + * Purge resdir-related requests that are being held in requestqueue.
12725 + * All resdir requests from before recovery started are invalid now due
12726 + * to the resdir rebuild and will be resent by the requesting nodes.
12729 + purge_requestqueue(ls);
12730 + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12733 + * Wait for all nodes to complete resdir rebuild.
12736 + error = dlm_dir_rebuild_wait(ls);
12738 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
12743 + * Mark our own lkb's waiting in the lockqueue for remote replies from
12744 + * nodes that are now departed. These will be resent to the new
12745 + * masters in resend_cluster_requests. Also mark resdir lookup
12746 + * requests for resending.
12749 + lockqueue_lkb_mark(ls);
12751 + error = dlm_recovery_stopped(ls);
12757 + * Clear lkb's for departed nodes. This can't fail since it
12758 + * doesn't involve communicating with other nodes.
12761 + down_write(&ls->ls_rec_rsblist);
12762 + restbl_lkb_purge(ls);
12763 + up_write(&ls->ls_rec_rsblist);
12765 + down_read(&ls->ls_rec_rsblist);
12768 + * Get new master id's for rsb's of departed nodes. This fails
12769 + * if we can't communicate with other nodes.
12772 + error = restbl_rsb_update(ls);
12774 + log_error(ls, "restbl_rsb_update failed %d", error);
12779 + * Send our lkb info to new masters. This fails if we can't
12780 + * communicate with a node.
12783 + error = rebuild_rsbs_send(ls);
12785 + log_error(ls, "rebuild_rsbs_send failed %d", error);
12788 + up_read(&ls->ls_rec_rsblist);
12791 + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12793 + log_all(ls, "recover event %u done", rv->event_id);
12794 + kcl_start_done(ls->ls_local_id, rv->event_id);
12798 + up_read(&ls->ls_rec_rsblist);
12800 + log_all(ls, "recover event %d error %d", rv->event_id, error);
12804 +static void clear_finished_nodes(struct dlm_ls *ls, int finish_event)
12806 + struct dlm_csb *csb, *safe;
12808 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) {
12809 + if (csb->gone_event <= finish_event) {
12810 + list_del(&csb->list);
12811 + release_csb(csb);
12817 + * Between calls to this routine for a ls, there can be multiple stop/start
12818 + * events from cman where every start but the latest is cancelled by stops.
12819 + * There can only be a single finish from cman because every finish requires us
12820 + * to call start_done. A single finish event could be followed by multiple
12821 + * stop/start events. This routine takes any combination of events from cman
12822 + * and boils them down to one course of action.
12825 +static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out,
12828 + LIST_HEAD(events);
12829 + unsigned int cmd = 0, stop, start, finish;
12830 + unsigned int last_stop, last_start, last_finish;
12831 + struct dlm_recover *rv = NULL, *start_rv = NULL;
12834 + * Grab the current state of cman/sm events.
12837 + spin_lock(&ls->ls_recover_lock);
12839 + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
12840 + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
12841 + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
12843 + last_stop = ls->ls_last_stop;
12844 + last_start = ls->ls_last_start;
12845 + last_finish = ls->ls_last_finish;
12847 + while (!list_empty(&ls->ls_recover)) {
12848 + rv = list_entry(ls->ls_recover.next, struct dlm_recover, list);
12849 + list_del(&rv->list);
12850 + list_add_tail(&rv->list, &events);
12852 + spin_unlock(&ls->ls_recover_lock);
12854 + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
12855 + last_stop, last_start, last_finish);
12858 + * Toss start events which have since been cancelled.
12861 + while (!list_empty(&events)) {
12862 + DLM_ASSERT(start,);
12863 + rv = list_entry(events.next, struct dlm_recover, list);
12864 + list_del(&rv->list);
12866 + if (rv->event_id <= last_stop) {
12867 + log_debug(ls, "move skip event %u", rv->event_id);
12868 + kfree(rv->nodeids);
12872 + log_debug(ls, "move use event %u", rv->event_id);
12873 + DLM_ASSERT(!start_rv,);
12879 + * Eight possible combinations of events.
12883 + if (!stop && !start && !finish) {
12884 + DLM_ASSERT(!start_rv,);
12890 + if (!stop && !start && finish) {
12891 + DLM_ASSERT(!start_rv,);
12892 + DLM_ASSERT(last_start > last_stop,);
12893 + DLM_ASSERT(last_finish == last_start,);
12895 + *finish_out = last_finish;
12900 + if (!stop && start && !finish) {
12901 + DLM_ASSERT(start_rv,);
12902 + DLM_ASSERT(last_start > last_stop,);
12904 + *rv_out = start_rv;
12909 + if (!stop && start && finish) {
12910 + DLM_ASSERT(0, printk("finish and start with no stop\n"););
12914 + if (stop && !start && !finish) {
12915 + DLM_ASSERT(!start_rv,);
12916 + DLM_ASSERT(last_start == last_stop,);
12922 + if (stop && !start && finish) {
12923 + DLM_ASSERT(!start_rv,);
12924 + DLM_ASSERT(last_finish == last_start,);
12925 + DLM_ASSERT(last_stop == last_start,);
12926 + cmd = DO_FINISH_STOP;
12927 + *finish_out = last_finish;
12932 + if (stop && start && !finish) {
12934 + DLM_ASSERT(last_start > last_stop,);
12936 + *rv_out = start_rv;
12938 + DLM_ASSERT(last_stop == last_start,);
12945 + if (stop && start && finish) {
12947 + DLM_ASSERT(last_start > last_stop,);
12948 + DLM_ASSERT(last_start > last_finish,);
12949 + cmd = DO_FINISH_START;
12950 + *finish_out = last_finish;
12951 + *rv_out = start_rv;
12953 + DLM_ASSERT(last_start == last_stop,);
12954 + DLM_ASSERT(last_start > last_finish,);
12955 + cmd = DO_FINISH_STOP;
12956 + *finish_out = last_finish;
12966 + * This function decides what to do given every combination of current
12967 + * lockspace state and next lockspace state.
12970 +static void do_ls_recovery(struct dlm_ls *ls)
12972 + struct dlm_recover *rv = NULL;
12973 + int error, cur_state, next_state = 0, do_now, finish_event = 0;
12975 + do_now = next_move(ls, &rv, &finish_event);
12979 + cur_state = ls->ls_state;
12982 + DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
12983 + log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
12986 + * LSST_CLEAR - we're not in any recovery state. We can get a stop or
12987 + * a stop and start which equates with a START.
12990 + if (cur_state == LSST_CLEAR) {
12991 + switch (do_now) {
12993 + next_state = LSST_WAIT_START;
12997 + error = ls_reconfig(ls, rv);
12999 + next_state = LSST_WAIT_START;
13001 + next_state = LSST_RECONFIG_DONE;
13004 + case DO_FINISH: /* invalid */
13005 + case DO_FINISH_STOP: /* invalid */
13006 + case DO_FINISH_START: /* invalid */
13014 + * LSST_WAIT_START - we're not running because of getting a stop or
13015 + * failing a start. We wait in this state for another stop/start or
13016 + * just the next start to begin another reconfig attempt.
13019 + if (cur_state == LSST_WAIT_START) {
13020 + switch (do_now) {
13025 + error = ls_reconfig(ls, rv);
13027 + next_state = LSST_WAIT_START;
13029 + next_state = LSST_RECONFIG_DONE;
13032 + case DO_FINISH: /* invalid */
13033 + case DO_FINISH_STOP: /* invalid */
13034 + case DO_FINISH_START: /* invalid */
13042 + * LSST_RECONFIG_DONE - we entered this state after successfully
13043 + * completing ls_reconfig and calling kcl_start_done. We expect to get
13044 + * a finish if everything goes ok. A finish could be followed by stop
13045 + * or stop/start before we get here to check it. Or a finish may never
13046 + * happen, only stop or stop/start.
13049 + if (cur_state == LSST_RECONFIG_DONE) {
13050 + switch (do_now) {
13052 + clear_finished_nodes(ls, finish_event);
13053 + next_state = LSST_CLEAR;
13055 + error = enable_locking(ls, finish_event);
13059 + error = process_requestqueue(ls);
13063 + error = resend_cluster_requests(ls);
13067 + restbl_grant_after_purge(ls);
13069 + log_all(ls, "recover event %u finished", finish_event);
13073 + next_state = LSST_WAIT_START;
13076 + case DO_FINISH_STOP:
13077 + clear_finished_nodes(ls, finish_event);
13078 + next_state = LSST_WAIT_START;
13081 + case DO_FINISH_START:
13082 + clear_finished_nodes(ls, finish_event);
13083 + /* fall into DO_START */
13086 + error = ls_reconfig(ls, rv);
13088 + next_state = LSST_WAIT_START;
13090 + next_state = LSST_RECONFIG_DONE;
13100 + * LSST_INIT - state after ls is created and before it has been
13101 + * started. A start operation will cause the ls to be started for the
13102 + * first time. A failed start will cause it to just wait in INIT for
13103 + * another stop/start.
13106 + if (cur_state == LSST_INIT) {
13107 + switch (do_now) {
13109 + error = ls_first_start(ls, rv);
13111 + next_state = LSST_INIT_DONE;
13117 + case DO_FINISH: /* invalid */
13118 + case DO_FINISH_STOP: /* invalid */
13119 + case DO_FINISH_START: /* invalid */
13127 + * LSST_INIT_DONE - after the first start operation is completed
13128 + * successfully and kcl_start_done() called. If there are no errors, a
13129 + * finish will arrive next and we'll move to LSST_CLEAR.
13132 + if (cur_state == LSST_INIT_DONE) {
13133 + switch (do_now) {
13135 + case DO_FINISH_STOP:
13136 + next_state = LSST_WAIT_START;
13140 + case DO_FINISH_START:
13141 + error = ls_reconfig(ls, rv);
13143 + next_state = LSST_WAIT_START;
13145 + next_state = LSST_RECONFIG_DONE;
13149 + next_state = LSST_CLEAR;
13150 + enable_locking(ls, finish_event);
13151 + log_all(ls, "recover event %u finished", finish_event);
13162 + ls->ls_state = next_state;
13165 + kfree(rv->nodeids);
13170 +static __inline__ struct dlm_ls *get_work(int clear)
13172 + struct dlm_ls *ls;
13174 + spin_lock(&lslist_lock);
13176 + list_for_each_entry(ls, &lslist, ls_list) {
13178 + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
13182 + if (test_bit(LSFL_WORK, &ls->ls_flags))
13189 + spin_unlock(&lslist_lock);
13195 + * Thread which does recovery for all lockspaces.
13198 +static int dlm_recoverd(void *arg)
13200 + struct dlm_ls *ls;
13202 + daemonize("dlm_recoverd");
13203 + recoverd_task = current;
13204 + complete(&recoverd_run);
13206 + while (!test_bit(THREAD_STOP, &recoverd_flags)) {
13207 + wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
13208 + if ((ls = get_work(1)))
13209 + do_ls_recovery(ls);
13212 + complete(&recoverd_run);
13217 + * Mark a specific lockspace as needing work and wake up the thread to do it.
13220 +void dlm_recoverd_kick(struct dlm_ls *ls)
13222 + set_bit(LSFL_WORK, &ls->ls_flags);
13223 + wake_up(&recoverd_wait);
13227 + * Start the recoverd thread when dlm is started (before any lockspaces).
13230 +int dlm_recoverd_start(void)
13234 + clear_bit(THREAD_STOP, &recoverd_flags);
13235 + error = kernel_thread(dlm_recoverd, NULL, 0);
13240 + wait_for_completion(&recoverd_run);
13247 + * Stop the recoverd thread when dlm is shut down (all lockspaces are gone).
13250 +int dlm_recoverd_stop(void)
13252 + set_bit(THREAD_STOP, &recoverd_flags);
13253 + wake_up(&recoverd_wait);
13254 + wait_for_completion(&recoverd_run);
13258 diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
13259 --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
13260 +++ linux-patched/cluster/dlm/recoverd.h 2004-07-13 18:57:22.000000000 +0800
13262 +/******************************************************************************
13263 +*******************************************************************************
13265 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13266 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13268 +** This copyrighted material is made available to anyone wishing to use,
13269 +** modify, copy, or redistribute it subject to the terms and conditions
13270 +** of the GNU General Public License v.2.
13272 +*******************************************************************************
13273 +******************************************************************************/
13275 +#ifndef __RECOVERD_DOT_H__
13276 +#define __RECOVERD_DOT_H__
13278 +void dlm_recoverd_init(void);
13279 +void dlm_recoverd_kick(struct dlm_ls *ls);
13280 +int dlm_recoverd_start(void);
13281 +int dlm_recoverd_stop(void);
13283 +#endif /* __RECOVERD_DOT_H__ */
13284 diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
13285 --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
13286 +++ linux-patched/cluster/dlm/rsb.c 2004-07-13 18:57:22.000000000 +0800
13288 +/******************************************************************************
13289 +*******************************************************************************
13291 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13292 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13294 +** This copyrighted material is made available to anyone wishing to use,
13295 +** modify, copy, or redistribute it subject to the terms and conditions
13296 +** of the GNU General Public License v.2.
13298 +*******************************************************************************
13299 +******************************************************************************/
13301 +#include "dlm_internal.h"
13302 +#include "locking.h"
13303 +#include "memory.h"
13304 +#include "lockqueue.h"
13305 +#include "nodes.h"
13309 +static struct dlm_rsb *search_hashchain(struct list_head *head,
13310 + struct dlm_rsb *parent,
13311 + char *name, int namelen)
13313 + struct dlm_rsb *r;
13315 + list_for_each_entry(r, head, res_hashchain) {
13316 + if ((parent == r->res_parent) && (namelen == r->res_length) &&
13317 + (memcmp(name, r->res_name, namelen) == 0)) {
13318 + atomic_inc(&r->res_ref);
13327 + * A way to arbitrarily hold onto an rsb which we already have a reference to
13328 + * to make sure it doesn't go away. Opposite of release_rsb().
13331 +void hold_rsb(struct dlm_rsb *r)
13333 + atomic_inc(&r->res_ref);
13337 + * release_rsb() - Decrement reference count on rsb struct. Free the rsb
13338 + * struct when there are zero references. Every lkb for the rsb adds a
13339 + * reference. When ref is zero there can be no more lkb's for the rsb, on the
13340 + * queue's or anywhere else.
13343 +void release_rsb(struct dlm_rsb *r)
13345 + struct dlm_ls *ls = r->res_ls;
13346 + int removed = FALSE;
13348 + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
13349 + if (atomic_dec_and_test(&r->res_ref)) {
13350 + DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r););
13351 + DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r););
13352 + DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r););
13354 + list_del(&r->res_hashchain);
13356 + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
13361 + down_read(&ls->ls_gap_rsblist);
13362 + if (r->res_parent)
13363 + list_del(&r->res_subreslist);
13365 + list_del(&r->res_rootlist);
13366 + up_read(&ls->ls_gap_rsblist);
13368 + if (r->res_parent)
13370 + if (r->res_nodeid && r->res_nodeid != -1)
13372 + if (r->res_nodeid == -1 && !test_bit(RESFL_MASTER, &r->res_flags))
13375 + if (get_directory_nodeid(r) != our_nodeid())
13376 + remote_remove_resdata(r->res_ls, get_directory_nodeid(r),
13377 + r->res_name, r->res_length);
13379 + remove_resdata(r->res_ls, our_nodeid(), r->res_name,
13382 + if (r->res_lvbptr)
13383 + free_lvb(r->res_lvbptr);
13388 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb)
13390 + struct dlm_rsb *r = lkb->lkb_resource;
13392 + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
13393 + if (!r->res_parent && atomic_read(&r->res_ref) == 1)
13394 + r->res_nodeid = -1;
13395 + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
13401 + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
13402 + * If the rsb exists, its ref count is incremented by this function. If it
13403 + * doesn't exist, it's created with a ref count of one.
13406 +int find_or_create_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
13407 + int namelen, int create, struct dlm_rsb **rp)
13410 + struct dlm_rsb *r, *tmp;
13411 + int error = -ENOMEM;
13413 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
13415 + bucket = dlm_hash(name, namelen);
13416 + bucket &= (ls->ls_rsbtbl_size - 1);
13418 + read_lock(&ls->ls_rsbtbl[bucket].lock);
13419 + r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, namelen);
13420 + read_unlock(&ls->ls_rsbtbl[bucket].lock);
13429 + r = allocate_rsb(ls, namelen);
13433 + INIT_LIST_HEAD(&r->res_subreslist);
13434 + INIT_LIST_HEAD(&r->res_grantqueue);
13435 + INIT_LIST_HEAD(&r->res_convertqueue);
13436 + INIT_LIST_HEAD(&r->res_waitqueue);
13438 + memcpy(r->res_name, name, namelen);
13439 + r->res_length = namelen;
13441 + init_rwsem(&r->res_lock);
13442 + atomic_set(&r->res_ref, 1);
13443 + r->res_bucket = bucket;
13446 + r->res_parent = parent;
13447 + r->res_depth = parent->res_depth + 1;
13448 + r->res_root = parent->res_root;
13449 + r->res_nodeid = parent->res_nodeid;
13451 + r->res_parent = NULL;
13452 + r->res_depth = 1;
13454 + r->res_nodeid = -1;
13457 + write_lock(&ls->ls_rsbtbl[bucket].lock);
13458 + tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, namelen);
13460 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
13464 + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
13465 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
13467 + down_read(&ls->ls_gap_rsblist);
13469 + list_add_tail(&r->res_subreslist,
13470 + &r->res_root->res_subreslist);
13472 + list_add(&r->res_rootlist, &ls->ls_rootres);
13473 + up_read(&ls->ls_gap_rsblist);
13487 + * Add an LKB to a resource's grant/convert/wait queue, in order
13490 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
13492 + struct dlm_lkb *lkb = NULL;
13494 + list_for_each_entry(lkb, head, lkb_statequeue) {
13495 + if (lkb->lkb_rqmode < mode)
13500 + /* No entries in the queue, we are alone */
13501 + list_add_tail(new, head);
13503 + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
13508 + * The rsb res_lock must be held in write when this function is called.
13511 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13513 + DLM_ASSERT(!lkb->lkb_status,
13517 + lkb->lkb_status = type;
13520 + case GDLM_LKSTS_WAITING:
13521 + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
13524 + case GDLM_LKSTS_GRANTED:
13525 + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
13526 + lkb->lkb_grmode);
13529 + case GDLM_LKSTS_CONVERT:
13530 + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
13531 + list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
13534 + if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
13535 + list_add_tail(&lkb->lkb_statequeue,
13536 + &r->res_convertqueue);
13538 + lkb_add_ordered(&lkb->lkb_statequeue,
13539 + &r->res_convertqueue, lkb->lkb_rqmode);
13547 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13549 + down_write(&r->res_lock);
13550 + lkb_enqueue(r, lkb, type);
13551 + up_write(&r->res_lock);
13555 + * The rsb res_lock must be held in write when this function is called.
13558 +int lkb_dequeue(struct dlm_lkb *lkb)
13560 + int status = lkb->lkb_status;
13565 + lkb->lkb_status = 0;
13566 + list_del(&lkb->lkb_statequeue);
13572 +int res_lkb_dequeue(struct dlm_lkb *lkb)
13576 + down_write(&lkb->lkb_resource->res_lock);
13577 + status = lkb_dequeue(lkb);
13578 + up_write(&lkb->lkb_resource->res_lock);
13584 + * The rsb res_lock must be held in write when this function is called.
13587 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13591 + status = lkb_dequeue(lkb);
13592 + lkb_enqueue(r, lkb, type);
13597 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13601 + down_write(&r->res_lock);
13602 + status = lkb_swqueue(r, lkb, type);
13603 + up_write(&r->res_lock);
13607 diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
13608 --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
13609 +++ linux-patched/cluster/dlm/rsb.h 2004-07-13 18:57:22.000000000 +0800
13611 +/******************************************************************************
13612 +*******************************************************************************
13614 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13615 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13617 +** This copyrighted material is made available to anyone wishing to use,
13618 +** modify, copy, or redistribute it subject to the terms and conditions
13619 +** of the GNU General Public License v.2.
13621 +*******************************************************************************
13622 +******************************************************************************/
13624 +#ifndef __RSB_DOT_H__
13625 +#define __RSB_DOT_H__
13627 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
13628 +void release_rsb(struct dlm_rsb *r);
13629 +void hold_rsb(struct dlm_rsb *r);
13630 +int find_or_create_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
13631 + int namelen, int create, struct dlm_rsb **rp);
13632 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb);
13633 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13634 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13635 +int lkb_dequeue(struct dlm_lkb *lkb);
13636 +int res_lkb_dequeue(struct dlm_lkb *lkb);
13637 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13638 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13640 +#endif /* __RSB_DOT_H__ */
13641 diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
13642 --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
13643 +++ linux-patched/cluster/dlm/util.c 2004-07-13 18:57:22.000000000 +0800
13645 +/******************************************************************************
13646 +*******************************************************************************
13648 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13649 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13651 +** This copyrighted material is made available to anyone wishing to use,
13652 +** modify, copy, or redistribute it subject to the terms and conditions
13653 +** of the GNU General Public License v.2.
13655 +*******************************************************************************
13656 +******************************************************************************/
13658 +#include "dlm_internal.h"
13660 +static const uint32_t crc_32_tab[] = {
13661 + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
13662 + 0xe963a535, 0x9e6495a3,
13663 + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
13664 + 0xe7b82d07, 0x90bf1d91,
13665 + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
13666 + 0xf4d4b551, 0x83d385c7,
13667 + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
13668 + 0xfa0f3d63, 0x8d080df5,
13669 + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
13670 + 0xd20d85fd, 0xa50ab56b,
13671 + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
13672 + 0xdcd60dcf, 0xabd13d59,
13673 + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
13674 + 0xcfba9599, 0xb8bda50f,
13675 + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
13676 + 0xc1611dab, 0xb6662d3d,
13677 + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
13678 + 0x9fbfe4a5, 0xe8b8d433,
13679 + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
13680 + 0x91646c97, 0xe6635c01,
13681 + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
13682 + 0x8208f4c1, 0xf50fc457,
13683 + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
13684 + 0x8cd37cf3, 0xfbd44c65,
13685 + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
13686 + 0xa4d1c46d, 0xd3d6f4fb,
13687 + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
13688 + 0xaa0a4c5f, 0xdd0d7cc9,
13689 + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
13690 + 0xb966d409, 0xce61e49f,
13691 + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
13692 + 0xb7bd5c3b, 0xc0ba6cad,
13693 + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
13694 + 0x04db2615, 0x73dc1683,
13695 + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
13696 + 0x0a00ae27, 0x7d079eb1,
13697 + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
13698 + 0x196c3671, 0x6e6b06e7,
13699 + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
13700 + 0x17b7be43, 0x60b08ed5,
13701 + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
13702 + 0x3fb506dd, 0x48b2364b,
13703 + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
13704 + 0x316e8eef, 0x4669be79,
13705 + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
13706 + 0x220216b9, 0x5505262f,
13707 + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
13708 + 0x2cd99e8b, 0x5bdeae1d,
13709 + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
13710 + 0x72076785, 0x05005713,
13711 + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
13712 + 0x7cdcefb7, 0x0bdbdf21,
13713 + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
13714 + 0x6fb077e1, 0x18b74777,
13715 + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
13716 + 0x616bffd3, 0x166ccf45,
13717 + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
13718 + 0x4969474d, 0x3e6e77db,
13719 + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
13720 + 0x47b2cf7f, 0x30b5ffe9,
13721 + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
13722 + 0x54de5729, 0x23d967bf,
13723 + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
13724 + 0x5a05df1b, 0x2d02ef8d
13728 + * dlm_hash - hash an array of data
13729 + * @data: the data to be hashed
13730 + * @len: the length of data to be hashed
13732 + * Copied from GFS.
13734 + * Take some data and convert it to a 32-bit hash.
13736 + * The hash function is a 32-bit CRC of the data. The algorithm uses
13737 + * the crc_32_tab table above.
13739 + * This may not be the fastest hash function, but it does a fair bit better
13740 + * at providing uniform results than the others I've looked at. That's
13741 + * really important for efficient directories.
13743 + * Returns: the hash
13746 +uint32_t dlm_hash(const char *data, int len)
13748 + uint32_t hash = 0xFFFFFFFF;
13750 + for (; len--; data++)
13751 + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
13758 +uint32_t dlm_next_power2(uint32_t val)
13762 + for (x = 1; x < val; x <<= 1) ;
13767 +void print_lkb(struct dlm_lkb *lkb)
13769 + printk("dlm: lkb\n"
13786 + lkb->lkb_lockqueue_state,
13787 + lkb->lkb_lockqueue_flags);
13790 +void print_rsb(struct dlm_rsb *r)
13792 + printk("dlm: rsb\n"
13798 + atomic_read(&r->res_ref));
13801 +void print_request(struct dlm_request *req)
13803 + printk("dlm: request\n"
13810 + req->rr_header.rh_cmd,
13811 + req->rr_header.rh_lkid,
13818 +void print_reply(struct dlm_reply *rp)
13820 + printk("dlm: reply\n"
13827 + rp->rl_header.rh_cmd,
13828 + rp->rl_header.rh_lkid,
13829 + rp->rl_lockstate,
13835 diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
13836 --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
13837 +++ linux-patched/cluster/dlm/util.h 2004-07-13 18:57:22.000000000 +0800
13839 +/******************************************************************************
13840 +*******************************************************************************
13842 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13843 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13845 +** This copyrighted material is made available to anyone wishing to use,
13846 +** modify, copy, or redistribute it subject to the terms and conditions
13847 +** of the GNU General Public License v.2.
13849 +*******************************************************************************
13850 +******************************************************************************/
13852 +#ifndef __UTIL_DOT_H__
13853 +#define __UTIL_DOT_H__
13855 +uint32_t dlm_hash(const char *data, int len);
13856 +uint32_t dlm_next_power2(uint32_t val);
13858 +void print_lkb(struct dlm_lkb *lkb);
13859 +void print_rsb(struct dlm_rsb *r);
13860 +void print_request(struct dlm_request *req);
13861 +void print_reply(struct dlm_reply *rp);
13864 diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
13865 --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
13866 +++ linux-patched/include/cluster/dlm.h 2004-07-13 18:57:22.000000000 +0800
13868 +/******************************************************************************
13869 +*******************************************************************************
13871 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13872 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13874 +** This copyrighted material is made available to anyone wishing to use,
13875 +** modify, copy, or redistribute it subject to the terms and conditions
13876 +** of the GNU General Public License v.2.
13878 +*******************************************************************************
13879 +******************************************************************************/
13881 +#ifndef __DLM_DOT_H__
13882 +#define __DLM_DOT_H__
13885 + * Interface to DLM - routines and structures to use DLM lockspaces.
13892 +#define DLM_LOCK_IV (-1) /* invalid */
13893 +#define DLM_LOCK_NL (0) /* null */
13894 +#define DLM_LOCK_CR (1) /* concurrent read */
13895 +#define DLM_LOCK_CW (2) /* concurrent write */
13896 +#define DLM_LOCK_PR (3) /* protected read */
13897 +#define DLM_LOCK_PW (4) /* protected write */
13898 +#define DLM_LOCK_EX (5) /* exclusive */
13901 + * Maximum size in bytes of a dlm_lock name
13904 +#define DLM_RESNAME_MAXLEN (64)
13907 + * Size in bytes of Lock Value Block
13910 +#define DLM_LVB_LEN (32)
13913 + * Flags to dlm_new_lockspace
13915 + * DLM_LSF_NOTIMERS
13917 + * Do not subject locks in this lockspace to time-outs.
13919 + * DLM_LSF_NOCONVGRANT
13921 + * Do not grant new locks unless the conversion queue is empty.
13925 +#define DLM_LSF_NOTIMERS (1)
13926 +#define DLM_LSF_NOCONVGRANT (2)
13929 + * Flags to dlm_lock
13931 + * DLM_LKF_NOQUEUE
13933 + * Do not queue the lock request on the wait queue if it cannot be granted
13934 + * immediately. If the lock cannot be granted because of this flag, DLM will
13935 + * either return -EAGAIN from the dlm_lock call or will return 0 from
13936 + * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
13938 + * DLM_LKF_CONVERT
13940 + * Indicates a lock conversion request. For conversions the name and namelen
13941 + * are ignored and the lock ID in the LKSB is used to identify the lock.
13945 + * Requests DLM to return the current contents of the lock value block in the
13946 + * lock status block. When this flag is set in a lock conversion from PW or EX
13947 + * modes, DLM assigns the value specified in the lock status block to the lock
13948 + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
13949 + * containing application-specific information.
13953 + * Force a conversion lock request to the back of the convert queue. All other
13954 + * conversion requests ahead of it must be granted before it can be granted.
13955 + * This enforces a FIFO ordering on the convert queue. When this flag is set,
13956 + * indefinite postponement is averted. This flag is allowed only when
13957 + * converting a lock to a more restrictive mode.
13961 + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
13962 + * previously granted mode.
13964 + * DLM_LKF_IVVALBLK
13966 + * Invalidate/clear the lock value block.
13968 + * DLM_LKF_CONVDEADLK
13970 + * The granted mode of a lock being converted (from a non-NL mode) can be
13971 + * changed to NL in the process of acquiring the requested mode to avoid
13972 + * conversion deadlock.
13974 + * DLM_LKF_PERSISTENT
13976 + * Only relevant to locks originating in userspace. Signals to the ioctl.c code
13977 + * that this lock should not be unlocked when the process exits.
13979 + * DLM_LKF_NODLCKWT
13981 + * This lock is not to be checked for conversion deadlocks.
13983 + * DLM_LKF_NODLCKBLK
13985 + * not yet implemented
13987 + * DLM_LKF_EXPEDITE
13989 + * If this lock conversion cannot be granted immediately it is to go to the
13990 + * head of the conversion queue regardless of its requested lock mode.
13992 + * DLM_LKF_NOQUEUEBAST
13994 + * Send blocking ASTs before returning -EAGAIN to the caller. It is only
13995 + * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
13996 + * NOQUEUE requests otherwise.
14000 +#define DLM_LKF_NOQUEUE (0x00000001)
14001 +#define DLM_LKF_CANCEL (0x00000002)
14002 +#define DLM_LKF_CONVERT (0x00000004)
14003 +#define DLM_LKF_VALBLK (0x00000008)
14004 +#define DLM_LKF_QUECVT (0x00000010)
14005 +#define DLM_LKF_IVVALBLK (0x00000020)
14006 +#define DLM_LKF_CONVDEADLK (0x00000040)
14007 +#define DLM_LKF_PERSISTENT (0x00000080)
14008 +#define DLM_LKF_NODLCKWT (0x00000100)
14009 +#define DLM_LKF_NODLCKBLK (0x00000200)
14010 +#define DLM_LKF_EXPEDITE (0x00000400)
14011 +#define DLM_LKF_NOQUEUEBAST (0x00000800)
14014 + * Some return codes that are not in errno.h
14017 +#define DLM_ECANCEL (0x10001)
14018 +#define DLM_EUNLOCK (0x10002)
14020 +typedef void dlm_lockspace_t;
14023 + * Lock range structure
14026 +struct dlm_range {
14027 + uint64_t ra_start;
14032 + * Lock status block
14034 + * Use this structure to specify the contents of the lock value block. For a
14035 + * conversion request, this structure is used to specify the lock ID of the
14036 + * lock. DLM writes the status of the lock request and the lock ID assigned
14037 + * to the request in the lock status block.
14039 + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
14040 + * It is available when dlm_lock returns.
14042 + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
14043 + * shown for the DLM_LKF_VALBLK flag.
14045 + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
14046 + * it was first demoted to NL to avoid conversion deadlock.
14048 + * sb_status: the returned status of the lock request set prior to AST
14049 + * execution. Possible return values:
14051 + * 0 if lock request was successful
14052 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14053 + * -ENOMEM if there is no memory to process request
14054 + * -EINVAL if there are invalid parameters
14055 + * -DLM_EUNLOCK if unlock request was successful
14059 +#define DLM_SBF_DEMOTED (0x01)
14063 + uint32_t sb_lkid;
14065 + char * sb_lvbptr;
14069 + * These defines are the bits that make up the
14073 +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
14074 + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */
14075 +#define DLM_LOCK_THIS 0x0007
14076 +#define DLM_QUERY_MODE_MASK 0x0007
14078 +/* Bits 3, 4, 5 bitmap of queue(s) to query */
14079 +#define DLM_QUERY_QUEUE_WAIT 0x0008
14080 +#define DLM_QUERY_QUEUE_CONVERT 0x0010
14081 +#define DLM_QUERY_QUEUE_GRANT 0x0020
14082 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
14083 +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
14085 +/* Bit 6, Return only the information that can be established without a network
14086 + * round-trip. The caller must be aware of the implications of this. Useful for
14087 + * just getting the master node id or resource name. */
14088 +#define DLM_QUERY_LOCAL 0x0040
14090 +/* Bits 8 up, query type */
14091 +#define DLM_QUERY_LOCKS_HIGHER 0x0100
14092 +#define DLM_QUERY_LOCKS_LOWER 0x0200
14093 +#define DLM_QUERY_LOCKS_EQUAL 0x0300
14094 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400
14095 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
14096 +#define DLM_QUERY_LOCKS_ALL 0x0600
14097 +#define DLM_QUERY_MASK 0x0F00
14099 +/* GRMODE is the default for mode comparisons,
14100 + RQMODE might also be handy */
14101 +#define DLM_QUERY_GRMODE 0x0000
14102 +#define DLM_QUERY_RQMODE 0x1000
14104 +/* Structures passed into and out of the query */
14106 +struct dlm_lockinfo {
14107 + int lki_lkid; /* Lock ID on originating node */
14108 + int lki_mstlkid; /* Lock ID on master node */
14110 + int lki_node; /* Originating node (not master) */
14111 + uint8_t lki_state; /* Queue the lock is on */
14112 + uint8_t lki_grmode; /* Granted mode */
14113 + uint8_t lki_rqmode; /* Requested mode */
14114 + struct dlm_range lki_grrange; /* Granted range, if applicable */
14115 + struct dlm_range lki_rqrange; /* Requested range, if applicable */
14118 +struct dlm_resinfo {
14120 + int rsi_grantcount; /* No. of nodes on grant queue */
14121 + int rsi_convcount; /* No. of nodes on convert queue */
14122 + int rsi_waitcount; /* No. of nodes on wait queue */
14123 + int rsi_masternode; /* Master for this resource */
14124 + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
14125 + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable
14129 +struct dlm_queryinfo {
14130 + struct dlm_resinfo *gqi_resinfo;
14131 + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
14133 + int gqi_locksize; /* input */
14134 + int gqi_lockcount; /* output */
14141 + * Starts and initializes DLM threads and structures. Creation of the first
14142 + * lockspace will call this if it has not been called already.
14144 + * Returns: 0 if successful, -EXXX on error
14147 +int dlm_init(void);
14152 + * Stops DLM threads.
14154 + * Returns: 0 if successful, -EXXX on error
14157 +int dlm_release(void);
14160 + * dlm_new_lockspace
14162 + * Starts a lockspace with the given name. If the named lockspace exists in
14163 + * the cluster, the calling node joins it.
14166 +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
14170 + * dlm_release_lockspace
14172 + * Stop a lockspace.
14175 +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14180 + * Make an asynchronous request to acquire or convert a lock on a named
14183 + * lockspace: context for the request
14184 + * mode: the requested mode of the lock (DLM_LOCK_)
14185 + * lksb: lock status block for input and async return values
14186 + * flags: input flags (DLM_LKF_)
14187 + * name: name of the resource to lock, can be binary
14188 + * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN)
14189 + * parent: the lock ID of a parent lock or 0 if none
14190 + * lockast: function DLM executes when it completes processing the request
14191 + * astarg: argument passed to lockast and bast functions
14192 + * bast: function DLM executes when this lock later blocks another request
14195 + * 0 if request is successfully queued for processing
14196 + * -EINVAL if any input parameters are invalid
14197 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14198 + * -ENOMEM if there is no memory to process request
14199 + * -ENOTCONN if there is a communication error
14201 + * If the call to dlm_lock returns an error then the operation has failed and
14202 + * the AST routine will not be called. If dlm_lock returns 0 it is still
14203 + * possible that the lock operation will fail. The AST routine will be called
14204 + * when the locking is complete and the status is returned in the lksb.
14206 + * If the AST routines or parameter are passed to a conversion operation then
14207 + * they will overwrite those values that were passed to a previous dlm_lock
14210 + * AST routines should not block (at least not for long), but may make
14211 + * any locking calls they please.
14214 +int dlm_lock(dlm_lockspace_t *lockspace,
14216 + struct dlm_lksb *lksb,
14219 + unsigned int namelen,
14221 + void (*lockast) (void *astarg),
14223 + void (*bast) (void *astarg, int mode),
14224 + struct dlm_range *range);
14229 + * Asynchronously release a lock on a resource. The AST routine is called
14230 + * when the resource is successfully unlocked.
14232 + * lockspace: context for the request
14233 + * lkid: the lock ID as returned in the lksb
14234 + * flags: input flags (DLM_LKF_)
14235 + * lksb: if NULL the lksb parameter passed to last lock request is used
14236 + * astarg: if NULL, astarg in last lock request is used
14239 + * 0 if request is successfully queued for processing
14240 + * -EINVAL if any input parameters are invalid
14241 + * -ENOTEMPTY if the lock still has sublocks
14242 + * -EBUSY if the lock is waiting for a remote lock operation
14243 + * -ENOTCONN if there is a communication error
14246 +extern int dlm_unlock(dlm_lockspace_t *lockspace,
14249 + struct dlm_lksb *lksb,
14252 +/* Query interface
14254 + * Query the other holders of a resource, given a known lock ID
14256 + * lockspace: context for the request
14257 + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
14258 + * on the resource. sb_status will contain the status
14259 + * of the request on completion.
14260 + * query: query bitmap see DLM_QUERY_* above
14261 + * qinfo: pointer to dlm_queryinfo structure
14262 + * ast_routine: AST routine to call on completion
14263 + * astarg:        argument to AST routine. It is "traditional"
14264 + * to put the qinfo pointer into lksb->sb_lvbptr
14265 + * and pass the lksb in here.
14267 +extern int dlm_query(dlm_lockspace_t *lockspace,
14268 + struct dlm_lksb *lksb,
14270 + struct dlm_queryinfo *qinfo,
14271 + void (ast_routine(void *)),
14275 +void dlm_debug_dump(void);
14277 +#endif /* __KERNEL__ */
14279 +#endif /* __DLM_DOT_H__ */
14280 diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
14281 --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
14282 +++ linux-patched/include/cluster/dlm_device.h 2004-07-13 18:57:22.000000000 +0800
14284 +/******************************************************************************
14285 +*******************************************************************************
14287 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14288 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14290 +** This copyrighted material is made available to anyone wishing to use,
14291 +** modify, copy, or redistribute it subject to the terms and conditions
14292 +** of the GNU General Public License v.2.
14294 +*******************************************************************************
14295 +******************************************************************************/
14297 +/* This is the device interface for dlm, most users will use a library
14301 +/* Version of the device interface */
14302 +#define DLM_DEVICE_VERSION_MAJOR 2
14303 +#define DLM_DEVICE_VERSION_MINOR 0
14304 +#define DLM_DEVICE_VERSION_PATCH 0
14306 +/* struct passed to the lock write */
14307 +struct dlm_lock_params {
14308 + uint32_t version[3];
14314 + struct dlm_range range;
14319 + struct dlm_lksb *lksb;
14324 +/* struct read from the "device" fd,
14325 + consists mainly of userspace pointers for the library to use */
14326 +struct dlm_lock_result {
14329 + void (*astaddr)(void *astparam);
14330 + struct dlm_lksb *user_lksb;
14331 + struct dlm_lksb lksb; /* But this has real data in it */
14332 + uint8_t bast_mode; /* Not yet used */
14335 +/* commands passed to the device */
14336 +#define DLM_USER_LOCK 1
14337 +#define DLM_USER_UNLOCK 2
14338 +#define DLM_USER_QUERY 3
14340 +/* Arbitrary length restriction */
14341 +#define MAX_LS_NAME_LEN 64
14343 +/* ioctls on the device */
14344 +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
14345 +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
14346 +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)