1 # Add DLM to the build system
2 diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
3 --- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
4 +++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
5 @@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
10 + tristate "Distributed Lock Manager"
13 + A fully distributed lock manager, providing cluster-wide locking services
14 + and protected lock namespaces for kernel and userland applications.
16 +config CLUSTER_DLM_PROCLOCKS
17 + boolean "/proc/locks support for DLM"
18 + depends on CLUSTER_DLM
21 + If this option is enabled a file will appear in /proc/cluster/dlm_locks.
22 + Write into this "file" the name of a lockspace known to the DLM and then
23 + read out a list of all the resources and locks in that lockspace that are
24 + known to the local node. Note that because the DLM is distributed this may not
25 + be the full lock picture.
28 diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
29 --- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
30 +++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
34 obj-$(CONFIG_CLUSTER) += cman/
35 +obj-$(CONFIG_CLUSTER_DLM) += dlm/
36 diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37 --- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38 +++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
62 +obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63 diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64 --- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65 +++ linux-patched/cluster/dlm/ast.c 2004-06-25 18:31:07.000000000 +0800
67 +/******************************************************************************
68 +*******************************************************************************
70 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
73 +** This copyrighted material is made available to anyone wishing to use,
74 +** modify, copy, or redistribute it subject to the terms and conditions
75 +** of the GNU General Public License v.2.
77 +*******************************************************************************
78 +******************************************************************************/
81 + * This delivers ASTs and checks for dead remote requests and deadlocks.
84 +#include <linux/timer.h>
86 +#include "dlm_internal.h"
88 +#include "lockqueue.h"
92 +#include "lowcomms.h"
93 +#include "midcomms.h"
98 +/* Wake up flags for astd */
99 +#define GDLMD_WAKE_ASTS 1
100 +#define GDLMD_WAKE_TIMER 2
102 +static struct list_head _deadlockqueue;
103 +static struct semaphore _deadlockqueue_lock;
104 +static struct list_head _lockqueue;
105 +static struct semaphore _lockqueue_lock;
106 +static struct timer_list _lockqueue_timer;
107 +static struct list_head _ast_queue;
108 +static struct semaphore _ast_queue_lock;
109 +static wait_queue_head_t _astd_waitchan;
110 +static atomic_t _astd_running;
111 +static long _astd_pid;
112 +static unsigned long _astd_wakeflags;
113 +static struct completion _astd_done;
115 +void add_to_lockqueue(gd_lkb_t *lkb)
117 + /* Time stamp the entry so we know if it's been waiting too long */
118 + lkb->lkb_lockqueue_time = jiffies;
120 + down(&_lockqueue_lock);
121 + list_add(&lkb->lkb_lockqueue, &_lockqueue);
122 + up(&_lockqueue_lock);
125 +void remove_from_lockqueue(gd_lkb_t *lkb)
127 + down(&_lockqueue_lock);
128 + list_del(&lkb->lkb_lockqueue);
129 + up(&_lockqueue_lock);
132 +void add_to_deadlockqueue(gd_lkb_t *lkb)
134 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
136 + lkb->lkb_duetime = jiffies;
137 + down(&_deadlockqueue_lock);
138 + list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
139 + up(&_deadlockqueue_lock);
142 +void remove_from_deadlockqueue(gd_lkb_t *lkb)
144 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
147 + down(&_deadlockqueue_lock);
148 + list_del(&lkb->lkb_deadlockq);
149 + up(&_deadlockqueue_lock);
151 + /* Invalidate the due time */
152 + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
155 +void remove_from_astqueue(gd_lkb_t *lkb)
157 + down(&_ast_queue_lock);
158 + if (lkb->lkb_asts_to_deliver)
159 + list_del(&lkb->lkb_astqueue);
160 + lkb->lkb_asts_to_deliver = 0;
161 + up(&_ast_queue_lock);
165 + * Actually deliver an AST to a user. The caller MUST hold the ast queue lock
166 + * and we unlock it for the duration of the user call, otherwise things can
170 +static void deliver_ast(gd_lkb_t *lkb, gd_ast_type_t astt)
172 + void (*cast) (long param) = lkb->lkb_astaddr;
173 + void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
175 + up(&_ast_queue_lock);
177 + if (cast && (astt == GDLM_QUEUE_COMPAST))
178 + cast(lkb->lkb_astparam);
180 + else if (bast && (astt == GDLM_QUEUE_BLKAST)
181 + && (lkb->lkb_status == GDLM_LKSTS_GRANTED))
182 + bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
185 + * Remove LKB if requested. It is up to the caller to remove the LKB
186 + * from any resource queue it may be on.
188 + * NOTE: we check lkb_asts_to_deliver here in case an ast for us was
189 + * queued during the AST delivery itself (eg a user called dlm_unlock
190 + * in the AST routine!)
193 + if (lkb->lkb_flags & GDLM_LKFLG_DELAST && astt == GDLM_QUEUE_COMPAST &&
194 + lkb->lkb_asts_to_deliver == 0) {
195 + gd_res_t *rsb = lkb->lkb_resource;
196 + struct rw_semaphore *in_recovery = &rsb->res_ls->ls_in_recovery;
198 + down_read(in_recovery);
199 + release_lkb(rsb->res_ls, lkb);
201 + up_read(in_recovery);
204 + /* This queue can get very big so we schedule here to give the rest of
205 + * the cluster a chance to do some work. */
208 + down(&_ast_queue_lock);
212 + * Queue an AST for delivery, this will only deal with
213 + * kernel ASTs, usermode API will piggyback on top of this.
215 + * This can be called in either the user or DLM context.
216 + * ASTs are queued EVEN IF we are already running in gdlm_astd
217 + * context as we don't know what other locks are held (eg we could
218 + * be being called from a lock operation that was called from
220 + * If the AST is to be queued remotely then a message is sent to
221 + * the target system via midcomms.
224 +void queue_ast(gd_lkb_t *lkb, gd_ast_type_t astt, uint8_t rqmode)
226 + struct gd_remlockrequest req;
228 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
230 + * Send a message to have an ast queued remotely. Note: we do
231 + * not send remote completion asts, they are handled as part of
232 + * remote lock granting.
235 + if (astt == GDLM_QUEUE_BLKAST) {
236 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
237 + req.rr_header.rh_length = sizeof(req);
238 + req.rr_header.rh_flags = 0;
239 + req.rr_header.rh_lkid = lkb->lkb_id;
240 + req.rr_header.rh_lockspace =
241 + lkb->lkb_resource->res_ls->ls_global_id;
242 + req.rr_status = lkb->lkb_retstatus;
243 + req.rr_remlkid = lkb->lkb_remid;
244 + req.rr_rqmode = rqmode;
246 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
247 + lkb->lkb_resource->res_ls->ls_allocation);
249 + } else if (lkb->lkb_retstatus == -EDEADLOCK) {
251 + * We only queue remote Completion ASTs here for error
252 + * completions that happen out of band.
253 + * DEADLOCK is one such.
256 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
257 + req.rr_header.rh_length = sizeof(req);
258 + req.rr_header.rh_flags = 0;
259 + req.rr_header.rh_lkid = lkb->lkb_id;
260 + req.rr_header.rh_lockspace =
261 + lkb->lkb_resource->res_ls->ls_global_id;
262 + req.rr_status = lkb->lkb_retstatus;
263 + req.rr_remlkid = lkb->lkb_remid;
264 + req.rr_rqmode = rqmode;
266 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
267 + lkb->lkb_resource->res_ls->ls_allocation);
271 + * Prepare info which will be returned in ast/bast.
274 + if (astt == GDLM_QUEUE_BLKAST) {
275 + lkb->lkb_bastmode = rqmode;
277 + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
279 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
280 + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
282 + lkb->lkb_lksb->sb_flags = 0;
286 + * Queue ast/bast or deliver directly. astd can deliver ASTs
287 + * during deadlock detection or lock timeouts.
290 + down(&_ast_queue_lock);
292 + if (!lkb->lkb_asts_to_deliver)
293 + list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
294 + lkb->lkb_asts_to_deliver |= astt;
296 + up(&_ast_queue_lock);
298 + /* It is the responsibility of the caller to call wake_astd()
299 + * after it has finished other locking operations that request
300 + * the ASTs to be delivered after */
305 + * Process any LKBs on the AST queue. They were queued in queue_ast().
308 +static void process_asts(void)
310 + gd_lkb_t *lkb, *safe;
311 + uint32_t to_deliver;
313 + down(&_ast_queue_lock);
315 + list_for_each_entry_safe(lkb, safe, &_ast_queue, lkb_astqueue) {
317 + /* The lkb can be placed back on _ast_queue as soon as
318 + * _ast_queue_lock is released. */
320 + to_deliver = lkb->lkb_asts_to_deliver;
321 + lkb->lkb_asts_to_deliver = 0;
322 + list_del(&lkb->lkb_astqueue);
324 + if ((to_deliver & GDLM_QUEUE_COMPAST))
325 + deliver_ast(lkb, GDLM_QUEUE_COMPAST);
327 + if ((to_deliver & GDLM_QUEUE_BLKAST))
328 + deliver_ast(lkb, GDLM_QUEUE_BLKAST);
330 + up(&_ast_queue_lock);
333 +void lockqueue_lkb_mark(gd_ls_t *ls)
335 + gd_lkb_t *lkb, *safe;
338 + log_all(ls, "mark waiting requests");
340 + down(&_lockqueue_lock);
342 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
344 + if (lkb->lkb_resource->res_ls != ls)
348 + * These lkb's are new and the master is being looked up. Mark
349 + * the lkb request to be resent. Even if the destination node
350 + * for the request is still living and has our request, it will
351 + * purge all resdir requests in purge_requestqueue. If there's
352 + * a reply to the LOOKUP request in our requestqueue (the reply
353 + * arrived after ls_stop), it is invalid and will be discarded
354 + * in purge_requestqueue, too.
357 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
358 + GDLM_ASSERT(lkb->lkb_nodeid == -1,
359 + log_error(ls, "nodeid=%d\n",
360 + lkb->lkb_nodeid););
362 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
368 + * These lkb's have an outstanding request to a bygone node.
369 + * The request will be redirected to the new master node in
370 + * resend_cluster_requests(). Don't mark the request for
371 + * resending if there's a reply for it saved in the
375 + if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
376 + !reply_in_requestqueue(ls, lkb->lkb_id)) {
378 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
381 + * Don't rebuild this lkb on a new rsb in
382 + * rebuild_rsbs_send().
385 + if (lkb->lkb_lockqueue_state ==
386 + GDLM_LQSTATE_WAIT_CONDGRANT) {
387 + GDLM_ASSERT(lkb->lkb_status ==
388 + GDLM_LKSTS_WAITING, );
389 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
393 + * This flag indicates to the new master that his lkb
394 + * is in the midst of a convert request and should be
395 + * placed on the granted queue rather than the convert
396 + * queue. We will resend this convert request to the
400 + else if (lkb->lkb_lockqueue_state ==
401 + GDLM_LQSTATE_WAIT_CONVERT) {
402 + GDLM_ASSERT(lkb->lkb_status ==
403 + GDLM_LKSTS_CONVERT, );
404 + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
410 + up(&_lockqueue_lock);
412 + log_all(ls, "marked %d requests", count);
415 +int resend_cluster_requests(gd_ls_t *ls)
417 + gd_lkb_t *lkb, *safe;
418 + int error = 0, state, count = 0;
420 + log_all(ls, "resend marked requests");
422 + down(&_lockqueue_lock);
424 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
426 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
427 + log_debug(ls, "resend_cluster_requests: aborted");
432 + if (lkb->lkb_resource->res_ls != ls)
435 + log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
436 + "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
437 + lkb->lkb_lockqueue_state, lkb->lkb_flags);
440 + * Resend/process the lockqueue lkb's (in-progress requests)
441 + * that were flagged at the start of recovery in
442 + * lockqueue_lkb_mark().
445 + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
446 + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
447 + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
448 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
450 + if (lkb->lkb_nodeid == -1) {
452 + * Send lookup to new resdir node.
454 + lkb->lkb_lockqueue_time = jiffies;
455 + send_cluster_request(lkb,
456 + lkb->lkb_lockqueue_state);
459 + else if (lkb->lkb_nodeid != 0) {
461 + * There's a new RSB master (that's not us.)
463 + lkb->lkb_lockqueue_time = jiffies;
464 + send_cluster_request(lkb,
465 + lkb->lkb_lockqueue_state);
470 + * We are the new RSB master for this lkb
473 + state = lkb->lkb_lockqueue_state;
474 + lkb->lkb_lockqueue_state = 0;
475 + /* list_del equals remove_from_lockqueue() */
476 + list_del(&lkb->lkb_lockqueue);
477 + process_remastered_lkb(lkb, state);
483 + up(&_lockqueue_lock);
485 + log_all(ls, "resent %d requests", count);
490 + * Process any LKBs on the Lock queue, this
491 + * just looks at the entries to see if they have been
492 + * on the queue too long and fails the requests if so.
495 +static void process_lockqueue(void)
497 + gd_lkb_t *lkb, *safe;
501 + down(&_lockqueue_lock);
503 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
504 + ls = lkb->lkb_resource->res_ls;
506 + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
509 + /* Don't time out locks that are in transition */
510 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
513 + if (check_timeout(lkb->lkb_lockqueue_time,
514 + dlm_config.lock_timeout)) {
516 + list_del(&lkb->lkb_lockqueue);
517 + up(&_lockqueue_lock);
518 + cancel_lockop(lkb, -ETIMEDOUT);
519 + down(&_lockqueue_lock);
522 + up(&_lockqueue_lock);
527 + if (atomic_read(&_astd_running))
528 + mod_timer(&_lockqueue_timer,
529 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
532 +/* Look for deadlocks */
533 +static void process_deadlockqueue(void)
535 + gd_lkb_t *lkb, *safe;
537 + down(&_deadlockqueue_lock);
539 + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
540 + gd_lkb_t *kill_lkb;
542 + /* Only look at "due" locks */
543 + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
546 + /* Don't look at locks that are in transition */
547 + if (!test_bit(LSFL_LS_RUN,
548 + &lkb->lkb_resource->res_ls->ls_flags))
551 + up(&_deadlockqueue_lock);
553 + /* Lock has hit due time, check for conversion deadlock */
554 + kill_lkb = conversion_deadlock_check(lkb);
556 + cancel_conversion(kill_lkb, -EDEADLOCK);
558 + down(&_deadlockqueue_lock);
560 + up(&_deadlockqueue_lock);
563 +static __inline__ int no_asts(void)
567 + down(&_ast_queue_lock);
568 + ret = list_empty(&_ast_queue);
569 + up(&_ast_queue_lock);
573 +static void lockqueue_timer_fn(unsigned long arg)
575 + set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
576 + wake_up(&_astd_waitchan);
580 + * DLM daemon which delivers asts.
583 +static int dlm_astd(void *data)
585 + daemonize("dlm_astd");
587 + INIT_LIST_HEAD(&_lockqueue);
588 + init_MUTEX(&_lockqueue_lock);
589 + INIT_LIST_HEAD(&_deadlockqueue);
590 + init_MUTEX(&_deadlockqueue_lock);
591 + INIT_LIST_HEAD(&_ast_queue);
592 + init_MUTEX(&_ast_queue_lock);
593 + init_waitqueue_head(&_astd_waitchan);
594 + complete(&_astd_done);
597 + * Set a timer to check the lockqueue for dead locks (and deadlocks).
600 + init_timer(&_lockqueue_timer);
601 + _lockqueue_timer.function = lockqueue_timer_fn;
602 + _lockqueue_timer.data = 0;
603 + mod_timer(&_lockqueue_timer,
604 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
606 + while (atomic_read(&_astd_running)) {
607 + wchan_cond_sleep_intr(_astd_waitchan, no_asts());
609 + if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
612 + if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
613 + process_lockqueue();
614 + if (dlm_config.deadlocktime)
615 + process_deadlockqueue();
619 + if (timer_pending(&_lockqueue_timer))
620 + del_timer(&_lockqueue_timer);
622 + complete(&_astd_done);
627 +void wake_astd(void)
629 + set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
630 + wake_up(&_astd_waitchan);
635 + init_completion(&_astd_done);
636 + atomic_set(&_astd_running, 1);
637 + _astd_pid = kernel_thread(dlm_astd, NULL, 0);
638 + wait_for_completion(&_astd_done);
644 + atomic_set(&_astd_running, 0);
646 + wait_for_completion(&_astd_done);
648 diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
649 --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
650 +++ linux-patched/cluster/dlm/ast.h 2004-06-25 18:31:07.000000000 +0800
652 +/******************************************************************************
653 +*******************************************************************************
655 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
656 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
658 +** This copyrighted material is made available to anyone wishing to use,
659 +** modify, copy, or redistribute it subject to the terms and conditions
660 +** of the GNU General Public License v.2.
662 +*******************************************************************************
663 +******************************************************************************/
665 +#ifndef __AST_DOT_H__
666 +#define __AST_DOT_H__
668 +void lockqueue_lkb_mark(gd_ls_t * ls);
669 +int resend_cluster_requests(gd_ls_t * ls);
670 +void add_to_lockqueue(gd_lkb_t * lkb);
671 +void remove_from_lockqueue(gd_lkb_t * lkb);
672 +void add_to_deadlockqueue(gd_lkb_t * lkb);
673 +void remove_from_deadlockqueue(gd_lkb_t * lkb);
674 +void remove_from_astqueue(gd_lkb_t * lkb);
675 +void queue_ast(gd_lkb_t * lkb, gd_ast_type_t astt, uint8_t rqmode);
676 +void wake_astd(void);
677 +int astd_start(void);
678 +void astd_stop(void);
680 +#endif /* __AST_DOT_H__ */
681 diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
682 --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
683 +++ linux-patched/cluster/dlm/config.c 2004-06-25 18:31:07.000000000 +0800
685 +/******************************************************************************
686 +*******************************************************************************
688 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
689 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
691 +** This copyrighted material is made available to anyone wishing to use,
692 +** modify, copy, or redistribute it subject to the terms and conditions
693 +** of the GNU General Public License v.2.
695 +*******************************************************************************
696 +******************************************************************************/
698 +#include <linux/module.h>
699 +#include <linux/proc_fs.h>
701 +#include "dlm_internal.h"
702 +#include "lowcomms.h"
705 +/* Config file defaults */
706 +#define DEFAULT_TCP_PORT 21064
707 +#define DEFAULT_LOCK_TIMEOUT 30
708 +#define DEFAULT_BUFFER_SIZE 4096
709 +#define DEFAULT_RESHASHTBL 256
710 +#define DEFAULT_LOCKIDTBL 1024
711 +#define DEFAULT_MAX_CONNECTIONS 128
712 +#define DEFAULT_DEADLOCKTIME 10
714 +struct config_info dlm_config = {
715 + .tcp_port = DEFAULT_TCP_PORT,
716 + .lock_timeout = DEFAULT_LOCK_TIMEOUT,
717 + .buffer_size = DEFAULT_BUFFER_SIZE,
718 + .reshashtbl = DEFAULT_RESHASHTBL,
719 + .lockidtbl = DEFAULT_LOCKIDTBL,
720 + .max_connections = DEFAULT_MAX_CONNECTIONS,
721 + .deadlocktime = DEFAULT_DEADLOCKTIME,
725 +static struct config_proc_info {
730 + .name = "tcp_port",
731 + .value = &dlm_config.tcp_port,
734 + .name = "lock_timeout",
735 + .value = &dlm_config.lock_timeout,
738 + .name = "buffer_size",
739 + .value = &dlm_config.buffer_size,
742 + .name = "reshashtbl",
743 + .value = &dlm_config.reshashtbl,
746 + .name = "lockidtbl",
747 + .value = &dlm_config.lockidtbl,
750 + .name = "max_connections",
751 + .value = &dlm_config.max_connections,
754 + .name = "deadlocktime",
755 + .value = &dlm_config.deadlocktime,
758 +static struct proc_dir_entry *dlm_dir;
760 +static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
761 + int *eof, void *data)
763 + struct config_proc_info *cinfo = data;
764 + return snprintf(page, count, "%d\n", *cinfo->value);
767 +static int dlm_config_write_proc(struct file *file, const char *buffer,
768 + unsigned long count, void *data)
770 + struct config_proc_info *cinfo = data;
774 + value = simple_strtoul(buffer, &end, 10);
776 + *cinfo->value = value;
780 +int dlm_config_init(void)
783 + struct proc_dir_entry *pde;
785 + dlm_dir = proc_mkdir("cluster/config/dlm", 0);
789 + dlm_dir->owner = THIS_MODULE;
791 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
792 + pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
794 + pde->data = &config_proc[i];
795 + pde->write_proc = dlm_config_write_proc;
796 + pde->read_proc = dlm_config_read_proc;
802 +void dlm_config_exit(void)
806 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
807 + remove_proc_entry(config_proc[i].name, dlm_dir);
808 + remove_proc_entry("cluster/config/dlm", NULL);
810 diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
811 --- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
812 +++ linux-patched/cluster/dlm/config.h 2004-06-25 18:31:07.000000000 +0800
814 +/******************************************************************************
815 +*******************************************************************************
817 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
818 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
820 +** This copyrighted material is made available to anyone wishing to use,
821 +** modify, copy, or redistribute it subject to the terms and conditions
822 +** of the GNU General Public License v.2.
824 +*******************************************************************************
825 +******************************************************************************/
827 +#ifndef __CONFIG_DOT_H__
828 +#define __CONFIG_DOT_H__
830 +struct config_info {
836 + int max_connections;
840 +extern struct config_info dlm_config;
841 +extern int dlm_config_init(void);
842 +extern void dlm_config_exit(void);
844 +#endif /* __CONFIG_DOT_H__ */
845 diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
846 --- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
847 +++ linux-patched/cluster/dlm/device.c 2004-06-25 18:31:07.000000000 +0800
849 +/******************************************************************************
850 +*******************************************************************************
852 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
853 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
855 +** This copyrighted material is made available to anyone wishing to use,
856 +** modify, copy, or redistribute it subject to the terms and conditions
857 +** of the GNU General Public License v.2.
859 +*******************************************************************************
860 +******************************************************************************/
865 + * This is the userland interface to the DLM.
867 + * The locking is done via a misc char device (find the
868 + * registered minor number in /proc/misc).
870 + * User code should not use this interface directly but
871 + * call the library routines in libdlm.a instead.
875 +#include <linux/miscdevice.h>
876 +#include <linux/init.h>
877 +#include <linux/wait.h>
878 +#include <linux/module.h>
879 +#include <linux/file.h>
880 +#include <linux/fs.h>
881 +#include <linux/poll.h>
882 +#include <linux/signal.h>
883 +#include <linux/spinlock.h>
884 +#include <asm/ioctls.h>
886 +#include "dlm_internal.h"
889 +extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int);
890 +static struct file_operations _dlm_fops;
891 +static const char *name_prefix="dlm";
892 +static struct list_head user_ls_list;
894 +/* Flags in li_flags */
895 +#define LI_FLAG_COMPLETE 1
896 +#define LI_FLAG_FIRSTLOCK 2
900 + struct dlm_lksb li_lksb;
901 + wait_queue_head_t li_waitq;
902 + unsigned long li_flags;
903 + void __user *li_astparam;
904 + void __user *li_astaddr;
905 + void __user *li_bastaddr;
906 + struct file_info *li_file;
907 + struct dlm_lksb __user *li_user_lksb;
908 + struct semaphore li_firstlock;
909 + struct dlm_queryinfo *li_queryinfo;
910 + struct dlm_queryinfo __user *li_user_queryinfo;
913 +/* A queued AST no less */
915 + struct dlm_lock_result result;
916 + struct dlm_queryinfo *queryinfo;
917 + struct dlm_queryinfo __user *user_queryinfo;
918 + struct list_head list;
921 +/* One of these per userland lockspace */
923 + void *ls_lockspace;
924 + atomic_t ls_refcnt;
925 + long ls_flags; /* bit 1 means LS has been deleted */
927 + /* Passed into misc_register() */
928 + struct miscdevice ls_miscinfo;
929 + struct list_head ls_list;
932 +/* misc_device info for the control device */
933 +static struct miscdevice ctl_device;
936 + * Stuff we hang off the file struct.
937 + * The first two are to cope with unlocking all the
938 + * locks held by a process when it dies.
941 + struct list_head fi_lkb_list; /* List of active lkbs */
942 + spinlock_t fi_lkb_lock;
943 + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
944 + spinlock_t fi_ast_lock;
945 + wait_queue_head_t fi_wait;
946 + struct user_ls *fi_ls;
947 + atomic_t fi_refcnt; /* Number of users */
948 + unsigned long fi_flags; /* Bit 1 means the device is open */
952 +/* get and put ops for file_info.
953 + Actually I don't really like "get" and "put", but everyone
954 + else seems to use them and I can't think of anything
955 + nicer at the moment */
956 +static void get_file_info(struct file_info *f)
958 + atomic_inc(&f->fi_refcnt);
961 +static void put_file_info(struct file_info *f)
963 + if (atomic_dec_and_test(&f->fi_refcnt))
967 +/* Find a lockspace struct given the device minor number */
968 +static struct user_ls *find_lockspace(int minor)
970 + struct user_ls *lsinfo;
972 + list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
974 + if (lsinfo->ls_miscinfo.minor == minor)
980 +static void add_lockspace_to_list(struct user_ls *lsinfo)
982 + list_add(&lsinfo->ls_list, &user_ls_list);
985 +/* Register a lockspace with the DLM and create a misc
986 + device for userland to access it */
987 +static int register_lockspace(char *name, struct user_ls **ls)
989 + struct user_ls *newls;
993 + namelen = strlen(name)+strlen(name_prefix)+2;
995 + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
998 + memset(newls, 0, sizeof(struct user_ls));
1000 + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
1001 + if (!newls->ls_miscinfo.name) {
1005 + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
1007 + status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
1008 + strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
1009 + &newls->ls_lockspace, 0);
1011 + if (status != 0) {
1012 + kfree(newls->ls_miscinfo.name);
1017 + newls->ls_miscinfo.fops = &_dlm_fops;
1018 + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1020 + status = misc_register(&newls->ls_miscinfo);
1022 + log_print("failed to register misc device for %s", name);
1023 + dlm_release_lockspace(newls->ls_lockspace, 0);
1024 + kfree(newls->ls_miscinfo.name);
1030 + add_lockspace_to_list(newls);
1035 +static int unregister_lockspace(struct user_ls *lsinfo, int force)
1039 + status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1043 + status = misc_deregister(&lsinfo->ls_miscinfo);
1047 + list_del(&lsinfo->ls_list);
1048 + kfree(lsinfo->ls_miscinfo.name);
1054 +/* Add it to userland's AST queue */
1055 +static void add_to_astqueue(struct lock_info *li, void *astaddr)
1057 + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1061 + ast->result.astparam = li->li_astparam;
1062 + ast->result.astaddr = astaddr;
1063 + ast->result.user_lksb = li->li_user_lksb;
1064 + ast->result.cmd = li->li_cmd;
1065 + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1067 + /* These two will both be NULL for anything other than queries */
1068 + ast->queryinfo = li->li_queryinfo;
1069 + ast->user_queryinfo = li->li_user_queryinfo;
1071 + spin_lock(&li->li_file->fi_ast_lock);
1072 + list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1073 + spin_unlock(&li->li_file->fi_ast_lock);
1074 + wake_up_interruptible(&li->li_file->fi_wait);
1077 +static void bast_routine(void *param, int mode)
1079 + struct lock_info *li = param;
1082 + add_to_astqueue(li, li->li_bastaddr);
1087 + * This is the kernel's AST routine.
1088 + * All lock, unlock & query operations complete here.
1089 + * The only synchronous ops are those done during device close.
1091 +static void ast_routine(void *param)
1093 + struct lock_info *li = param;
1095 + /* Param may be NULL if a persistent lock is unlocked by someone else */
1099 + /* If it's an async request then post data to the user's AST queue. */
1100 + if (li->li_astaddr) {
1102 + /* Only queue AST if the device is still open */
1103 + if (test_bit(1, &li->li_file->fi_flags))
1104 + add_to_astqueue(li, li->li_astaddr);
1106 + /* If it's a new lock operation that failed, then
1107 + * remove it from the owner queue and free the
1108 + * lock_info. The DLM will not free the LKB until this
1109 + * AST has completed.
1111 + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1112 + li->li_lksb.sb_status != 0) {
1115 + /* Wait till dlm_lock() has finished */
1116 + down(&li->li_firstlock);
1117 + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1119 + spin_lock(&li->li_file->fi_lkb_lock);
1120 + list_del(&lkb->lkb_ownerqueue);
1121 + spin_unlock(&li->li_file->fi_lkb_lock);
1123 + up(&li->li_firstlock);
1124 + put_file_info(li->li_file);
1128 + /* Free unlocks & queries */
1129 + if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1130 + li->li_cmd == DLM_USER_QUERY) {
1131 + put_file_info(li->li_file);
1136 + /* Synchronous request, just wake up the caller */
1137 + set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1138 + wake_up_interruptible(&li->li_waitq);
1143 + * Wait for the lock op to complete and return the status.
1145 +static int wait_for_ast(struct lock_info *li)
1147 + /* Wait for the AST routine to complete */
1148 + set_task_state(current, TASK_INTERRUPTIBLE);
1149 + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1152 + set_task_state(current, TASK_RUNNING);
1154 + return li->li_lksb.sb_status;
1158 +/* Open on control device */
1159 +static int dlm_ctl_open(struct inode *inode, struct file *file)
1164 +/* Close on control device */
1165 +static int dlm_ctl_close(struct inode *inode, struct file *file)
1170 +/* Open on lockspace device */
1171 +static int dlm_open(struct inode *inode, struct file *file)
1173 + struct file_info *f;
1174 + struct user_ls *lsinfo;
1176 + lsinfo = find_lockspace(iminor(inode));
1180 + f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1184 + atomic_inc(&lsinfo->ls_refcnt);
1185 + INIT_LIST_HEAD(&f->fi_lkb_list);
1186 + INIT_LIST_HEAD(&f->fi_ast_list);
1187 + spin_lock_init(&f->fi_ast_lock);
1188 + spin_lock_init(&f->fi_lkb_lock);
1189 + init_waitqueue_head(&f->fi_wait);
1190 + f->fi_ls = lsinfo;
1191 + atomic_set(&f->fi_refcnt, 1);
1192 + set_bit(1, &f->fi_flags);
1194 + file->private_data = f;
1199 +/* Check the user's version matches ours */
1200 +static int check_version(struct dlm_lock_params *params)
1202 + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1203 + (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1204 + params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1206 + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1207 + params->version[0],
1208 + params->version[1],
1209 + params->version[2],
1210 + DLM_DEVICE_VERSION_MAJOR,
1211 + DLM_DEVICE_VERSION_MINOR,
1212 + DLM_DEVICE_VERSION_PATCH);
1218 +/* Close on lockspace device */
1219 +static int dlm_close(struct inode *inode, struct file *file)
1221 + struct file_info *f = file->private_data;
1222 + struct lock_info li;
1225 + gd_lkb_t *lkb, *safe;
1226 + struct user_ls *lsinfo;
1227 + DECLARE_WAITQUEUE(wq, current);
1229 + lsinfo = find_lockspace(iminor(inode));
1233 + /* Mark this closed so that ASTs will not be delivered any more */
1234 + clear_bit(1, &f->fi_flags);
1236 + /* Block signals while we are doing this */
1237 + sigfillset(&allsigs);
1238 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1240 + /* We use our own lock_info struct here, so that any
1241 + * outstanding "real" ASTs will be delivered with the
1242 + * corresponding "real" params, thus freeing the lock_info
1243 + * that belongs to the lock. This catches the corner case where
1244 + * a lock is BUSY when we try to unlock it here
1246 + memset(&li, 0, sizeof(li));
1247 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1248 + init_waitqueue_head(&li.li_waitq);
1249 + add_wait_queue(&li.li_waitq, &wq);
1252 + * Free any outstanding locks, they are on the
1253 + * list in LIFO order so there should be no problems
1254 + * about unlocking parents before children.
1255 + * Although we don't remove the lkbs from the list here
1256 + * (what would be the point?), foreach_safe is needed
1257 + * because the lkbs are freed during dlm_unlock operations
1259 + list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
1263 + struct lock_info *old_li;
1265 + /* Make a copy of this pointer. If all goes well we will
1266 + * free it later. if not it will be left to the AST routine
1269 + old_li = (struct lock_info *)lkb->lkb_astparam;
1271 + /* Don't unlock persistent locks */
1272 + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1273 + list_del(&lkb->lkb_ownerqueue);
1275 + /* But tidy our references in it */
1277 + lkb->lkb_astparam = (long)NULL;
1282 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1284 + /* If it's not granted then cancel the request.
1285 + * If the lock was WAITING then it will be dropped,
1286 + * if it was converting then it will be reverted to GRANTED,
1287 + * then we will unlock it.
1289 + lock_status = lkb->lkb_status;
1291 + if (lock_status != GDLM_LKSTS_GRANTED)
1292 + flags = DLM_LKF_CANCEL;
1294 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1296 + /* Must wait for it to complete as the next lock could be its
1299 + wait_for_ast(&li);
1301 + /* If it was waiting for a conversion, it will
1302 + now be granted so we can unlock it properly */
1303 + if (lock_status == GDLM_LKSTS_CONVERT) {
1305 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1306 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
1309 + wait_for_ast(&li);
1311 + /* Unlock succeeded, free the lock_info struct. */
1312 + if (status == 0) {
1318 + remove_wait_queue(&li.li_waitq, &wq);
1320 + /* If this is the last reference, and the lockspace has been deleted
1321 + then free the struct */
1322 + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1326 + /* Restore signals */
1327 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1328 + recalc_sigpending();
1334 + * ioctls to create/remove lockspaces, and check how many
1335 + * outstanding ASTs there are against a particular LS.
1337 +static int dlm_ioctl(struct inode *inode, struct file *file,
1338 + uint command, ulong u)
1340 + struct file_info *fi = file->private_data;
1341 + int status = -EINVAL;
1343 + struct list_head *tmp_list;
1345 + switch (command) {
1347 + /* Are there any ASTs for us to read?
1348 + * Warning, this returns the number of messages (ASTs)
1349 + * in the queue, NOT the number of bytes to read
1353 + spin_lock(&fi->fi_ast_lock);
1354 + list_for_each(tmp_list, &fi->fi_ast_list)
1356 + spin_unlock(&fi->fi_ast_lock);
1357 + status = put_user(count, (int *)u);
1368 + * ioctls to create/remove lockspaces.
1370 +static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1371 + uint command, ulong u)
1373 + int status = -EINVAL;
1374 + char ls_name[MAX_LS_NAME_LEN];
1375 + struct user_ls *lsinfo;
1378 + switch (command) {
1379 + case DLM_CREATE_LOCKSPACE:
1380 + if (!capable(CAP_SYS_ADMIN))
1383 + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1385 + status = register_lockspace(ls_name, &lsinfo);
1387 + /* If it succeeded then return the minor number */
1389 + status = lsinfo->ls_miscinfo.minor;
1392 + case DLM_FORCE_RELEASE_LOCKSPACE:
1395 + case DLM_RELEASE_LOCKSPACE:
1396 + if (!capable(CAP_SYS_ADMIN))
1399 + lsinfo = find_lockspace(u);
1402 + status = unregister_lockspace(lsinfo, force);
1412 +/* Deal with the messy stuff of copying a web of structs
1413 + from kernel space to userspace */
1414 +static int copy_query_result(struct ast_info *ast)
1416 + int status = -EFAULT;
1417 + struct dlm_queryinfo qi;
1419 + /* Get the pointers to userspace structs */
1420 + if (copy_from_user(&qi, ast->user_queryinfo,
1421 + sizeof(struct dlm_queryinfo)))
1424 + /* TODO: does this deref a user pointer? */
1425 + if (put_user(ast->queryinfo->gqi_lockcount,
1426 + &ast->user_queryinfo->gqi_lockcount))
1429 + if (qi.gqi_resinfo) {
1430 + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1431 + sizeof(struct dlm_resinfo)))
1435 + if (qi.gqi_lockinfo) {
1436 + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1437 + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1443 + if (ast->queryinfo->gqi_lockinfo)
1444 + kfree(ast->queryinfo->gqi_lockinfo);
1446 + if (ast->queryinfo->gqi_resinfo)
1447 + kfree(ast->queryinfo->gqi_resinfo);
1449 + kfree(ast->queryinfo);
1455 +/* Read call, might block if no ASTs are waiting.
1456 + * It will only ever return one message at a time, regardless
1457 + * of how many are pending.
1459 +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1461 + struct file_info *fi = file->private_data;
1462 + struct ast_info *ast;
1464 + DECLARE_WAITQUEUE(wait, current);
1466 + if (count < sizeof(struct dlm_lock_result))
1469 + spin_lock(&fi->fi_ast_lock);
1470 + if (list_empty(&fi->fi_ast_list)) {
1472 + /* No waiting ASTs.
1473 + * Return EOF if the lockspace has been deleted.
1475 + if (test_bit(1, &fi->fi_ls->ls_flags))
1478 + if (file->f_flags & O_NONBLOCK) {
1479 + spin_unlock(&fi->fi_ast_lock);
1483 + add_wait_queue(&fi->fi_wait, &wait);
1486 + set_current_state(TASK_INTERRUPTIBLE);
1487 + if (list_empty(&fi->fi_ast_list) &&
1488 + !signal_pending(current)) {
1490 + spin_unlock(&fi->fi_ast_lock);
1492 + spin_lock(&fi->fi_ast_lock);
1496 + current->state = TASK_RUNNING;
1497 + remove_wait_queue(&fi->fi_wait, &wait);
1499 + if (signal_pending(current)) {
1500 + spin_unlock(&fi->fi_ast_lock);
1501 + return -ERESTARTSYS;
1505 + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1506 + list_del(&ast->list);
1507 + spin_unlock(&fi->fi_ast_lock);
1509 + ret = sizeof(struct dlm_lock_result);
1510 + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1513 + /* If it was a query then copy the result block back here */
1514 + if (ast->queryinfo) {
1515 + int status = copy_query_result(ast);
1524 +static unsigned int dlm_poll(struct file *file, poll_table *wait)
1526 + struct file_info *fi = file->private_data;
1528 + poll_wait(file, &fi->fi_wait, wait);
1530 + spin_lock(&fi->fi_ast_lock);
1531 + if (!list_empty(&fi->fi_ast_list)) {
1532 + spin_unlock(&fi->fi_ast_lock);
1533 + return POLLIN | POLLRDNORM;
1536 + spin_unlock(&fi->fi_ast_lock);
1540 +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1542 + struct lock_info *li;
1545 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1549 + get_file_info(fi);
1550 + li->li_user_lksb = kparams->lksb;
1551 + li->li_astparam = kparams->astparam;
1552 + li->li_bastaddr = kparams->bastaddr;
1553 + li->li_astaddr = kparams->astaddr;
1556 + li->li_cmd = kparams->cmd;
1557 + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1559 + if (copy_from_user(&li->li_lksb, kparams->lksb,
1560 + sizeof(struct dlm_lksb))) {
1564 + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1566 + /* Allocate query structs */
1568 + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1569 + if (!li->li_queryinfo)
1572 + /* Mainly to get gqi_lock buffer size */
1573 + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1574 + sizeof(struct dlm_queryinfo))) {
1579 + /* Overwrite userspace pointers we just copied with kernel space ones */
1580 + if (li->li_queryinfo->gqi_resinfo) {
1581 + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1582 + if (!li->li_queryinfo->gqi_resinfo)
1585 + if (li->li_queryinfo->gqi_lockinfo) {
1586 + li->li_queryinfo->gqi_lockinfo =
1587 + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1589 + if (!li->li_queryinfo->gqi_lockinfo)
1593 + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1595 + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1596 + kparams->flags, /* query */
1601 + kfree(li->li_queryinfo);
1608 +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1609 + const char *buffer)
1611 + struct lock_info *li;
1613 + char name[DLM_RESNAME_MAXLEN];
1616 + * Validate things that we need to have correct.
1618 + if (kparams->namelen > DLM_RESNAME_MAXLEN)
1621 + if (!kparams->astaddr)
1624 + if (!kparams->lksb)
1627 + /* Get the lock name */
1628 + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1629 + kparams->namelen)) {
1633 + /* For conversions, the lock will already have a lock_info
1634 + block squirreled away in astparam */
1635 + if (kparams->flags & DLM_LKF_CONVERT) {
1636 + gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1640 + li = (struct lock_info *)lkb->lkb_astparam;
1642 + /* Only override these if they are provided */
1643 + if (li->li_user_lksb)
1644 + li->li_user_lksb = kparams->lksb;
1645 + if (li->li_astparam)
1646 + li->li_astparam = kparams->astparam;
1647 + if (li->li_bastaddr)
1648 + li->li_bastaddr = kparams->bastaddr;
1649 + if (li->li_bastaddr)
1650 + li->li_astaddr = kparams->astaddr;
1654 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1658 + li->li_user_lksb = kparams->lksb;
1659 + li->li_astparam = kparams->astparam;
1660 + li->li_bastaddr = kparams->bastaddr;
1661 + li->li_astaddr = kparams->astaddr;
1664 + li->li_cmd = kparams->cmd;
1665 + li->li_queryinfo = NULL;
1667 + /* semaphore to allow us to complete our work before
1668 + the AST routine runs. In fact we only need (and use) this
1669 + when the initial lock fails */
1670 + init_MUTEX_LOCKED(&li->li_firstlock);
1671 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1673 + get_file_info(fi);
1676 + /* Copy the user's LKSB into kernel space,
1677 + needed for conversions & value block operations */
1678 + if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
1679 + sizeof(struct dlm_lksb)))
1683 + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1684 + kparams->flags, name, kparams->namelen,
1688 + li->li_bastaddr ? bast_routine : NULL,
1689 + kparams->range.ra_end ? &kparams->range : NULL);
1691 + /* If it succeeded (this far) with a new lock then keep track of
1692 + it on the file's lkb list */
1693 + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1695 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1698 + spin_lock(&fi->fi_lkb_lock);
1699 + list_add(&lkb->lkb_ownerqueue,
1700 + &fi->fi_lkb_list);
1701 + spin_unlock(&fi->fi_lkb_lock);
1704 + log_print("failed to get lkb for new lock");
1706 + up(&li->li_firstlock);
1712 +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1714 + struct lock_info *li;
1718 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1723 + li = (struct lock_info *)lkb->lkb_astparam;
1725 + li->li_user_lksb = kparams->lksb;
1726 + li->li_astparam = kparams->astparam;
1727 + li->li_cmd = kparams->cmd;
1729 + /* Have to do it here cos the lkb may not exist after
1731 + spin_lock(&fi->fi_lkb_lock);
1732 + list_del(&lkb->lkb_ownerqueue);
1733 + spin_unlock(&fi->fi_lkb_lock);
1735 + /* Use existing lksb & astparams */
1736 + status = dlm_unlock(fi->fi_ls->ls_lockspace,
1738 + kparams->flags, NULL, NULL);
1743 +/* Write call, submit a locking request */
1744 +static ssize_t dlm_write(struct file *file, const char __user *buffer,
1745 + size_t count, loff_t *ppos)
1747 + struct file_info *fi = file->private_data;
1748 + struct dlm_lock_params kparams;
1753 + if (count < sizeof(kparams))
1756 + /* Has the lockspace been deleted? */
1757 + if (test_bit(1, &fi->fi_ls->ls_flags))
1760 + /* Get the command info */
1761 + if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1764 + if (check_version(&kparams))
1767 + /* Block signals while we are doing this */
1768 + sigfillset(&allsigs);
1769 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1771 + switch (kparams.cmd)
1773 + case DLM_USER_LOCK:
1774 + status = do_user_lock(fi, &kparams, buffer);
1777 + case DLM_USER_UNLOCK:
1778 + status = do_user_unlock(fi, &kparams);
1781 + case DLM_USER_QUERY:
1782 + status = do_user_query(fi, &kparams);
1789 + /* Restore signals */
1790 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1791 + recalc_sigpending();
1799 +void dlm_device_free_devices()
1801 + struct user_ls *tmp;
1802 + struct user_ls *lsinfo;
1804 + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
1805 + misc_deregister(&lsinfo->ls_miscinfo);
1807 + /* Tidy up, but don't delete the lsinfo struct until
1808 + all the users have closed their devices */
1809 + list_del(&lsinfo->ls_list);
1810 + kfree(lsinfo->ls_miscinfo.name);
1811 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1815 +static struct file_operations _dlm_fops = {
1817 + .release = dlm_close,
1818 + .ioctl = dlm_ioctl,
1820 + .write = dlm_write,
1822 + .owner = THIS_MODULE,
1825 +static struct file_operations _dlm_ctl_fops = {
1826 + .open = dlm_ctl_open,
1827 + .release = dlm_ctl_close,
1828 + .ioctl = dlm_ctl_ioctl,
1829 + .owner = THIS_MODULE,
1833 + * Create control device
1835 +int dlm_device_init(void)
1839 + INIT_LIST_HEAD(&user_ls_list);
1841 + ctl_device.name = "dlm-control";
1842 + ctl_device.fops = &_dlm_ctl_fops;
1843 + ctl_device.minor = MISC_DYNAMIC_MINOR;
1845 + r = misc_register(&ctl_device);
1847 + log_print("misc_register failed for DLM control device");
1854 +void dlm_device_exit(void)
1856 + misc_deregister(&ctl_device);
1860 + * Overrides for Emacs so that we follow Linus's tabbing style.
1861 + * Emacs will notice this stuff at the end of the file and automatically
1862 + * adjust the settings for this buffer only. This must remain at the end
1864 + * ---------------------------------------------------------------------------
1865 + * Local variables:
1866 + * c-file-style: "linux"
1869 diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
1870 --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
1871 +++ linux-patched/cluster/dlm/device.h 2004-06-25 18:31:07.000000000 +0800
1873 +/******************************************************************************
1874 +*******************************************************************************
1876 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1877 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1879 +** This copyrighted material is made available to anyone wishing to use,
1880 +** modify, copy, or redistribute it subject to the terms and conditions
1881 +** of the GNU General Public License v.2.
1883 +*******************************************************************************
1884 +******************************************************************************/
1886 +#ifndef __DEVICE_DOT_H__
1887 +#define __DEVICE_DOT_H__
1889 +extern void dlm_device_free_devices(void);
1891 +#endif /* __DEVICE_DOT_H__ */
1892 diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
1893 --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
1894 +++ linux-patched/cluster/dlm/dir.c 2004-06-25 18:31:07.000000000 +0800
1896 +/******************************************************************************
1897 +*******************************************************************************
1899 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1900 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1902 +** This copyrighted material is made available to anyone wishing to use,
1903 +** modify, copy, or redistribute it subject to the terms and conditions
1904 +** of the GNU General Public License v.2.
1906 +*******************************************************************************
1907 +******************************************************************************/
1909 +#include "dlm_internal.h"
1911 +#include "lockspace.h"
1912 +#include "lowcomms.h"
1913 +#include "reccomms.h"
1915 +#include "config.h"
1916 +#include "memory.h"
1917 +#include "recover.h"
1921 + * We use the upper 16 bits of the hash value to select the directory node.
1922 + * Low bits are used for distribution of rsb's among hash buckets on each node.
1924 + * From the hash value, we are interested in arriving at a final value between
1925 + * zero and the number of nodes minus one (num_nodes - 1).
1927 + * To accomplish this scaling, we take the nearest power of two larger than
1928 + * num_nodes and subtract one to create a bit mask. The mask is applied to the
1929 + * hash, reducing the range to nearer the final range.
1931 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
1932 + * num_nodes to the previously masked hash value.
1934 + * This value in the desired range is used as an offset into the sorted list of
1935 + * nodeid's to give the particular nodeid of the directory node.
1938 +uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length)
1940 + struct list_head *tmp;
1941 + gd_csb_t *csb = NULL;
1942 + uint32_t hash, node, n = 0, nodeid;
1944 + if (ls->ls_num_nodes == 1) {
1945 + nodeid = our_nodeid();
1949 + hash = gdlm_hash(name, length);
1950 + node = (hash >> 16) & ls->ls_nodes_mask;
1951 + node %= ls->ls_num_nodes;
1953 + list_for_each(tmp, &ls->ls_nodes) {
1956 + csb = list_entry(tmp, gd_csb_t, csb_list);
1960 + GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
1961 + ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
1962 + nodeid = csb->csb_node->gn_nodeid;
1968 +uint32_t get_directory_nodeid(gd_res_t *rsb)
1970 + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
1974 +static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len)
1978 + val = gdlm_hash(name, len);
1979 + val &= RESDIRHASH_MASK;
1984 +static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd)
1986 + gd_resdir_bucket_t *bucket;
1989 + hashval = rd_hash(ls, rd->rd_name, rd->rd_length);
1990 + bucket = &ls->ls_resdir_hash[hashval];
1992 + list_add_tail(&rd->rd_list, &bucket->rb_reslist);
1995 +static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen,
1998 + struct list_head *head;
2001 + head = &ls->ls_resdir_hash[bucket].rb_reslist;
2002 + list_for_each_entry(rd, head, rd_list) {
2003 + if (rd->rd_length == namelen &&
2004 + !memcmp(name, rd->rd_name, namelen))
2012 +void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
2018 + bucket = rd_hash(ls, name, namelen);
2020 + write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2022 + rd = search_rdbucket(ls, name, namelen, bucket);
2025 + log_debug(ls, "remove_resdata not found nodeid=%u", nodeid);
2029 + if (rd->rd_master_nodeid != nodeid) {
2030 + log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid);
2034 + if (rd->rd_sequence == sequence) {
2035 + list_del(&rd->rd_list);
2039 + log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u",
2040 + nodeid, rd->rd_sequence, sequence);
2045 + write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2048 +void resdir_clear(gd_ls_t *ls)
2050 + struct list_head *head;
2054 + for (i = 0; i < RESDIRHASH_SIZE; i++) {
2055 + head = &ls->ls_resdir_hash[i].rb_reslist;
2056 + while (!list_empty(head)) {
2057 + rd = list_entry(head->next, gd_resdata_t, rd_list);
2058 + list_del(&rd->rd_list);
2064 +static void gdlm_resmov_in(gd_resmov_t *rm, char *buf)
2068 + memcpy(&tmp, buf, sizeof(gd_resmov_t));
2070 + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2071 + rm->rm_length = be16_to_cpu(tmp.rm_length);
2074 +int resdir_rebuild_local(gd_ls_t *ls)
2079 + gd_resmov_t mov, last_mov;
2080 + char *b, *last_name;
2081 + int error = -ENOMEM, count = 0;
2083 + log_all(ls, "rebuild resource directory");
2087 + rc = allocate_rcom_buffer(ls);
2091 + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2095 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
2096 + last_mov.rm_length = 0;
2098 + error = gdlm_recovery_stopped(ls);
2102 + memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2103 + rc->rc_datalen = last_mov.rm_length;
2105 + error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
2106 + RECCOMM_RECOVERNAMES, rc, 1);
2113 + * pick each res out of buffer
2119 + gdlm_resmov_in(&mov, b);
2120 + b += sizeof(gd_resmov_t);
2122 + /* Length of 0 with a non-zero nodeid marks the
2123 + * end of the list */
2124 + if (!mov.rm_length && mov.rm_nodeid)
2127 + /* This is just the end of the block */
2128 + if (!mov.rm_length)
2132 + rd = allocate_resdata(ls, mov.rm_length);
2136 + rd->rd_master_nodeid = mov.rm_nodeid;
2137 + rd->rd_length = mov.rm_length;
2138 + rd->rd_sequence = 1;
2140 + memcpy(rd->rd_name, b, mov.rm_length);
2141 + b += mov.rm_length;
2143 + add_resdata_to_hash(ls, rd);
2147 + memset(last_name, 0, DLM_RESNAME_MAXLEN);
2148 + memcpy(last_name, rd->rd_name, rd->rd_length);
2155 + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2158 + log_all(ls, "rebuilt %d resources", count);
2164 + free_rcom_buffer(rc);
2171 + * The reply end of resdir_rebuild_local/RECOVERNAMES. Collect and send as
2172 + * many resource names as can fit in the buffer.
2175 +int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf,
2176 + int outlen, uint32_t nodeid)
2178 + struct list_head *list;
2179 + gd_res_t *start_rsb = NULL, *rsb;
2180 + int offset = 0, start_namelen, error;
2183 + uint32_t dir_nodeid;
2186 + * Find the rsb where we left off (or start again)
2189 + start_namelen = inlen;
2190 + start_name = inbuf;
2192 + if (start_namelen > 1) {
2193 + error = find_or_create_rsb(ls, NULL, start_name,
2194 + start_namelen, 0, &start_rsb);
2195 + GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2196 + release_rsb(start_rsb);
2200 + * Send rsb names for rsb's we're master of and whose directory node
2201 + * matches the requesting node.
2204 + down_read(&ls->ls_rec_rsblist);
2206 + list = start_rsb->res_rootlist.next;
2208 + list = ls->ls_rootres.next;
2210 + for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2211 + rsb = list_entry(list, gd_res_t, res_rootlist);
2212 + if (rsb->res_nodeid)
2215 + dir_nodeid = get_directory_nodeid(rsb);
2216 + if (dir_nodeid != nodeid)
2219 + if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) {
2220 + /* Write end-of-block record */
2221 + memset(&tmp, 0, sizeof(gd_resmov_t));
2222 + memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2223 + offset += sizeof(gd_resmov_t);
2227 + memset(&tmp, 0, sizeof(gd_resmov_t));
2228 + tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2229 + tmp.rm_length = cpu_to_be16(rsb->res_length);
2231 + memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2232 + offset += sizeof(gd_resmov_t);
2234 + memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2235 + offset += rsb->res_length;
2239 + * If we've reached the end of the list (and there's room) write a
2240 + * terminating record.
2243 + if ((list == &ls->ls_rootres) &&
2244 + (offset + sizeof(gd_resmov_t) <= outlen)) {
2246 + memset(&tmp, 0, sizeof(gd_resmov_t));
2247 + /* This only needs to be non-zero */
2248 + tmp.rm_nodeid = cpu_to_be32(1);
2249 + /* and this must be zero */
2250 + tmp.rm_length = 0;
2251 + memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2252 + offset += sizeof(gd_resmov_t);
2256 + up_read(&ls->ls_rec_rsblist);
2260 +int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
2261 + gd_resdata_t **rdp, int recovery)
2264 + gd_resdata_t *tmp;
2267 + bucket = rd_hash(ls, name, namelen);
2269 + read_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2270 + rd = search_rdbucket(ls, name, namelen, bucket);
2271 + read_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2276 + rd = allocate_resdata(ls, namelen);
2280 + rd->rd_master_nodeid = nodeid;
2281 + rd->rd_length = namelen;
2282 + memcpy(rd->rd_name, name, namelen);
2284 + write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2285 + tmp = search_rdbucket(ls, name, namelen, bucket);
2287 + list_add_tail(&rd->rd_list,
2288 + &ls->ls_resdir_hash[bucket].rb_reslist);
2289 + write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2300 + if (++rd->rd_sequence == 0)
2301 + rd->rd_sequence++;
2303 + rd->rd_sequence = 1;
2309 + * The node with lowest id queries all nodes to determine when all are done.
2310 + * All other nodes query the low nodeid for this.
2313 +int resdir_rebuild_wait(gd_ls_t *ls)
2317 + if (ls->ls_low_nodeid == our_nodeid()) {
2318 + error = gdlm_wait_status_all(ls, RESDIR_VALID);
2320 + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2322 + error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID);
2326 diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2327 --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
2328 +++ linux-patched/cluster/dlm/dir.h 2004-06-25 18:31:07.000000000 +0800
2330 +/******************************************************************************
2331 +*******************************************************************************
2333 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2334 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2336 +** This copyrighted material is made available to anyone wishing to use,
2337 +** modify, copy, or redistribute it subject to the terms and conditions
2338 +** of the GNU General Public License v.2.
2340 +*******************************************************************************
2341 +******************************************************************************/
2343 +#ifndef __DIR_DOT_H__
2344 +#define __DIR_DOT_H__
2346 +uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length);
2347 +uint32_t get_directory_nodeid(gd_res_t * rsb);
2348 +void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2349 + uint8_t sequence);
2350 +int resdir_rebuild_local(gd_ls_t * ls);
2351 +int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf,
2352 + int outlen, uint32_t nodeid);
2353 +int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2354 + gd_resdata_t ** rdp, int recovery);
2355 +int resdir_rebuild_wait(gd_ls_t * ls);
2356 +void resdir_clear(gd_ls_t * ls);
2357 +void resdir_dump(gd_ls_t * ls);
2359 +#endif /* __DIR_DOT_H__ */
2360 diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2361 --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
2362 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-06-25 18:31:07.000000000 +0800
2364 +/******************************************************************************
2365 +*******************************************************************************
2367 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2368 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2370 +** This copyrighted material is made available to anyone wishing to use,
2371 +** modify, copy, or redistribute it subject to the terms and conditions
2372 +** of the GNU General Public License v.2.
2374 +*******************************************************************************
2375 +******************************************************************************/
2377 +#ifndef __DLM_INTERNAL_DOT_H__
2378 +#define __DLM_INTERNAL_DOT_H__
2381 + * This is the main header file to be included in each DLM source file.
2384 +#define DLM_RELEASE_NAME "<CVS>"
2386 +#include <linux/slab.h>
2387 +#include <linux/sched.h>
2388 +#include <asm/semaphore.h>
2389 +#include <linux/types.h>
2390 +#include <linux/spinlock.h>
2391 +#include <linux/vmalloc.h>
2392 +#include <asm/uaccess.h>
2393 +#include <linux/list.h>
2394 +#include <linux/errno.h>
2395 +#include <linux/random.h>
2397 +#include <cluster/dlm.h>
2398 +#include <cluster/dlm_device.h>
2399 +#include <cluster/service.h>
2409 +#if (BITS_PER_LONG == 64)
2410 +#define PRIu64 "lu"
2411 +#define PRId64 "ld"
2412 +#define PRIo64 "lo"
2413 +#define PRIx64 "lx"
2414 +#define PRIX64 "lX"
2415 +#define SCNu64 "lu"
2416 +#define SCNd64 "ld"
2417 +#define SCNo64 "lo"
2418 +#define SCNx64 "lx"
2419 +#define SCNX64 "lX"
2421 +#define PRIu64 "Lu"
2422 +#define PRId64 "Ld"
2423 +#define PRIo64 "Lo"
2424 +#define PRIx64 "Lx"
2425 +#define PRIX64 "LX"
2426 +#define SCNu64 "Lu"
2427 +#define SCNd64 "Ld"
2428 +#define SCNo64 "Lo"
2429 +#define SCNx64 "Lx"
2430 +#define SCNX64 "LX"
2433 +#define wchan_cond_sleep_intr(chan, sleep_cond) \
2436 + DECLARE_WAITQUEUE(__wait_chan, current); \
2437 + current->state = TASK_INTERRUPTIBLE; \
2438 + add_wait_queue(&chan, &__wait_chan); \
2439 + if ((sleep_cond)) \
2441 + remove_wait_queue(&chan, &__wait_chan); \
2442 + current->state = TASK_RUNNING; \
2446 +static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2448 + return time_after(jiffies, stamp + seconds * HZ);
2452 +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2454 +#define log_all(ls, fmt, args...) \
2456 + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2457 + dlm_debug_log(ls, fmt, ##args); \
2460 +#define log_error log_all
2464 +#if defined(DLM_DEBUG)
2465 +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2467 +#define log_debug(ls, fmt, args...)
2470 +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2472 +#define log_debug log_all
2476 +#define GDLM_ASSERT(x, do) \
2480 + dlm_debug_dump(); \
2481 + printk("\nDLM: Assertion failed on line %d of file %s\n" \
2482 + "DLM: assertion: \"%s\"\n" \
2483 + "DLM: time = %lu\n", \
2484 + __LINE__, __FILE__, #x, jiffies); \
2488 + panic("DLM: Record message above and reboot.\n"); \
2502 +struct gd_resdir_bucket;
2503 +struct gd_remlockreply;
2504 +struct gd_remlockrequest;
2507 +typedef struct gd_ls gd_ls_t;
2508 +typedef struct gd_lkb gd_lkb_t;
2509 +typedef struct gd_res gd_res_t;
2510 +typedef struct gd_csb gd_csb_t;
2511 +typedef struct gd_node gd_node_t;
2512 +typedef struct gd_resmov gd_resmov_t;
2513 +typedef struct gd_resdata gd_resdata_t;
2514 +typedef struct gd_recover gd_recover_t;
2515 +typedef struct gd_resdir_bucket gd_resdir_bucket_t;
2516 +typedef struct gd_rcom gd_rcom_t;
2519 + * Resource Data - an entry for a resource in the resdir hash table
2522 +struct gd_resdata {
2523 + struct list_head rd_list;
2524 + uint32_t rd_master_nodeid;
2525 + uint16_t rd_length;
2526 + uint8_t rd_sequence;
2527 + char rd_name[1]; /* <rd_length> bytes */
2531 + * Resource Directory Bucket - a hash bucket of resdata entries in the resdir
2535 +struct gd_resdir_bucket {
2536 + struct list_head rb_reslist;
2541 + * A resource description as moved between nodes
2545 + uint32_t rm_nodeid;
2546 + uint16_t rm_length;
2551 + * An entry in the lock ID table. Locks for this bucket are kept on list.
2552 + * Counter is used to assign an id to locks as they are added to this bucket.
2555 +struct gd_lockidtbl_entry {
2556 + struct list_head list;
2560 +/* Elements in the range array */
2562 +#define GR_RANGE_START 0
2563 +#define GR_RANGE_END 1
2564 +#define RQ_RANGE_START 2
2565 +#define RQ_RANGE_END 3
2568 + * Lockspace structure. The context for GDLM locks.
2571 +#define RESHASHTBL_SIZE (256)
2573 +#define RESDIRHASH_SHIFT (9)
2574 +#define RESDIRHASH_SIZE (1 << RESDIRHASH_SHIFT)
2575 +#define RESDIRHASH_MASK (RESDIRHASH_SIZE - 1)
2577 +#define LSFL_WORK (0)
2578 +#define LSFL_LS_RUN (1)
2579 +#define LSFL_LS_STOP (2)
2580 +#define LSFL_LS_START (3)
2581 +#define LSFL_LS_FINISH (4)
2582 +#define LSFL_RECCOMM_WAIT (5)
2583 +#define LSFL_RECCOMM_READY (6)
2584 +#define LSFL_NOTIMERS (7)
2585 +#define LSFL_FINISH_RECOVERY (8)
2586 +#define LSFL_RESDIR_VALID (9)
2587 +#define LSFL_ALL_RESDIR_VALID (10)
2588 +#define LSFL_NODES_VALID (11)
2589 +#define LSFL_ALL_NODES_VALID (12)
2590 +#define LSFL_REQUEST_WARN (13)
2592 +#define LSST_NONE (0)
2593 +#define LSST_INIT (1)
2594 +#define LSST_INIT_DONE (2)
2595 +#define LSST_CLEAR (3)
2596 +#define LSST_WAIT_START (4)
2597 +#define LSST_RECONFIG_DONE (5)
2600 + struct list_head ls_list; /* list of lockspaces */
2601 + uint32_t ls_local_id; /* local unique lockspace ID */
2602 + uint32_t ls_global_id; /* global unique lockspace ID */
2603 + int ls_allocation; /* Memory allocation policy */
2604 + unsigned long ls_flags; /* LSFL_ */
2606 + struct list_head ls_rootres; /* List of root resources */
2610 + struct list_head *ls_reshashtbl; /* Hash table for resources */
2611 + rwlock_t ls_reshash_lock; /* Lock for hash table */
2613 + struct gd_lockidtbl_entry *ls_lockidtbl;
2614 + uint32_t ls_lockidtbl_size; /* Size of lock id table */
2615 + rwlock_t ls_lockidtbl_lock;
2617 + struct list_head ls_nodes; /* current nodes in RC */
2618 + uint32_t ls_num_nodes; /* number of nodes in RC */
2619 + uint32_t ls_nodes_mask;
2620 + uint32_t ls_low_nodeid;
2622 + int ls_state; /* state changes for recovery */
2623 + struct list_head ls_recover; /* gr_recover_t structs */
2624 + int ls_last_stop; /* event ids from sm */
2625 + int ls_last_start;
2626 + int ls_last_finish;
2627 + spinlock_t ls_recover_lock;
2628 + struct list_head ls_nodes_gone; /* dead node list for recovery */
2630 + wait_queue_head_t ls_wait_general;
2632 + gd_rcom_t *ls_rcom;
2633 + uint32_t ls_rcom_msgid;
2634 + struct semaphore ls_rcom_lock;
2636 + struct list_head ls_recover_list;
2637 + int ls_recover_list_count;
2638 + spinlock_t ls_recover_list_lock;
2640 + struct rw_semaphore ls_in_recovery; /* held in write during
2641 + * recovery, read for normal
2643 + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2644 + * parent lock racing with a
2645 + * new child lock */
2647 + struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
2648 + * operations happening while
2649 + * we are purging */
2651 + struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
2652 + * in grant_after_purge() which
2653 + * runs outside recovery */
2655 + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2656 + * we are deserialising
2659 + struct list_head ls_deadlockq; /* List of locks in conversion ordered
2660 + * by duetime. for deadlock detection */
2662 + struct list_head ls_requestqueue; /* List of incoming requests
2663 + * held while we are in
2666 + gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE];
2669 + char ls_name[1]; /* <namelen> bytes */
2673 + * Cluster node (per node in cluster)
2677 + struct list_head gn_list; /* global list of cluster nodes */
2678 + uint32_t gn_nodeid; /* cluster unique nodeid (cman) */
2679 + uint32_t gn_ipaddr; /* node's first IP address (cman) */
2680 + int gn_refcount; /* number of csb's referencing */
2684 + * Cluster System Block (per node in a ls)
2688 + struct list_head csb_list; /* per-lockspace list of nodes */
2689 + gd_node_t *csb_node; /* global node structure */
2690 + int csb_gone_event; /* event id when node was removed */
2692 + uint32_t csb_names_send_count;
2693 + uint32_t csb_names_send_msgid;
2694 + uint32_t csb_names_recv_count;
2695 + uint32_t csb_names_recv_msgid;
2696 + uint32_t csb_locks_send_count;
2697 + uint32_t csb_locks_send_msgid;
2698 + uint32_t csb_locks_recv_count;
2699 + uint32_t csb_locks_recv_msgid;
2708 +#define GDLM_RESSTS_DIRENTRY 1 /* This is a directory entry */
2709 +#define GDLM_RESSTS_LVBINVALID 2 /* The LVB is invalid */
2711 +#define RESFL_NEW_MASTER (0)
2712 +#define RESFL_RECOVER_LIST (1)
2715 + struct list_head res_hashchain; /* Chain of resources in this hash
2718 + gd_ls_t *res_ls; /* The owning lockspace */
2720 + struct list_head res_rootlist; /* List of root resources in lockspace */
2722 + struct list_head res_subreslist; /* List of all sub-resources
2723 + * for this root res. */
2724 + /* This is a list head on the root res and holds the whole tree below
2726 + uint8_t res_depth; /* Depth in resource tree */
2727 + uint16_t res_status;
2728 + unsigned long res_flags; /* Flags, RESFL_ */
2730 + struct list_head res_grantqueue;
2731 + struct list_head res_convertqueue;
2732 + struct list_head res_waitqueue;
2734 + uint32_t res_nodeid; /* nodeid of master node */
2736 + gd_res_t *res_root; /* If a subresource, this is our root */
2737 + gd_res_t *res_parent; /* Our parent resource (if any) */
2739 + atomic_t res_ref; /* No of lkb's */
2740 + uint16_t res_remasterid; /* ID used during remaster */
2741 + struct list_head res_recover_list; /* General list for use during
2743 + int res_recover_msgid;
2744 + int res_newlkid_expect;
2746 + struct rw_semaphore res_lock;
2748 + char *res_lvbptr; /* Lock value block */
2750 + uint8_t res_resdir_seq; /* Last directory sequence number */
2752 + uint8_t res_length;
2753 + char res_name[1]; /* <res_length> bytes */
2757 + * Lock block. To avoid confusion, where flags mirror the
2758 + * public flags, they should have the same value.
2761 +#define GDLM_LKSTS_NEW (0)
2762 +#define GDLM_LKSTS_WAITING (1)
2763 +#define GDLM_LKSTS_GRANTED (2)
2764 +#define GDLM_LKSTS_CONVERT (3)
2766 +#define GDLM_LKFLG_VALBLK (0x00000008)
2767 +#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
2768 +#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
2769 +#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
2771 +/* Internal flags */
2772 +#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present (remote protocol only) */
2773 +#define GDLM_LKFLG_MSTCPY (0x00002000)
2774 +#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
2775 +#define GDLM_LKFLG_DELAST (0x00008000) /* Delete after delivering AST */
2776 +#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
2777 +#define GDLM_LKFLG_DEMOTED (0x00020000)
2778 +#define GDLM_LKFLG_RESENT (0x00040000)
2779 +#define GDLM_LKFLG_NOREBUILD (0x00080000)
2780 +#define GDLM_LKFLG_LQCONVERT (0x00100000)
2783 + void *lkb_astaddr;
2784 + void *lkb_bastaddr;
2785 + long lkb_astparam;
2787 + uint32_t lkb_flags;
2788 + uint16_t lkb_status; /* LKSTS_ granted, waiting, converting */
2789 + int8_t lkb_rqmode; /* Requested lock mode */
2790 + int8_t lkb_grmode; /* Granted lock mode */
2791 + uint8_t lkb_bastmode; /* Requested mode returned in bast */
2792 + uint8_t lkb_highbast; /* Highest mode we have sent a BAST for */
2793 + uint32_t lkb_retstatus; /* Status to return in lksb */
2795 + uint32_t lkb_id; /* Our lock ID */
2796 + struct dlm_lksb *lkb_lksb; /* Lock status block of caller */
2797 + struct list_head lkb_idtbl_list; /* list pointer into the
2800 + struct list_head lkb_statequeue; /* List of locks in this state */
2802 + struct list_head lkb_ownerqueue; /* List of locks owned by a
2805 + gd_lkb_t *lkb_parent; /* Pointer to parent if any */
2807 + atomic_t lkb_childcnt; /* Number of children */
2809 + struct list_head lkb_lockqueue; /* For when we are on the lock queue */
2810 + int lkb_lockqueue_state;
2811 + int lkb_lockqueue_flags; /* As passed into lock/unlock */
2812 + unsigned long lkb_lockqueue_time; /* Time we went on the lock
2815 + gd_res_t *lkb_resource;
2817 + unsigned long lkb_duetime; /* For deadlock detection */
2819 + uint32_t lkb_remid; /* Remote partner */
2820 + uint32_t lkb_nodeid;
2822 + struct list_head lkb_astqueue; /* For when we are on the AST queue */
2823 + uint32_t lkb_asts_to_deliver;
2825 + struct gd_remlockrequest *lkb_request;
2827 + struct list_head lkb_deadlockq; /* on ls_deadlockq list */
2829 + char *lkb_lvbptr; /* Points to lksb on a local lock, allocated
2830 + * LVB (if necessary) on a remote lock */
2831 + uint64_t *lkb_range; /* Points to an array of 64 bit numbers that
2832 + * represent the requested and granted ranges
2833 + * of the lock. NULL implies 0-ffffffffffffffff
2838 + * Used to save and manage recovery state for a lockspace.
2841 +struct gd_recover {
2842 + struct list_head gr_list;
2843 + uint32_t *gr_nodeids;
2844 + int gr_node_count;
2849 + * Header part of the mid-level comms system. All packets start with
2850 + * this header so we can identify them. The comms packet can
2851 +** contain many of these structs but they are split into individual
2852 + * work units before being passed to the lockqueue routines.
2853 + * below this are the structs that this is a header for
2856 +struct gd_req_header {
2857 + uint8_t rh_cmd; /* What we are */
2858 + uint8_t rh_flags; /* maybe just a pad */
2859 + uint16_t rh_length; /* Length of struct (so we can send several in
2861 + uint32_t rh_lkid; /* Lock ID tag: ie the local (requesting) lock
2863 + uint32_t rh_lockspace; /* Lockspace ID */
2867 + * This is the struct used in a remote lock/unlock/convert request
2868 + * The mid-level comms API should turn this into native byte order.
2869 + * Most "normal" lock operations will use these two structs for
2870 + * communications. Recovery operations use their own structs
2871 + * but still with the gd_req_header on the front.
2874 +struct gd_remlockrequest {
2875 + struct gd_req_header rr_header;
2877 + uint32_t rr_remlkid; /* Remote lock ID */
2878 + uint32_t rr_remparid; /* Parent's remote lock ID or 0 */
2879 + uint32_t rr_flags; /* Flags from lock/convert request */
2880 + uint64_t rr_range_start;/* Yes, these are in the right place... */
2881 + uint64_t rr_range_end;
2882 + uint32_t rr_status; /* Status to return if this is an AST request */
2883 + uint8_t rr_rqmode; /* Requested lock mode */
2884 + uint8_t rr_asts; /* Whether the LKB has ASTs or not */
2885 + uint8_t rr_resdir_seq; /* Directory sequence number */
2886 + char rr_lvb[DLM_LVB_LEN]; /* Value block */
2887 + char rr_name[1]; /* As long as needs be. Only used for directory
2888 + * lookups. The length of this can be worked
2889 + * out from the packet length */
2893 + * This is the struct returned by a remote lock/unlock/convert request
2894 + * The mid-level comms API should turn this into native byte order.
2897 +struct gd_remlockreply {
2898 + struct gd_req_header rl_header;
2900 + uint32_t rl_lockstate; /* Whether request was queued/granted/waiting */
2901 + uint32_t rl_nodeid; /* nodeid of lock master */
2902 + uint32_t rl_status; /* Status to return to caller */
2903 + uint32_t rl_lkid; /* Remote lkid */
2904 + uint8_t rl_resdir_seq; /* Returned directory sequence number */
2905 + char rl_lvb[DLM_LVB_LEN]; /* LVB itself */
2909 + * Recovery comms message
2913 + struct gd_req_header rc_header; /* 32 byte aligned */
2914 + uint32_t rc_msgid;
2915 + uint16_t rc_datalen;
2916 + uint8_t rc_expanded;
2917 + uint8_t rc_subcmd; /* secondary command */
2918 + char rc_buf[1]; /* first byte of data goes here and extends
2919 + * beyond here for another datalen - 1 bytes.
2920 + * rh_length is set to sizeof(gd_rcom_t) +
2925 +/* A remote query: GDLM_REMCMD_QUERY */
2926 +struct gd_remquery {
2927 + struct gd_req_header rq_header;
2929 + uint32_t rq_mstlkid; /* LockID on master node */
2930 + uint32_t rq_query; /* query from the user */
2931 + uint32_t rq_maxlocks; /* max number of locks we can cope with */
2934 +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
2935 +/* There may be subsequent blocks of
2936 + lock info in GDLM_REMCMD_QUERYCONT messages which just have
2937 + a normal header. The last of these will have rh_flags set to
2938 + GDLM_REMFLAG_ENDQUERY
2940 +struct gd_remqueryreply {
2941 + struct gd_req_header rq_header;
2943 + uint32_t rq_numlocks; /* Number of locks in reply */
2944 + uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */
2945 + uint32_t rq_status;
2947 + /* Resource information */
2948 + uint32_t rq_grantcount; /* No. of nodes on grant queue */
2949 + uint32_t rq_convcount; /* No. of nodes on convert queue */
2950 + uint32_t rq_waitcount; /* No. of nodes on wait queue */
2951 + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
2955 + * Lockqueue wait lock states
2958 +#define GDLM_LQSTATE_WAIT_RSB 1
2959 +#define GDLM_LQSTATE_WAIT_CONVERT 2
2960 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3
2961 +#define GDLM_LQSTATE_WAIT_UNLOCK 4
2963 +/* Commands sent across the comms link */
2964 +#define GDLM_REMCMD_LOOKUP 1
2965 +#define GDLM_REMCMD_LOCKREQUEST 2
2966 +#define GDLM_REMCMD_UNLOCKREQUEST 3
2967 +#define GDLM_REMCMD_CONVREQUEST 4
2968 +#define GDLM_REMCMD_LOCKREPLY 5
2969 +#define GDLM_REMCMD_LOCKGRANT 6
2970 +#define GDLM_REMCMD_SENDBAST 7
2971 +#define GDLM_REMCMD_SENDCAST 8
2972 +#define GDLM_REMCMD_REM_RESDATA 9
2973 +#define GDLM_REMCMD_RECOVERMESSAGE 20
2974 +#define GDLM_REMCMD_RECOVERREPLY 21
2975 +#define GDLM_REMCMD_QUERY 30
2976 +#define GDLM_REMCMD_QUERYREPLY 31
2978 +/* Set in rh_flags when this is the last block of
2979 + query information. Note this could also be the first
2981 +#define GDLM_REMFLAG_ENDQUERY 1
2984 + * This is both a parameter to queue_ast and also the bitmap of ASTs in
2985 + * lkb_asts_to_deliver
2988 +typedef enum { GDLM_QUEUE_COMPAST = 1, GDLM_QUEUE_BLKAST = 2 } gd_ast_type_t;
2994 +void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...);
2995 +void dlm_debug_dump(void);
2997 +#endif /* __DLM_INTERNAL_DOT_H__ */
2998 diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
2999 --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
3000 +++ linux-patched/cluster/dlm/lkb.c 2004-06-25 18:31:07.000000000 +0800
3002 +/******************************************************************************
3003 +*******************************************************************************
3005 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3006 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3008 +** This copyrighted material is made available to anyone wishing to use,
3009 +** modify, copy, or redistribute it subject to the terms and conditions
3010 +** of the GNU General Public License v.2.
3012 +*******************************************************************************
3013 +******************************************************************************/
3018 + * Allocate and free locks on the lock ID table.
3020 + * This is slightly naff but I don't really like the
3021 + * VMS lockidtbl stuff as it uses a realloced array
3022 + * to hold the locks in. I think this is slightly better
3025 + * Any better suggestions gratefully received. Patrick
3029 +#include "dlm_internal.h"
3030 +#include "lockqueue.h"
3032 +#include "config.h"
3034 +#include "memory.h"
3035 +#include "lockspace.h"
3039 + * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3042 +static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3044 + uint16_t entry = lkid & 0xFFFF;
3047 + if (entry >= ls->ls_lockidtbl_size)
3050 + list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){
3051 + if (lkb->lkb_id == lkid)
3060 + * Should be called at lockspace initialisation time.
3063 +int init_lockidtbl(gd_ls_t *ls, int entries)
3067 + /* Make sure it's a power of two */
3068 + GDLM_ASSERT(!(entries & (entries - 1)),);
3070 + ls->ls_lockidtbl_size = entries;
3071 + rwlock_init(&ls->ls_lockidtbl_lock);
3073 + ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry),
3075 + if (!ls->ls_lockidtbl)
3078 + for (i = 0; i < entries; i++) {
3079 + INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list);
3080 + ls->ls_lockidtbl[i].counter = 1;
3087 + * Free up the space - returns an error if there are still locks hanging around
3090 +int free_lockidtbl(gd_ls_t *ls)
3094 + write_lock(&ls->ls_lockidtbl_lock);
3096 + for (i = 0; i < ls->ls_lockidtbl_size; i++) {
3097 + if (!list_empty(&ls->ls_lockidtbl[i].list)) {
3098 + write_unlock(&ls->ls_lockidtbl_lock);
3102 + kfree(ls->ls_lockidtbl);
3104 + write_unlock(&ls->ls_lockidtbl_lock);
3110 + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3111 + * random number between 0 and lockidtbl_size-1. This random number specifies
3112 + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3113 + * assigned per-bucket id.
3115 + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3116 + * against the lkid of all lkb's in the bucket to avoid duplication.
3120 +gd_lkb_t *create_lkb(gd_ls_t *ls)
3126 + lkb = allocate_lkb(ls);
3130 + write_lock(&ls->ls_lockidtbl_lock);
3132 + get_random_bytes(&bucket, sizeof(bucket));
3133 + bucket &= (ls->ls_lockidtbl_size - 1);
3134 + lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16);
3136 + while (__find_lock_by_id(ls, lkid));
3138 + lkb->lkb_id = (uint32_t) lkid;
3139 + list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list);
3140 + write_unlock(&ls->ls_lockidtbl_lock);
3147 + * Free LKB and remove it from the lockidtbl.
3148 + * NB - this always frees the lkb whereas release_rsb doesn't free an
3149 + * rsb unless its reference count is zero.
3152 +void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb)
3154 + if (lkb->lkb_status) {
3155 + log_error(ls, "release lkb with status %u", lkb->lkb_status);
3160 + if (lkb->lkb_parent)
3161 + atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3163 + write_lock(&ls->ls_lockidtbl_lock);
3164 + list_del(&lkb->lkb_idtbl_list);
3165 + write_unlock(&ls->ls_lockidtbl_lock);
3167 + /* if this is not a master copy then lvbptr points into the user's
3168 + * lksb, so don't free it */
3169 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3170 + free_lvb(lkb->lkb_lvbptr);
3172 + if (lkb->lkb_range)
3173 + free_range(lkb->lkb_range);
3178 +gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3182 + read_lock(&ls->ls_lockidtbl_lock);
3183 + lkb = __find_lock_by_id(ls, lkid);
3184 + read_unlock(&ls->ls_lockidtbl_lock);
3189 +gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid)
3191 + gd_ls_t *lspace = find_lockspace_by_local_id(ls);
3192 + return find_lock_by_id(lspace, lkid);
3196 + * Initialise the range parts of an LKB.
3199 +int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end)
3201 + int ret = -ENOMEM;
3204 + * if this wasn't already a range lock, make it one
3206 + if (!lkb->lkb_range) {
3207 + lkb->lkb_range = allocate_range(lspace);
3208 + if (!lkb->lkb_range)
3212 + * This is needed for conversions that contain ranges where the
3213 + * original lock didn't but it's harmless for new locks too.
3215 + lkb->lkb_range[GR_RANGE_START] = 0LL;
3216 + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3219 + lkb->lkb_range[RQ_RANGE_START] = start;
3220 + lkb->lkb_range[RQ_RANGE_END] = end;
3227 diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3228 --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
3229 +++ linux-patched/cluster/dlm/lkb.h 2004-06-25 18:31:07.000000000 +0800
3231 +/******************************************************************************
3232 +*******************************************************************************
3234 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3235 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3237 +** This copyrighted material is made available to anyone wishing to use,
3238 +** modify, copy, or redistribute it subject to the terms and conditions
3239 +** of the GNU General Public License v.2.
3241 +*******************************************************************************
3242 +******************************************************************************/
3244 +#ifndef __LKB_DOT_H__
3245 +#define __LKB_DOT_H__
3247 +int free_lockidtbl(gd_ls_t * lspace);
3248 +int init_lockidtbl(gd_ls_t * lspace, int entries);
3250 +gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid);
3251 +gd_lkb_t *create_lkb(gd_ls_t *ls);
3252 +void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb);
3253 +gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid);
3254 +int verify_lkb_nodeids(gd_ls_t *ls);
3255 +int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end);
3257 +#endif /* __LKB_DOT_H__ */
3258 diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3259 --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
3260 +++ linux-patched/cluster/dlm/locking.c 2004-06-25 18:31:07.000000000 +0800
3262 +/******************************************************************************
3263 +*******************************************************************************
3265 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3266 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3268 +** This copyrighted material is made available to anyone wishing to use,
3269 +** modify, copy, or redistribute it subject to the terms and conditions
3270 +** of the GNU General Public License v.2.
3272 +*******************************************************************************
3273 +******************************************************************************/
3278 + * This is where the main work of the DLM goes on
3282 +#include "dlm_internal.h"
3283 +#include "lockqueue.h"
3284 +#include "locking.h"
3285 +#include "lockspace.h"
3290 +#include "memory.h"
3293 +#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3296 + * Lock compatibility matrix - thanks Steve
3297 + * UN = Unlocked state. Not really a state, used as a flag
3298 + * PD = Padding. Used to make the matrix a nice power of two in size
3299 + * Other states are the same as the VMS DLM.
3300 + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3303 +#define modes_compat(gr, rq) \
3304 + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3306 +const int __dlm_compat_matrix[8][8] = {
3307 + /* UN NL CR CW PR PW EX PD */
3308 + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3309 + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3310 + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3311 + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3312 + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3313 + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3314 + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3315 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3319 + * Compatibility matrix for conversions with QUECVT set.
3320 + * Granted mode is the row; requested mode is the column.
3321 + * Usage: matrix[grmode+1][rqmode+1]
3324 +const int __quecvt_compat_matrix[8][8] = {
3325 + /* UN NL CR CW PR PW EX PD */
3326 + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3327 + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3328 + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3329 + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3330 + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3331 + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3332 + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3333 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3337 + * This defines the direction of transfer of LVB data.
3338 + * Granted mode is the row; requested mode is the column.
3339 + * Usage: matrix[grmode+1][rqmode+1]
3340 + * 1 = LVB is returned to the caller
3341 + * 0 = LVB is written to the resource
3342 + * -1 = nothing happens to the LVB
3345 +const int __lvb_operations[8][8] = {
3346 + /* UN NL CR CW PR PW EX PD*/
3347 + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3348 + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3349 + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3350 + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3351 + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3352 + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3353 + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3354 + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3357 +static void grant_lock(gd_lkb_t * lkb, int send_remote);
3358 +static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb);
3359 +static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb);
3360 +static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb,
3361 + int flags, void *ast, void *astarg, void *bast,
3362 + struct dlm_range *range);
3363 +static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags,
3364 + char *name, int namelen);
3367 +static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head)
3369 + gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue);
3371 + if (lkb->lkb_id == first->lkb_id)
3378 + * Return 1 if the locks' ranges overlap
3379 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3382 +static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
3384 + if (!lkb1->lkb_range || !lkb2->lkb_range)
3387 + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3388 + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3395 + * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
3396 + * locks on the convert queue. One of the deadlocked locks is allowed to
3397 + * retain its original granted state (we choose the lkb provided although it
3398 + * shouldn't matter which.) We do not change the granted mode on locks without
3399 + * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
3400 + * the flag consistently) the false return value is used.
3403 +static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
3408 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3412 + if (!ranges_overlap(lkb, this))
3415 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
3417 + if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
3421 + this->lkb_grmode = DLM_LOCK_NL;
3422 + this->lkb_flags |= GDLM_LKFLG_DEMOTED;
3429 + * "A conversion deadlock arises with a pair of lock requests in the converting
3430 + * queue for one resource. The granted mode of each lock blocks the requested
3431 + * mode of the other lock."
3434 +static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3438 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3442 + if (!ranges_overlap(lkb, this))
3445 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3452 + * Check if the given lkb conflicts with another lkb on the queue.
3455 +static int queue_conflict(struct list_head *head, gd_lkb_t *lkb)
3459 + list_for_each_entry(this, head, lkb_statequeue) {
3462 + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3469 + * Deadlock can arise when using the QUECVT flag if the requested mode of the
3470 + * first converting lock is incompatible with the granted mode of another
3471 + * converting lock further down the queue. To prevent this deadlock, a
3472 + * requested QUECVT lock is granted immediately if adding it to the end of
3473 + * the queue would prevent a lock ahead of it from being granted.
3476 +static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3480 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3484 + if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
3491 + * Return 1 if the lock can be granted, 0 otherwise.
3492 + * Also detect and resolve conversion deadlocks.
3495 +static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
3497 + if (lkb->lkb_rqmode == DLM_LOCK_NL)
3500 + if (lkb->lkb_rqmode == lkb->lkb_grmode)
3503 + if (queue_conflict(&rsb->res_grantqueue, lkb))
3506 + if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
3507 + if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3510 + if (list_empty(&rsb->res_convertqueue) ||
3511 + first_in_list(lkb, &rsb->res_convertqueue) ||
3512 + queuecvt_deadlock_detect(rsb, lkb))
3518 + /* there *is* a conflict between this lkb and a converting lock so
3519 + we return false unless conversion deadlock resolution is permitted
3520 + (only conversion requests will have the CONVDEADLK flag set) */
3522 + if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
3525 + if (!conversion_deadlock_detect(rsb, lkb))
3528 + if (conversion_deadlock_resolve(rsb, lkb))
3534 +int dlm_lock(void *lockspace,
3536 + struct dlm_lksb *lksb,
3539 + unsigned int namelen,
3541 + void (*ast) (void *astarg),
3543 + void (*bast) (void *astarg, int mode),
3544 + struct dlm_range *range)
3547 + gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3548 + int ret = -EINVAL;
3550 + lspace = find_lockspace_by_local_id(lockspace);
3554 + if (mode < 0 || mode > DLM_LOCK_EX)
3557 + if (namelen > DLM_RESNAME_MAXLEN)
3560 + if (flags & DLM_LKF_CANCEL)
3563 + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3566 + if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
3569 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3572 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3575 + if (!ast || !lksb)
3578 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK))
3581 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3585 + * Take conversion path.
3588 + if (flags & DLM_LKF_CONVERT) {
3589 + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3595 + * Take new lock path.
3599 + down_read(&lspace->ls_unlock_sem);
3601 + parent_lkb = find_lock_by_id(lspace, parent);
3603 + if (!parent_lkb ||
3604 + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3605 + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3606 + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3607 + up_read(&lspace->ls_unlock_sem);
3611 + atomic_inc(&parent_lkb->lkb_childcnt);
3612 + up_read(&lspace->ls_unlock_sem);
3615 + down_read(&lspace->ls_in_recovery);
3619 + lkb = create_lkb(lspace);
3622 + lkb->lkb_astaddr = ast;
3623 + lkb->lkb_astparam = (long) astarg;
3624 + lkb->lkb_bastaddr = bast;
3625 + lkb->lkb_rqmode = mode;
3626 + lkb->lkb_grmode = DLM_LOCK_IV;
3627 + lkb->lkb_lksb = lksb;
3628 + lkb->lkb_parent = parent_lkb;
3629 + lkb->lkb_lockqueue_flags = flags;
3630 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
3632 + /* Copy the range if appropriate */
3634 + if (range->ra_start > range->ra_end) {
3639 + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3643 + /* Convert relevant flags to internal numbers */
3644 + if (flags & DLM_LKF_VALBLK)
3645 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3646 + if (flags & DLM_LKF_PERSISTENT)
3647 + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3648 + if (flags & DLM_LKF_NODLCKWT)
3649 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3651 + lksb->sb_lkid = lkb->lkb_id;
3653 + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3657 + up_read(&lspace->ls_in_recovery);
3664 + release_lkb(lspace, lkb);
3669 + atomic_dec(&parent_lkb->lkb_childcnt);
3672 + up_read(&lspace->ls_in_recovery);
3678 +int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name,
3681 + gd_res_t *rsb, *parent_rsb = NULL;
3682 + gd_lkb_t *parent_lkb = lkb->lkb_parent;
3688 + parent_rsb = parent_lkb->lkb_resource;
3690 + error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
3694 + lkb->lkb_resource = rsb;
3695 + lkb->lkb_nodeid = rsb->res_nodeid;
3698 + * Next stage, do we need to find the master or can
3699 + * we get on with the real locking work ?
3702 + if (rsb->res_nodeid == -1) {
3703 + if (get_directory_nodeid(rsb) != our_nodeid()) {
3704 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3708 + error = get_resdata(ls, our_nodeid(), rsb->res_name,
3709 + rsb->res_length, &rd, 0);
3713 + nodeid = rd->rd_master_nodeid;
3714 + if (nodeid == our_nodeid())
3716 + rsb->res_nodeid = nodeid;
3717 + lkb->lkb_nodeid = nodeid;
3718 + rsb->res_resdir_seq = rd->rd_sequence;
3721 + error = dlm_lock_stage2(ls, lkb, rsb, flags);
3731 + * Locking routine called after we have an RSB, either a copy of a remote one
3732 + * or a local one, or perhaps a shiny new one all of our very own
3735 +int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags)
3739 + if (rsb->res_nodeid) {
3740 + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3741 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
3743 + dlm_lock_stage3(lkb);
3750 + * Called on an RSB's master node to do stage2 locking for a remote lock
3751 + * request. Returns a proper lkb with rsb ready for lock processing.
3752 + * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
3755 +gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
3756 + struct gd_remlockrequest *freq)
3758 + gd_res_t *rsb = NULL, *parent_rsb = NULL;
3759 + gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3760 + int error, namelen;
3762 + if (freq->rr_remparid) {
3763 + parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
3767 + atomic_inc(&parent_lkb->lkb_childcnt);
3768 + parent_rsb = parent_lkb->lkb_resource;
3772 + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
3773 + * node actually holding the (non-MSTCPY) lkb. AST address are just
3774 + * flags in the master copy.
3777 + lkb = create_lkb(ls);
3780 + lkb->lkb_grmode = DLM_LOCK_IV;
3781 + lkb->lkb_rqmode = freq->rr_rqmode;
3782 + lkb->lkb_parent = parent_lkb;
3783 + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_COMPAST);
3784 + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_BLKAST);
3785 + lkb->lkb_nodeid = remote_nodeid;
3786 + lkb->lkb_remid = freq->rr_header.rh_lkid;
3787 + lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
3788 + lkb->lkb_lockqueue_flags = freq->rr_flags;
3790 + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
3791 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3792 + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
3793 + if (!lkb->lkb_lvbptr)
3797 + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
3798 + error = lkb_set_range(ls, lkb, freq->rr_range_start,
3799 + freq->rr_range_end);
3805 + * Get the RSB which this lock is for. Create a new RSB if this is a
3806 + * new lock on a new resource. We must be the master of any new rsb.
3809 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
3811 + error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
3816 + lkb->lkb_resource = rsb;
3817 + if (rsb->res_nodeid == -1)
3818 + rsb->res_nodeid = 0;
3819 + if (freq->rr_resdir_seq)
3820 + rsb->res_resdir_seq = freq->rr_resdir_seq;
3826 + /* release_lkb handles parent */
3827 + release_lkb(ls, lkb);
3828 + parent_lkb = NULL;
3832 + atomic_dec(&parent_lkb->lkb_childcnt);
3838 + * The final bit of lock request processing on the master node. Here the lock
3839 + * is granted and the completion ast is queued, or the lock is put on the
3840 + * waitqueue and blocking asts are sent.
3843 +void dlm_lock_stage3(gd_lkb_t *lkb)
3845 + gd_res_t *rsb = lkb->lkb_resource;
3848 + * This is a locally mastered lock on a resource that already exists,
3849 + * see if it can be granted or if it must wait. When this function is
3850 + * called for a remote lock request (process_cluster_request,
3851 + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
3852 + * requesting node at the end of process_cluster_request, not at the
3853 + * end of grant_lock.
3856 + down_write(&rsb->res_lock);
3858 + if (can_be_granted(rsb, lkb)) {
3859 + grant_lock(lkb, 0);
3864 + * This request is not a conversion, so the lkb didn't exist other than
3865 + * for this request and should be freed after EAGAIN is returned in the
3869 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
3870 + lkb->lkb_flags |= GDLM_LKFLG_DELAST;
3871 + lkb->lkb_retstatus = -EAGAIN;
3872 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
3873 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
3874 + send_blocking_asts_all(rsb, lkb);
3879 + * The requested lkb must wait. Because the rsb of the requested lkb
3880 + * is mastered here, send blocking asts for the lkbs blocking the
3884 + lkb->lkb_retstatus = 0;
3885 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3887 + send_blocking_asts(rsb, lkb);
3890 + up_write(&rsb->res_lock);
3893 +int dlm_unlock(void *lockspace,
3896 + struct dlm_lksb *lksb,
3899 + gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
3902 + int ret = -EINVAL;
3907 + lkb = find_lock_by_id(ls, lkid);
3911 + /* Can't dequeue a master copy (a remote node's mastered lock) */
3912 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3915 + /* Already waiting for a remote lock operation */
3916 + if (lkb->lkb_lockqueue_state) {
3921 + /* Can only cancel WAITING or CONVERTing locks.
3922 + * This is just a quick check - it is also checked in unlock_stage2()
3923 + * (which may be on the master) under the semaphore.
3925 + if ((flags & DLM_LKF_CANCEL) &&
3926 + (lkb->lkb_status == GDLM_LKSTS_GRANTED))
3929 + /* "Normal" unlocks must operate on a granted lock */
3930 + if (!(flags & DLM_LKF_CANCEL) &&
3931 + (lkb->lkb_status != GDLM_LKSTS_GRANTED))
3934 + down_write(&ls->ls_unlock_sem);
3936 + /* Can't dequeue a lock with sublocks */
3937 + if (atomic_read(&lkb->lkb_childcnt)) {
3938 + up_write(&ls->ls_unlock_sem);
3943 + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
3944 + if (!(flags & DLM_LKF_CANCEL))
3945 + lkb->lkb_flags |= GDLM_LKFLG_DELETED;
3946 + up_write(&ls->ls_unlock_sem);
3948 + /* Save any new params */
3950 + lkb->lkb_lksb = lksb;
3952 + lkb->lkb_astparam = (long) astarg;
3954 + lkb->lkb_lockqueue_flags = flags;
3956 + rsb = lkb->lkb_resource;
3958 + down_read(&ls->ls_in_recovery);
3960 + if (rsb->res_nodeid)
3961 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
3963 + ret = dlm_unlock_stage2(lkb, flags);
3965 + up_read(&ls->ls_in_recovery);
3973 +int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
3975 + gd_res_t *rsb = lkb->lkb_resource;
3977 + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
3979 + down_write(&rsb->res_lock);
3981 + /* Can only cancel WAITING or CONVERTing locks */
3982 + if ((flags & DLM_LKF_CANCEL) &&
3983 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
3984 + lkb->lkb_retstatus = -EINVAL;
3985 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
3989 + old_status = lkb_dequeue(lkb);
3992 + * If was granted grant any converting or waiting locks.
3995 + if (old_status == GDLM_LKSTS_GRANTED)
3996 + grant_pending_locks(rsb);
3999 + * Cancelling a conversion
4002 + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
4003 + /* VMS semantics say we should send blocking ASTs again here */
4004 + send_blocking_asts(rsb, lkb);
4006 + /* Remove from deadlock detection */
4007 + if (lkb->lkb_duetime)
4008 + remove_from_deadlockqueue(lkb);
4010 + /* Stick it back on the granted queue */
4011 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4012 + lkb->lkb_rqmode = lkb->lkb_grmode;
4014 + /* Was it blocking any other locks? */
4015 + if (first_in_list(lkb, &rsb->res_convertqueue))
4016 + grant_pending_locks(rsb);
4018 + lkb->lkb_retstatus = -DLM_ECANCEL;
4019 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4024 + * The lvb can be saved or cleared on unlock.
4027 + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
4028 + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
4029 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4030 + if (flags & DLM_LKF_IVVALBLK)
4031 + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4034 + lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4035 + lkb->lkb_retstatus =
4036 + (flags & DLM_LKF_CANCEL) ? -DLM_ECANCEL : -DLM_EUNLOCK;
4037 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4040 + * Only free the LKB if we are the master copy. Otherwise the AST
4041 + * delivery routine will free it after delivery. queue_ast for MSTCPY
4042 + * lkb just sends a message.
4046 + up_write(&rsb->res_lock);
4047 + release_lkb(rsb->res_ls, lkb);
4053 + up_write(&rsb->res_lock);
4063 +static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb,
4064 + int flags, void *ast, void *astarg, void *bast,
4065 + struct dlm_range *range)
4069 + int ret = -EINVAL;
4071 + lkb = find_lock_by_id(ls, lksb->sb_lkid);
4076 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4081 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4085 + if ((flags & DLM_LKF_QUECVT) &&
4086 + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4090 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4094 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
4098 + /* Set up the ranges as appropriate */
4100 + if (range->ra_start > range->ra_end)
4103 + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4109 + rsb = lkb->lkb_resource;
4110 + down_read(&rsb->res_ls->ls_in_recovery);
4112 + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4113 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4115 + if (flags & DLM_LKF_NODLCKWT)
4116 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4118 + lkb->lkb_astaddr = ast;
4120 + lkb->lkb_astparam = (long) astarg;
4122 + lkb->lkb_bastaddr = bast;
4123 + lkb->lkb_rqmode = mode;
4124 + lkb->lkb_lockqueue_flags = flags;
4125 + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4126 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
4128 + if (rsb->res_nodeid) {
4129 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4130 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4132 + ret = dlm_convert_stage2(lkb, FALSE);
4135 + up_read(&rsb->res_ls->ls_in_recovery);
4144 + * For local conversion requests on locally mastered locks this is called
4145 + * directly from dlm_lock/convert_lock. This function is also called for
4146 + * remote conversion requests of MSTCPY locks (from process_cluster_request).
4149 +int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
4151 + gd_res_t *rsb = lkb->lkb_resource;
4154 + down_write(&rsb->res_lock);
4156 + if (can_be_granted(rsb, lkb)) {
4157 + grant_lock(lkb, 0);
4158 + grant_pending_locks(rsb);
4163 + * Remove lkb from granted queue.
4169 + * The user won't wait so stick it back on the grant queue
4172 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4173 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4174 + ret = lkb->lkb_retstatus = -EAGAIN;
4176 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4177 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4178 + send_blocking_asts_all(rsb, lkb);
4183 + * The lkb's status tells which queue it's on. Put back on convert
4184 + * queue. (QUECVT requests added at end of the queue, all others in
4188 + lkb->lkb_retstatus = 0;
4189 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4192 + * If the request can't be granted
4195 + send_blocking_asts(rsb, lkb);
4197 + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4198 + add_to_deadlockqueue(lkb);
4201 + up_write(&rsb->res_lock);
4206 + * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4207 + * completion ast. rsb res_lock must be held in write when this is called.
4210 +static void grant_lock(gd_lkb_t *lkb, int send_remote)
4212 + gd_res_t *rsb = lkb->lkb_resource;
4214 + if (lkb->lkb_duetime)
4215 + remove_from_deadlockqueue(lkb);
4217 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4219 + GDLM_ASSERT(lkb->lkb_lvbptr,);
4221 + if (!rsb->res_lvbptr)
4222 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4224 + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4226 + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4228 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4231 + if (lkb->lkb_range) {
4232 + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4233 + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4236 + lkb->lkb_grmode = lkb->lkb_rqmode;
4237 + lkb->lkb_rqmode = DLM_LOCK_IV;
4238 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4240 + lkb->lkb_highbast = 0;
4241 + lkb->lkb_retstatus = 0;
4242 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4245 + * A remote conversion request has been granted, either immediately
4246 + * upon being requested or after waiting a bit. In the former case,
4247 + * reply_and_grant() is called. In the latter case send_remote is 1 and
4248 + * remote_grant() is called.
4250 + * The "send_remote" flag is set only for locks which are granted "out
4251 + * of band" - ie by another lock being converted or unlocked.
4253 + * The second case occurs when this lkb is granted right away as part
4254 + * of processing the initial request. In that case, we send a single
4255 + * message in reply_and_grant which combines the request reply with the
4259 + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4261 + remote_grant(lkb);
4262 + else if (lkb->lkb_request)
4263 + reply_and_grant(lkb);
4268 +static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb)
4272 + list_for_each_entry(gr, head, lkb_statequeue) {
4273 + if (gr->lkb_bastaddr &&
4274 + gr->lkb_highbast < lkb->lkb_rqmode &&
4275 + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4276 + queue_ast(gr, GDLM_QUEUE_BLKAST, lkb->lkb_rqmode);
4277 + gr->lkb_highbast = lkb->lkb_rqmode;
4283 + * Notify granted locks if they are blocking a newly forced-to-wait lock.
4286 +static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
4288 + send_bast_queue(&rsb->res_grantqueue, lkb);
4289 + /* check if the following improves performance */
4290 + /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4293 +static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
4295 + send_bast_queue(&rsb->res_grantqueue, lkb);
4296 + send_bast_queue(&rsb->res_convertqueue, lkb);
4300 + * Called when a lock has been dequeued. Look for any locks to grant that are
4301 + * waiting for conversion or waiting to be granted.
4302 + * The rsb res_lock must be held in write when this function is called.
4305 +int grant_pending_locks(gd_res_t *rsb)
4308 + struct list_head *list;
4309 + struct list_head *temp;
4310 + int8_t high = DLM_LOCK_IV;
4312 + list_for_each_safe(list, temp, &rsb->res_convertqueue) {
4313 + lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4315 + if (can_be_granted(rsb, lkb))
4316 + grant_lock(lkb, 1);
4318 + high = MAX(lkb->lkb_rqmode, high);
4321 + list_for_each_safe(list, temp, &rsb->res_waitqueue) {
4322 + lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4324 + if (can_be_granted(rsb, lkb))
4325 + grant_lock(lkb, 1);
4327 + high = MAX(lkb->lkb_rqmode, high);
4331 + * If there are locks left on the wait/convert queue then send blocking
4332 + * ASTs to granted locks that are blocking
4334 + * FIXME: This might generate some spurious blocking ASTs for range
4338 + if (high > DLM_LOCK_IV) {
4339 + list_for_each_safe(list, temp, &rsb->res_grantqueue) {
4340 + lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4342 + if (lkb->lkb_bastaddr &&
4343 + (lkb->lkb_highbast < high) &&
4344 + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4346 + queue_ast(lkb, GDLM_QUEUE_BLKAST, high);
4347 + lkb->lkb_highbast = high;
4356 + * Called to cancel a locking operation that failed due to some internal
4359 + * Waiting locks will be removed, converting locks will be reverted to their
4360 + * granted status, unlocks will be left where they are.
4362 + * A completion AST will be delivered to the caller.
4365 +int cancel_lockop(gd_lkb_t *lkb, int status)
4367 + int state = lkb->lkb_lockqueue_state;
4369 + lkb->lkb_lockqueue_state = 0;
4372 + case GDLM_LQSTATE_WAIT_RSB:
4373 + lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4376 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4377 + res_lkb_dequeue(lkb);
4378 + lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4381 + case GDLM_LQSTATE_WAIT_CONVERT:
4382 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4384 + /* Remove from deadlock detection */
4385 + if (lkb->lkb_duetime) {
4386 + remove_from_deadlockqueue(lkb);
4390 + case GDLM_LQSTATE_WAIT_UNLOCK:
4391 + /* We can leave this. I think.... */
4395 + lkb->lkb_retstatus = status;
4396 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4402 + * Check for conversion deadlock. If a deadlock was found
4403 + * return lkb to kill, else return NULL
4406 +gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb)
4408 + gd_res_t *rsb = lkb->lkb_resource;
4409 + struct list_head *entry;
4411 + GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4413 + /* Work our way up to the head of the queue looking for locks that
4414 + * conflict with us */
4416 + down_read(&rsb->res_lock);
4418 + entry = lkb->lkb_statequeue.prev;
4419 + while (entry != &rsb->res_convertqueue) {
4420 + gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue);
4422 + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4423 + up_read(&rsb->res_lock);
4426 + entry = entry->prev;
4428 + up_read(&rsb->res_lock);
4434 + * Conversion operation was cancelled by us (not the user).
4435 + * ret contains the return code to pass onto the user
4438 +void cancel_conversion(gd_lkb_t *lkb, int ret)
4440 + gd_res_t *rsb = lkb->lkb_resource;
4442 + /* Stick it back on the granted queue */
4443 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4444 + lkb->lkb_rqmode = lkb->lkb_grmode;
4446 + remove_from_deadlockqueue(lkb);
4448 + lkb->lkb_retstatus = ret;
4449 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4454 + * As new master of the rsb for this lkb, we need to handle these requests
4455 + * removed from the lockqueue and originating from local processes:
4456 + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4457 + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4460 +void process_remastered_lkb(gd_lkb_t *lkb, int state)
4463 + case GDLM_LQSTATE_WAIT_RSB:
4464 + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4465 + lkb->lkb_lockqueue_flags,
4466 + lkb->lkb_resource->res_name,
4467 + lkb->lkb_resource->res_length);
4470 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4471 + res_lkb_dequeue(lkb);
4472 + dlm_lock_stage3(lkb);
4475 + case GDLM_LQSTATE_WAIT_UNLOCK:
4476 + dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags);
4479 + case GDLM_LQSTATE_WAIT_CONVERT:
4480 + dlm_convert_stage2(lkb, TRUE);
4487 diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4488 --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
4489 +++ linux-patched/cluster/dlm/locking.h 2004-06-25 18:31:07.000000000 +0800
4491 +/******************************************************************************
4492 +*******************************************************************************
4494 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4495 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4497 +** This copyrighted material is made available to anyone wishing to use,
4498 +** modify, copy, or redistribute it subject to the terms and conditions
4499 +** of the GNU General Public License v.2.
4501 +*******************************************************************************
4502 +******************************************************************************/
4504 +#ifndef __LOCKING_DOT_H__
4505 +#define __LOCKING_DOT_H__
4507 +void process_remastered_lkb(gd_lkb_t * lkb, int state);
4508 +void dlm_lock_stage3(gd_lkb_t * lkb);
4509 +int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast);
4510 +int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags);
4511 +int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb,
4513 +gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen);
4514 +int free_rsb_if_unused(gd_res_t * rsb);
4515 +gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace,
4516 + struct gd_remlockrequest *freq);
4517 +int cancel_lockop(gd_lkb_t * lkb, int status);
4518 +int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags);
4519 +int grant_pending_locks(gd_res_t * rsb);
4520 +void cancel_conversion(gd_lkb_t * lkb, int ret);
4521 +gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb);
4523 +#endif /* __LOCKING_DOT_H__ */
4524 diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4525 --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4526 +++ linux-patched/cluster/dlm/lockqueue.c 2004-06-25 18:31:07.000000000 +0800
4528 +/******************************************************************************
4529 +*******************************************************************************
4531 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4532 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4534 +** This copyrighted material is made available to anyone wishing to use,
4535 +** modify, copy, or redistribute it subject to the terms and conditions
4536 +** of the GNU General Public License v.2.
4538 +*******************************************************************************
4539 +******************************************************************************/
4544 + * This controls the lock queue, which is where locks
4545 + * come when they need to wait for a remote operation
4548 + * This could also be thought of as the "high-level" comms
4553 +#include "dlm_internal.h"
4554 +#include "lockqueue.h"
4556 +#include "locking.h"
4558 +#include "lowcomms.h"
4559 +#include "midcomms.h"
4560 +#include "reccomms.h"
4562 +#include "lockspace.h"
4564 +#include "memory.h"
4566 +#include "queries.h"
4568 +static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply);
4569 +static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req);
4572 + * format of an entry on the request queue
4575 + struct list_head rqe_list;
4576 + uint32_t rqe_nodeid;
4577 + char rqe_request[1];
4581 + * Add a new request (if appropriate) to the request queue and send the remote
4582 + * request out. - runs in the context of the locking caller
4584 + * Recovery of a remote_stage request if the remote end fails while the lkb
4585 + * is still on the lockqueue:
4587 + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4588 + * lockqueue_lkb_mark() at the start of recovery.
4590 + * o Some lkb's will be rebuilt on new master rsb's during recovery.
4591 + * (depends on the type of request, see below).
4593 + * o At the end of recovery, resend_cluster_requests() looks at these
4594 + * LQRESEND lkb's and either:
4596 + * i) resends the request to the new master for the rsb where the
4597 + * request is processed as usual. The lkb remains on the lockqueue until
4598 + * the new master replies and we run process_lockqueue_reply().
4600 + * ii) if we've become the rsb master, remove the lkb from the lockqueue
4601 + * and processes the request locally via process_remastered_lkb().
4603 + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4604 + * and the request should be resent if dest node is failed.
4606 + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4607 + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4608 + * makes send_lkb_queue() skip it). Resend this request to the new master.
4610 + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4611 + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4612 + * Resend this request to the new master.
4614 + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4615 + * It will be rebuilt on the new master rsb's granted queue. Resend this
4616 + * request to the new master.
4619 +int remote_stage(gd_lkb_t *lkb, int state)
4623 + lkb->lkb_lockqueue_state = state;
4624 + add_to_lockqueue(lkb);
4626 + error = send_cluster_request(lkb, state);
4628 + log_print("remote_stage error sending request %d", error);
4630 + /* Leave on lockqueue, it will be resent to correct node during
4634 + lkb->lkb_lockqueue_state = 0;
4635 + remove_from_lockqueue(lkb);
4643 + * Requests received while the lockspace is in recovery get added to the
4644 + * request queue and processed when recovery is complete.
4647 +void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length)
4649 + struct rq_entry *entry;
4651 + if (in_nodes_gone(ls, nodeid))
4654 + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
4656 + // TODO something better
4657 + printk("dlm: add_to_requestqueue: out of memory\n");
4661 + log_debug(ls, "add_to_requestqueue %d", nodeid);
4662 + entry->rqe_nodeid = nodeid;
4663 + memcpy(entry->rqe_request, request, length);
4664 + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
4667 +int process_requestqueue(gd_ls_t *ls)
4669 + int error = 0, count = 0;
4670 + struct rq_entry *entry, *safe;
4671 + struct gd_req_header *req;
4673 + log_all(ls, "process held requests");
4675 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4676 + req = (struct gd_req_header *) entry->rqe_request;
4677 + log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
4679 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4680 + log_debug(ls, "process_requestqueue aborted");
4685 + error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
4686 + if (error == -EINTR) {
4687 + log_debug(ls, "process_requestqueue interrupted");
4691 + list_del(&entry->rqe_list);
4697 + log_all(ls, "processed %d requests", count);
4701 +void wait_requestqueue(gd_ls_t *ls)
4703 + while (!list_empty(&ls->ls_requestqueue) &&
4704 + test_bit(LSFL_LS_RUN, &ls->ls_flags))
4709 + * Resdir requests (lookup or remove) and replies from before recovery are
4710 + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
4711 + * gone are also invalid.
4714 +void purge_requestqueue(gd_ls_t *ls)
4717 + struct rq_entry *entry, *safe;
4718 + struct gd_req_header *req;
4719 + struct gd_remlockrequest *freq;
4722 + log_all(ls, "purge requests");
4724 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4725 + req = (struct gd_req_header *) entry->rqe_request;
4726 + freq = (struct gd_remlockrequest *) req;
4728 + if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
4729 + req->rh_cmd == GDLM_REMCMD_LOOKUP ||
4730 + in_nodes_gone(ls, entry->rqe_nodeid)) {
4732 + list_del(&entry->rqe_list);
4736 + } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
4739 + * Replies to resdir lookups are invalid and must be
4740 + * purged. The lookup requests are marked in
4741 + * lockqueue_lkb_mark and will be resent in
4742 + * resend_cluster_requests. The only way to check if
4743 + * this is a lookup reply is to look at the
4744 + * lockqueue_state of the lkb.
4747 + lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
4748 + GDLM_ASSERT(lkb,);
4749 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
4750 + list_del(&entry->rqe_list);
4757 + log_all(ls, "purged %d requests", count);
4761 + * Check if there's a reply for the given lkid in the requestqueue.
4764 +int reply_in_requestqueue(gd_ls_t *ls, int lkid)
4767 + struct rq_entry *entry, *safe;
4768 + struct gd_req_header *req;
4769 + struct gd_remlockrequest *freq;
4771 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4772 + req = (struct gd_req_header *) entry->rqe_request;
4773 + freq = (struct gd_remlockrequest *) req;
4775 + if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
4776 + freq->rr_header.rh_lkid == lkid) {
4785 +void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src)
4788 + *lvbptr = allocate_lvb(ls);
4790 + memcpy(*lvbptr, src, DLM_LVB_LEN);
4794 + * Process a lockqueue LKB after it has had its remote processing complete and
4795 + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread on
4796 + * the machine that requested the lock.
4799 +static void process_lockqueue_reply(gd_lkb_t *lkb,
4800 + struct gd_remlockreply *reply)
4802 + int state = lkb->lkb_lockqueue_state;
4804 + gd_res_t *rsb = lkb->lkb_resource;
4805 + gd_ls_t *ls = rsb->res_ls;
4807 + lkb->lkb_lockqueue_state = 0;
4809 + remove_from_lockqueue(lkb);
4812 + case GDLM_LQSTATE_WAIT_RSB:
4814 + GDLM_ASSERT(reply->rl_status == 0,);
4816 + if (reply->rl_nodeid == our_nodeid())
4817 + rsb->res_nodeid = 0;
4819 + rsb->res_nodeid = reply->rl_nodeid;
4821 + rsb->res_resdir_seq = reply->rl_resdir_seq;
4822 + lkb->lkb_nodeid = rsb->res_nodeid;
4824 + dlm_lock_stage2(rsb->res_ls, lkb, rsb,
4825 + lkb->lkb_lockqueue_flags);
4828 + case GDLM_LQSTATE_WAIT_CONVERT:
4829 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4832 + * After a remote lock/conversion/grant request we put the lock
4833 + * on the right queue and send an AST if appropriate. Any lock
4834 + * shuffling (eg newly granted locks because this one was
4835 + * converted downwards) will be dealt with in separate messages
4836 + * (which may be in the same network message)
4839 + if (!lkb->lkb_remid)
4840 + lkb->lkb_remid = reply->rl_lkid;
4843 + * The remote request failed (we assume because of NOQUEUE).
4844 + * If this is a new request (non-conv) the lkb was created just
4845 + * for it so the lkb should be freed. If this was a
4846 + * conversion, the lkb already existed so we should put it back
4847 + * on the grant queue.
4850 + if (reply->rl_status != 0) {
4851 + GDLM_ASSERT(reply->rl_status == -EAGAIN,);
4853 + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
4854 + res_lkb_dequeue(lkb);
4855 + lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4857 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4859 + lkb->lkb_retstatus = reply->rl_status;
4860 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4865 + * The remote request was successful in granting the request or
4866 + * queuing it to be granted later. Add the lkb to the
4867 + * appropriate rsb queue.
4870 + switch (reply->rl_lockstate) {
4871 + case GDLM_LKSTS_GRANTED:
4873 + /* Compact version of grant_lock(). */
4875 + down_write(&rsb->res_lock);
4876 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
4877 + memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
4880 + lkb->lkb_grmode = lkb->lkb_rqmode;
4881 + lkb->lkb_rqmode = DLM_LOCK_IV;
4882 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4884 + if (lkb->lkb_range) {
4885 + lkb->lkb_range[GR_RANGE_START] =
4886 + lkb->lkb_range[RQ_RANGE_START];
4887 + lkb->lkb_range[GR_RANGE_END] =
4888 + lkb->lkb_range[RQ_RANGE_END];
4890 + up_write(&rsb->res_lock);
4892 + lkb->lkb_retstatus = 0;
4893 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4896 + case GDLM_LKSTS_WAITING:
4898 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4899 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4901 + log_error(ls, "wait reply for granted %x %u",
4902 + lkb->lkb_id, lkb->lkb_nodeid);
4905 + case GDLM_LKSTS_CONVERT:
4907 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4908 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4910 + log_error(ls, "convert reply for granted %x %u",
4911 + lkb->lkb_id, lkb->lkb_nodeid);
4915 + log_error(ls, "process_lockqueue_reply state %d",
4916 + reply->rl_lockstate);
4921 + case GDLM_LQSTATE_WAIT_UNLOCK:
4924 + * Unlocks should never fail. Update local lock info. This
4925 + * always sends completion AST with status in lksb
4928 + GDLM_ASSERT(reply->rl_status == 0,);
4929 + oldstate = res_lkb_dequeue(lkb);
4931 + /* Differentiate between unlocks and conversion cancellations */
4932 + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
4933 + oldstate == GDLM_LKSTS_CONVERT) {
4934 + res_lkb_enqueue(lkb->lkb_resource, lkb,
4935 + GDLM_LKSTS_GRANTED);
4936 + lkb->lkb_retstatus = -DLM_ECANCEL;
4938 + lkb->lkb_flags |= GDLM_LKFLG_DELAST;
4939 + lkb->lkb_retstatus = -DLM_EUNLOCK;
4941 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
4945 + log_error(ls, "process_lockqueue_reply id %x state %d",
4946 + lkb->lkb_id, state);
4951 + * Tell a remote node to grant a lock. This happens when we are the master
4952 + * copy for a lock that is actually held on a remote node. The remote end is
4953 + * also responsible for sending the completion AST.
4956 +void remote_grant(gd_lkb_t *lkb)
4958 + struct writequeue_entry *e;
4959 + struct gd_remlockrequest *req;
4961 + // TODO Error handling
4962 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
4963 + sizeof(struct gd_remlockrequest),
4964 + lkb->lkb_resource->res_ls->ls_allocation,
4969 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
4970 + req->rr_header.rh_length = sizeof(struct gd_remlockrequest);
4971 + req->rr_header.rh_flags = 0;
4972 + req->rr_header.rh_lkid = lkb->lkb_id;
4973 + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
4974 + req->rr_remlkid = lkb->lkb_remid;
4975 + req->rr_flags = 0;
4977 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
4978 + /* This is a confusing non-standard use of rr_flags which is
4979 + * usually used to pass lockqueue_flags. */
4980 + req->rr_flags |= GDLM_LKFLG_DEMOTED;
4983 + add_request_lvb(lkb, req);
4984 + midcomms_send_buffer(&req->rr_header, e);
4987 +void reply_and_grant(gd_lkb_t *lkb)
4989 + struct gd_remlockrequest *req = lkb->lkb_request;
4990 + struct gd_remlockreply *reply;
4991 + struct writequeue_entry *e;
4993 + // TODO Error handling
4994 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
4995 + sizeof(struct gd_remlockreply),
4996 + lkb->lkb_resource->res_ls->ls_allocation,
4997 + (char **) &reply);
5001 + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5002 + reply->rl_header.rh_flags = 0;
5003 + reply->rl_header.rh_length = sizeof(struct gd_remlockreply);
5004 + reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5005 + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5007 + reply->rl_status = lkb->lkb_retstatus;
5008 + reply->rl_lockstate = lkb->lkb_status;
5009 + reply->rl_lkid = lkb->lkb_id;
5011 + GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
5013 + lkb->lkb_request = NULL;
5015 + add_reply_lvb(lkb, reply);
5016 + midcomms_send_buffer(&reply->rl_header, e);
5020 + * Request removal of a dead entry in the resource directory
5023 +void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen,
5026 + struct writequeue_entry *e;
5027 + struct gd_remlockrequest *req;
5029 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5030 + gd_rcom_t *rc = allocate_rcom_buffer(ls);
5032 + memcpy(rc->rc_buf, name, namelen);
5033 + rc->rc_datalen = namelen;
5035 + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5037 + free_rcom_buffer(rc);
5040 + // TODO Error handling
5041 + e = lowcomms_get_buffer(nodeid,
5042 + sizeof(struct gd_remlockrequest) + namelen - 1,
5043 + ls->ls_allocation, (char **) &req);
5047 + memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1);
5048 + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5049 + req->rr_header.rh_length =
5050 + sizeof(struct gd_remlockrequest) + namelen - 1;
5051 + req->rr_header.rh_flags = 0;
5052 + req->rr_header.rh_lkid = 0;
5053 + req->rr_header.rh_lockspace = ls->ls_global_id;
5054 + req->rr_remlkid = 0;
5055 + req->rr_resdir_seq = sequence;
5056 + memcpy(req->rr_name, name, namelen);
5058 + midcomms_send_buffer(&req->rr_header, e);
5062 + * Send remote cluster request to directory or master node before the request
5063 + * is put on the lock queue. Runs in the context of the locking caller.
5066 +int send_cluster_request(gd_lkb_t *lkb, int state)
5068 + uint32_t target_nodeid;
5069 + gd_res_t *rsb = lkb->lkb_resource;
5070 + gd_ls_t *ls = rsb->res_ls;
5071 + struct gd_remlockrequest *req;
5072 + struct writequeue_entry *e;
5074 + /* Need to know the target nodeid before we allocate a send buffer */
5075 + target_nodeid = lkb->lkb_nodeid;
5076 + GDLM_ASSERT(target_nodeid != 0,);
5078 + if (state == GDLM_LQSTATE_WAIT_RSB)
5079 + target_nodeid = get_directory_nodeid(rsb);
5081 + GDLM_ASSERT(target_nodeid,);
5083 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5084 + /* this may happen when called by resend_cluster_request */
5085 + log_error(ls, "send_cluster_request to %u state %d recovery",
5086 + target_nodeid, state);
5089 + e = lowcomms_get_buffer(target_nodeid,
5090 + sizeof(struct gd_remlockrequest) +
5091 + rsb->res_length - 1, ls->ls_allocation,
5095 + memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1);
5097 + /* Common stuff, some are just defaults */
5099 + if (lkb->lkb_bastaddr)
5100 + req->rr_asts = GDLM_QUEUE_BLKAST;
5101 + if (lkb->lkb_astaddr)
5102 + req->rr_asts |= GDLM_QUEUE_COMPAST;
5103 + if (lkb->lkb_parent)
5104 + req->rr_remparid = lkb->lkb_parent->lkb_remid;
5106 + req->rr_flags = lkb->lkb_lockqueue_flags;
5107 + req->rr_rqmode = lkb->lkb_rqmode;
5108 + req->rr_remlkid = lkb->lkb_remid;
5109 + req->rr_header.rh_length =
5110 + sizeof(struct gd_remlockrequest) + rsb->res_length - 1;
5111 + req->rr_header.rh_flags = 0;
5112 + req->rr_header.rh_lkid = lkb->lkb_id;
5113 + req->rr_header.rh_lockspace = ls->ls_global_id;
5117 + case GDLM_LQSTATE_WAIT_RSB:
5119 + /* The lock must be a root lock */
5120 + GDLM_ASSERT(!lkb->lkb_parent,);
5122 + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5123 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5126 + case GDLM_LQSTATE_WAIT_CONVERT:
5128 + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5129 + if (lkb->lkb_range) {
5130 + req->rr_flags |= GDLM_LKFLG_RANGE;
5131 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5132 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5136 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5138 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5139 + req->rr_resdir_seq = rsb->res_resdir_seq;
5140 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5141 + if (lkb->lkb_range) {
5142 + req->rr_flags |= GDLM_LKFLG_RANGE;
5143 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5144 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5148 + case GDLM_LQSTATE_WAIT_UNLOCK:
5150 + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5154 + GDLM_ASSERT(!"Unknown cluster request",);
5157 + add_request_lvb(lkb, req);
5158 + midcomms_send_buffer(&req->rr_header, e);
5164 + * We got a request from another cluster node, process it and return an info
5165 + * structure with the lock state/LVB etc as required. Executes in the DLM's
5169 +int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery)
5172 + gd_lkb_t *lkb = NULL;
5174 + int send_reply = 0, status = 0, namelen;
5175 + struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req;
5176 + struct gd_remlockreply reply;
5178 + lspace = find_lockspace_by_global_id(req->rh_lockspace);
5181 + log_print("process_cluster_request invalid lockspace %x "
5182 + "from %d req %u", req->rh_lockspace, nodeid,
5188 + /* wait for recoverd to drain requestqueue */
5190 + wait_requestqueue(lspace);
5193 + * If we're in recovery then queue the request for later. Otherwise,
5194 + * we still need to get the "in_recovery" lock to make sure the
5195 + * recovery itself doesn't start until we are done.
5198 + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5199 + if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
5200 + log_error(lspace, "process_cluster_request warning %u",
5202 + add_to_requestqueue(lspace, nodeid, (char *) req,
5204 + log_debug(lspace, "process_cluster_request abort");
5208 + if (!down_read_trylock(&lspace->ls_in_recovery)) {
5215 + * Process the request.
5218 + switch (req->rh_cmd) {
5220 + case GDLM_REMCMD_LOOKUP:
5224 + uint32_t dir_nodeid;
5226 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5228 + dir_nodeid = name_to_directory_nodeid(lspace,
5231 + if (dir_nodeid != our_nodeid())
5232 + log_debug(lspace, "ignoring directory lookup");
5234 + status = get_resdata(lspace, nodeid, freq->rr_name,
5239 + reply.rl_status = status;
5240 + reply.rl_lockstate = 0;
5241 + reply.rl_nodeid = rd->rd_master_nodeid;
5242 + reply.rl_resdir_seq = rd->rd_sequence;
5247 + case GDLM_REMCMD_REM_RESDATA:
5249 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5250 + remove_resdata(lspace, nodeid, freq->rr_name, namelen,
5251 + freq->rr_resdir_seq);
5254 + case GDLM_REMCMD_LOCKREQUEST:
5256 + lkb = remote_stage2(nodeid, lspace, freq);
5258 + lkb->lkb_request = freq;
5259 + dlm_lock_stage3(lkb);
5262 + * If the request was granted in lock_stage3, then a
5263 + * reply message was already sent in combination with
5264 + * the grant message and lkb_request is NULL.
5267 + if (lkb->lkb_request) {
5268 + lkb->lkb_request = NULL;
5270 + reply.rl_status = lkb->lkb_retstatus;
5271 + reply.rl_lockstate = lkb->lkb_status;
5272 + reply.rl_lkid = lkb->lkb_id;
5275 + * If the request could not be granted and the
5276 + * user won't wait, then free up the LKB
5279 + if (lkb->lkb_flags & GDLM_LKFLG_DELAST) {
5280 + rsb = lkb->lkb_resource;
5281 + release_lkb(lspace, lkb);
5287 + reply.rl_status = -ENOMEM;
5292 + case GDLM_REMCMD_CONVREQUEST:
5294 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5296 + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5298 + freq->rr_header.rh_lkid, nodeid););
5300 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5301 + log_error(lspace, "convrequest: invalid status %d",
5304 + lkb->lkb_rqmode = freq->rr_rqmode;
5305 + lkb->lkb_lockqueue_flags = freq->rr_flags;
5306 + lkb->lkb_request = freq;
5307 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5309 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK
5310 + || freq->rr_flags & DLM_LKF_VALBLK) {
5311 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5312 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5316 + if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5317 + if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5318 + freq->rr_range_end)) {
5319 + reply.rl_status = -ENOMEM;
5325 + dlm_convert_stage2(lkb, FALSE);
5328 + * If the conv request was granted in stage2, then a reply
5329 + * message was already sent in combination with the grant
5333 + if (lkb->lkb_request) {
5334 + lkb->lkb_request = NULL;
5336 + reply.rl_status = lkb->lkb_retstatus;
5337 + reply.rl_lockstate = lkb->lkb_status;
5338 + reply.rl_lkid = lkb->lkb_id;
5342 + case GDLM_REMCMD_LOCKREPLY:
5344 + lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid);
5346 + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5348 + freq->rr_header.rh_lkid, nodeid););
5350 + process_lockqueue_reply(lkb, (struct gd_remlockreply *) req);
5353 + case GDLM_REMCMD_LOCKGRANT:
5356 + * Remote lock has been granted asynchronously. Do a compact
5357 + * version of what grant_lock() does.
5360 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5362 + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5364 + freq->rr_header.rh_lkid, nodeid););
5366 + rsb = lkb->lkb_resource;
5368 + if (lkb->lkb_lockqueue_state)
5369 + log_error(rsb->res_ls, "granting lock on lockqueue "
5370 + "id=%x from=%u lqstate=%d flags=%x",
5371 + lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state,
5374 + down_write(&rsb->res_lock);
5376 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5377 + memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
5379 + lkb->lkb_grmode = lkb->lkb_rqmode;
5380 + lkb->lkb_rqmode = DLM_LOCK_IV;
5382 + if (lkb->lkb_range) {
5383 + lkb->lkb_range[GR_RANGE_START] =
5384 + lkb->lkb_range[RQ_RANGE_START];
5385 + lkb->lkb_range[GR_RANGE_END] =
5386 + lkb->lkb_range[RQ_RANGE_END];
5389 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5390 + up_write(&rsb->res_lock);
5392 + if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5393 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5395 + lkb->lkb_retstatus = 0;
5396 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
5399 + case GDLM_REMCMD_SENDBAST:
5401 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5403 + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5405 + freq->rr_header.rh_lkid, nodeid););
5407 + if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5408 + queue_ast(lkb, GDLM_QUEUE_BLKAST, freq->rr_rqmode);
5411 + case GDLM_REMCMD_SENDCAST:
5413 + /* This is only used for some error completion ASTs */
5415 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5417 + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5419 + freq->rr_header.rh_lkid, nodeid););
5421 + /* Return the lock to granted status */
5422 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5424 + lkb->lkb_retstatus = freq->rr_status;
5425 + queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
5428 + case GDLM_REMCMD_UNLOCKREQUEST:
5430 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5432 + GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5434 + freq->rr_header.rh_lkid, nodeid););
5436 + reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags);
5440 + case GDLM_REMCMD_QUERY:
5441 + remote_query(nodeid, lspace, req);
5444 + case GDLM_REMCMD_QUERYREPLY:
5445 + remote_query_reply(nodeid, lspace, req);
5449 + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
5452 + up_read(&lspace->ls_in_recovery);
5456 + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5457 + reply.rl_header.rh_flags = 0;
5458 + reply.rl_header.rh_length = sizeof(reply);
5459 + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
5460 + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
5462 + status = midcomms_send_message(nodeid, &reply.rl_header,
5471 +static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply)
5473 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5474 + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5477 +static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req)
5479 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5480 + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5482 diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
5483 --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
5484 +++ linux-patched/cluster/dlm/lockqueue.h 2004-06-25 18:31:07.000000000 +0800
5486 +/******************************************************************************
5487 +*******************************************************************************
5489 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5490 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5492 +** This copyrighted material is made available to anyone wishing to use,
5493 +** modify, copy, or redistribute it subject to the terms and conditions
5494 +** of the GNU General Public License v.2.
5496 +*******************************************************************************
5497 +******************************************************************************/
5499 +#ifndef __LOCKQUEUE_DOT_H__
5500 +#define __LOCKQUEUE_DOT_H__
5502 +void remote_grant(gd_lkb_t * lkb);
5503 +void reply_and_grant(gd_lkb_t * lkb);
5504 +int remote_stage(gd_lkb_t * lkb, int state);
5505 +int process_cluster_request(int csid, struct gd_req_header *req, int recovery);
5506 +int send_cluster_request(gd_lkb_t * lkb, int state);
5507 +void purge_requestqueue(gd_ls_t * ls);
5508 +int process_requestqueue(gd_ls_t * ls);
5509 +int reply_in_requestqueue(gd_ls_t * ls, int lkid);
5510 +void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen,
5511 + uint8_t sequence);
5512 +void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src);
5514 +#endif /* __LOCKQUEUE_DOT_H__ */
5515 diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
5516 --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
5517 +++ linux-patched/cluster/dlm/lockspace.c 2004-06-25 18:31:07.000000000 +0800
5519 +/******************************************************************************
5520 +*******************************************************************************
5522 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5523 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5525 +** This copyrighted material is made available to anyone wishing to use,
5526 +** modify, copy, or redistribute it subject to the terms and conditions
5527 +** of the GNU General Public License v.2.
5529 +*******************************************************************************
5530 +******************************************************************************/
5532 +#include <linux/module.h>
5534 +#include "dlm_internal.h"
5535 +#include "recoverd.h"
5540 +#include "lowcomms.h"
5541 +#include "config.h"
5542 +#include "memory.h"
5543 +#include "lockspace.h"
5544 +#include "device.h"
5546 +#define GDST_NONE (0)
5547 +#define GDST_RUNNING (1)
5549 +static int gdlmstate;
5550 +static int gdlmcount;
5551 +static struct semaphore gdlmstate_lock;
5552 +struct list_head lslist;
5553 +spinlock_t lslist_lock;
5554 +struct kcl_service_ops ls_ops;
5556 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
5559 +void dlm_lockspace_init(void)
5561 + gdlmstate = GDST_NONE;
5563 + init_MUTEX(&gdlmstate_lock);
5564 + INIT_LIST_HEAD(&lslist);
5565 + spin_lock_init(&lslist_lock);
5568 +gd_ls_t *find_lockspace_by_global_id(uint32_t id)
5572 + spin_lock(&lslist_lock);
5574 + list_for_each_entry(ls, &lslist, ls_list) {
5575 + if (ls->ls_global_id == id)
5580 + spin_unlock(&lslist_lock);
5584 +/* TODO: make this more efficient */
5585 +gd_ls_t *find_lockspace_by_local_id(void *id)
5589 + spin_lock(&lslist_lock);
5591 + list_for_each_entry(ls, &lslist, ls_list) {
5592 + if (ls->ls_local_id == (uint32_t)(long)id)
5597 + spin_unlock(&lslist_lock);
5601 +gd_ls_t *find_lockspace_by_name(char *name, int namelen)
5605 + spin_lock(&lslist_lock);
5607 + list_for_each_entry(ls, &lslist, ls_list) {
5608 + if (ls->ls_namelen == namelen &&
5609 + memcmp(ls->ls_name, name, namelen) == 0)
5614 + spin_unlock(&lslist_lock);
5619 + * Called from dlm_init. These are the general threads which are not
5620 + * lockspace-specific and work for all gdlm lockspaces.
5623 +static int threads_start(void)
5627 + /* Thread which interacts with cman for all ls's */
5628 + error = recoverd_start();
5630 + log_print("cannot start recovery thread %d", error);
5634 + /* Thread which process lock requests for all ls's */
5635 + error = astd_start();
5637 + log_print("cannot start ast thread %d", error);
5638 + goto recoverd_fail;
5641 + /* Thread for sending/receiving messages for all ls's */
5642 + error = lowcomms_start();
5644 + log_print("cannot start lowcomms %d", error);
5660 +static void threads_stop(void)
5667 +static int init_internal(void)
5671 + if (gdlmstate == GDST_RUNNING)
5674 + error = threads_start();
5678 + gdlmstate = GDST_RUNNING;
5688 + * Called after gdlm module is loaded and before any lockspaces are created.
5689 + * Starts and initializes global threads and structures. These global entities
5690 + * are shared by and independent of all lockspaces.
5692 + * There should be a gdlm-specific user command which a person can run which
5693 + * calls this function. If a user hasn't run that command and something
5694 + * creates a new lockspace, this is called first.
5696 + * This also starts the default lockspace.
5703 + down(&gdlmstate_lock);
5704 + error = init_internal();
5705 + up(&gdlmstate_lock);
5710 +int dlm_release(void)
5714 + down(&gdlmstate_lock);
5716 + if (gdlmstate == GDST_NONE)
5725 + spin_lock(&lslist_lock);
5726 + if (!list_empty(&lslist)) {
5727 + spin_unlock(&lslist_lock);
5728 + log_print("cannot stop threads, lockspaces still exist");
5731 + spin_unlock(&lslist_lock);
5734 + gdlmstate = GDST_NONE;
5737 + up(&gdlmstate_lock);
5742 +gd_ls_t *allocate_ls(int namelen)
5746 + /* FIXME: use appropriate malloc type */
5748 + ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL);
5750 + memset(ls, 0, sizeof(gd_ls_t) + namelen);
5755 +void free_ls(gd_ls_t *ls)
5760 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
5763 + int i, error = -ENOMEM;
5764 + uint32_t local_id = 0;
5766 + if (!try_module_get(THIS_MODULE))
5769 + if (namelen > MAX_SERVICE_NAME_LEN)
5772 + if ((ls = find_lockspace_by_name(name, namelen))) {
5773 + *lockspace = (void *)ls->ls_local_id;
5778 + * Initialize ls fields
5781 + ls = allocate_ls(namelen);
5785 + memcpy(ls->ls_name, name, namelen);
5786 + ls->ls_namelen = namelen;
5788 + ls->ls_allocation = GFP_KERNEL;
5789 + memset(&ls->ls_flags, 0, sizeof(unsigned long));
5790 + INIT_LIST_HEAD(&ls->ls_rootres);
5791 + ls->ls_hashsize = dlm_config.reshashtbl;
5792 + ls->ls_hashmask = ls->ls_hashsize - 1;
5794 + ls->ls_reshashtbl =
5795 + kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL);
5796 + if (!ls->ls_reshashtbl)
5799 + for (i = 0; i < ls->ls_hashsize; i++)
5800 + INIT_LIST_HEAD(&ls->ls_reshashtbl[i]);
5802 + rwlock_init(&ls->ls_reshash_lock);
5804 + if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1)
5807 + INIT_LIST_HEAD(&ls->ls_nodes);
5808 + ls->ls_num_nodes = 0;
5809 + INIT_LIST_HEAD(&ls->ls_nodes_gone);
5810 + INIT_LIST_HEAD(&ls->ls_recover);
5811 + spin_lock_init(&ls->ls_recover_lock);
5812 + INIT_LIST_HEAD(&ls->ls_recover_list);
5813 + ls->ls_recover_list_count = 0;
5814 + spin_lock_init(&ls->ls_recover_list_lock);
5815 + init_waitqueue_head(&ls->ls_wait_general);
5816 + INIT_LIST_HEAD(&ls->ls_requestqueue);
5817 + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
5818 + ls->ls_last_stop = 0;
5819 + ls->ls_last_start = 0;
5820 + ls->ls_last_finish = 0;
5821 + ls->ls_rcom_msgid = 0;
5822 + init_MUTEX(&ls->ls_rcom_lock);
5823 + init_rwsem(&ls->ls_in_recovery);
5824 + init_rwsem(&ls->ls_unlock_sem);
5825 + init_rwsem(&ls->ls_rec_rsblist);
5826 + init_rwsem(&ls->ls_gap_rsblist);
5827 + down_write(&ls->ls_in_recovery);
5829 + for (i = 0; i < RESDIRHASH_SIZE; i++) {
5830 + INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist);
5831 + rwlock_init(&ls->ls_resdir_hash[i].rb_lock);
5834 + if (flags & DLM_LSF_NOTIMERS)
5835 + set_bit(LSFL_NOTIMERS, &ls->ls_flags);
5838 + * Connect this lockspace with the cluster manager
5841 + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
5842 + &ls_ops, TRUE, (void *) ls, &local_id);
5844 + goto out_idtblfree;
5846 + ls->ls_state = LSST_INIT;
5847 + ls->ls_local_id = local_id;
5849 + spin_lock(&lslist_lock);
5850 + list_add(&ls->ls_list, &lslist);
5851 + spin_unlock(&lslist_lock);
5853 + error = kcl_join_service(local_id);
5855 + log_error(ls, "service manager join error %d", error);
5859 + /* The ls isn't actually running until it receives a start() from CMAN.
5860 + * Neither does it have a global ls id until started. */
5863 + /* Return the local ID as the lockspace handle. I've left this
5864 + cast to a void* as it allows us to replace it with pretty much
5865 + anything at a future date without breaking clients. But returning
5866 + the address of the lockspace is a bad idea as it could get
5867 + forcibly removed, leaving client with a dangling pointer */
5868 + *lockspace = (void *)local_id;
5873 + kcl_unregister_service(ls->ls_local_id);
5876 + free_lockidtbl(ls);
5879 + kfree(ls->ls_reshashtbl);
5889 + * Called by a system like GFS which wants independent lock spaces.
5892 +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
5894 + int error = -ENOSYS;
5896 + down(&gdlmstate_lock);
5898 + error = init_internal();
5902 + error = new_lockspace(name, namelen, lockspace, flags);
5905 + up(&gdlmstate_lock);
5910 +/* Return 1 if the lockspace still has active remote locks,
5911 + * 2 if the lockspace still has active local locks.
5913 +static int lockspace_busy(gd_ls_t *ls)
5916 + int lkb_found = 0;
5919 + /* NOTE: We check the lockidtbl here rather than the resource table.
5920 + * This is because there may be LKBs queued as ASTs that have been unlinked
5921 + * from their RSBs and are pending deletion once the AST has been delivered
5923 + read_lock(&ls->ls_lockidtbl_lock);
5924 + for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5925 + if (!list_empty(&ls->ls_lockidtbl[i].list)) {
5927 + list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) {
5928 + if (!lkb->lkb_nodeid) {
5929 + read_unlock(&ls->ls_lockidtbl_lock);
5935 + read_unlock(&ls->ls_lockidtbl_lock);
5939 +/* Actually release the lockspace */
5940 +static int release_lockspace(gd_ls_t *ls, int force)
5946 + struct list_head *head;
5948 + int busy = lockspace_busy(ls);
5950 + /* Don't destroy a busy lockspace */
5955 + kcl_leave_service(ls->ls_local_id);
5956 + kcl_unregister_service(ls->ls_local_id);
5959 + spin_lock(&lslist_lock);
5960 + list_del(&ls->ls_list);
5961 + spin_unlock(&lslist_lock);
5964 + * Free resdata structs.
5970 + * Free all lkb's on lockidtbl[] lists.
5973 + for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5974 + head = &ls->ls_lockidtbl[i].list;
5975 + while (!list_empty(head)) {
5976 + lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list);
5977 + list_del(&lkb->lkb_idtbl_list);
5979 + if (lkb->lkb_lockqueue_state)
5980 + remove_from_lockqueue(lkb);
5982 + if (lkb->lkb_asts_to_deliver)
5983 + list_del(&lkb->lkb_astqueue);
5985 + if (lkb->lkb_lvbptr
5986 + && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
5987 + free_lvb(lkb->lkb_lvbptr);
5994 + * Free lkidtbl[] itself
5997 + kfree(ls->ls_lockidtbl);
6000 + * Free all rsb's on reshashtbl[] lists
6003 + for (i = 0; i < ls->ls_hashsize; i++) {
6004 + head = &ls->ls_reshashtbl[i];
6005 + while (!list_empty(head)) {
6006 + rsb = list_entry(head->next, gd_res_t, res_hashchain);
6007 + list_del(&rsb->res_hashchain);
6009 + if (rsb->res_lvbptr)
6010 + free_lvb(rsb->res_lvbptr);
6017 + * Free reshashtbl[] itself
6020 + kfree(ls->ls_reshashtbl);
6023 + * Free structures on any other lists
6026 + head = &ls->ls_recover;
6027 + while (!list_empty(head)) {
6028 + gr = list_entry(head->next, gd_recover_t, gr_list);
6029 + list_del(&gr->gr_list);
6030 + free_dlm_recover(gr);
6033 + head = &ls->ls_nodes;
6034 + while (!list_empty(head)) {
6035 + csb = list_entry(head->next, gd_csb_t, csb_list);
6036 + list_del(&csb->csb_list);
6040 + head = &ls->ls_nodes_gone;
6041 + while (!list_empty(head)) {
6042 + csb = list_entry(head->next, gd_csb_t, csb_list);
6043 + list_del(&csb->csb_list);
6051 + module_put(THIS_MODULE);
6057 + * Called when a system has released all its locks and is not going to use the
6058 + * lockspace any longer. We blindly free everything we're managing for this
6059 + * lockspace. Remaining nodes will go through the recovery process as if we'd
6060 + * died. The lockspace must continue to function as usual, participating in
6061 + * recoveries, until kcl_leave_service returns.
6063 + * Force has 4 possible values:
6064 + * 0 - don't destroy lockspace if it has any LKBs
6065 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6066 + * 2 - destroy lockspace regardless of LKBs
6067 + * 3 - destroy lockspace as part of a forced shutdown
6070 +int dlm_release_lockspace(void *lockspace, int force)
6074 + ls = find_lockspace_by_local_id(lockspace);
6078 + return release_lockspace(ls, force);
6082 +/* Called when the cluster is being shut down dirtily */
6083 +void dlm_emergency_shutdown()
6088 + /* Shut lowcomms down to prevent any socket activity */
6089 + lowcomms_stop_accept();
6091 + /* Delete the devices that belong to the userland
6092 + lockspaces to be deleted. */
6093 + dlm_device_free_devices();
6095 + /* Now try to clean the lockspaces */
6096 + spin_lock(&lslist_lock);
6098 + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6099 + spin_unlock(&lslist_lock);
6100 + release_lockspace(ls, 3);
6101 + spin_lock(&lslist_lock);
6104 + spin_unlock(&lslist_lock);
6107 +gd_recover_t *allocate_dlm_recover(void)
6111 + gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL);
6113 + memset(gr, 0, sizeof(gd_recover_t));
6118 +void free_dlm_recover(gd_recover_t * gr)
6124 + * Called by CMAN on a specific ls. "stop" means set flag which while set
6125 + * causes all new requests to ls to be queued and not submitted until flag is
6126 + * cleared. stop on a ls also needs to cancel any prior starts on the ls.
6127 + * The recoverd thread carries out any work called for by this event.
6130 +static int dlm_ls_stop(void *servicedata)
6132 + gd_ls_t *ls = (gd_ls_t *) servicedata;
6135 + spin_lock(&ls->ls_recover_lock);
6136 + ls->ls_last_stop = ls->ls_last_start;
6137 + set_bit(LSFL_LS_STOP, &ls->ls_flags);
6138 + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6139 + spin_unlock(&ls->ls_recover_lock);
6142 + * This in_recovery lock does two things:
6144 + * 1) Keeps this function from returning until all threads are out
6145 + * of locking routines and locking is truly stopped.
6146 + * 2) Keeps any new requests from being processed until it's unlocked
6147 + * when recovery is complete.
6151 + down_write(&ls->ls_in_recovery);
6153 + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6154 + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6155 + clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6156 + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6158 + recoverd_kick(ls);
6164 + * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6165 + * request processing which first requires that the recovery procedure be
6166 + * stepped through with all nodes sharing the lockspace (nodeids). The first
6167 + * start on the ls after it's created is a special case and requires some extra
6168 + * work like figuring out our own local nodeid. We can't do all this in the
6169 + * calling CMAN context, so we must pass this work off to the recoverd thread
6170 + * which was created in gdlm_init(). The recoverd thread carries out any work
6171 + * called for by this event.
6174 +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6175 + int event_id, int type)
6177 + gd_ls_t *ls = (gd_ls_t *) servicedata;
6179 + int error = -ENOMEM;
6181 + gr = allocate_dlm_recover();
6185 + gr->gr_nodeids = nodeids;
6186 + gr->gr_node_count = count;
6187 + gr->gr_event_id = event_id;
6189 + spin_lock(&ls->ls_recover_lock);
6190 + ls->ls_last_start = event_id;
6191 + list_add_tail(&gr->gr_list, &ls->ls_recover);
6192 + set_bit(LSFL_LS_START, &ls->ls_flags);
6193 + spin_unlock(&ls->ls_recover_lock);
6195 + recoverd_kick(ls);
6203 + * Called by CMAN on a specific ls. "finish" means that all nodes which
6204 + * received a "start" have completed the start and called kcl_start_done.
6205 + * The recoverd thread carries out any work called for by this event.
6208 +static void dlm_ls_finish(void *servicedata, int event_id)
6210 + gd_ls_t *ls = (gd_ls_t *) servicedata;
6212 + spin_lock(&ls->ls_recover_lock);
6213 + ls->ls_last_finish = event_id;
6214 + set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6215 + spin_unlock(&ls->ls_recover_lock);
6217 + recoverd_kick(ls);
6220 +struct kcl_service_ops ls_ops = {
6221 + .stop = dlm_ls_stop,
6222 + .start = dlm_ls_start,
6223 + .finish = dlm_ls_finish
6225 diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6226 --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
6227 +++ linux-patched/cluster/dlm/lockspace.h 2004-06-25 18:31:07.000000000 +0800
6229 +/******************************************************************************
6230 +*******************************************************************************
6232 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6233 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6235 +** This copyrighted material is made available to anyone wishing to use,
6236 +** modify, copy, or redistribute it subject to the terms and conditions
6237 +** of the GNU General Public License v.2.
6239 +*******************************************************************************
6240 +******************************************************************************/
6242 +#ifndef __LOCKSPACE_DOT_H__
6243 +#define __LOCKSPACE_DOT_H__
6245 +void dlm_lockspace_init(void);
6246 +int dlm_init(void);
6247 +int dlm_release(void);
6248 +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6249 +int dlm_release_lockspace(void *ls, int force);
6250 +gd_ls_t *find_lockspace_by_global_id(uint32_t id);
6251 +gd_ls_t *find_lockspace_by_local_id(void *id);
6252 +gd_ls_t *find_lockspace_by_name(char *name, int namelen);
6253 +void free_dlm_recover(gd_recover_t *gr);
6254 +int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out);
6255 +void dlm_emergency_shutdown(void);
6257 +#endif /* __LOCKSPACE_DOT_H__ */
6258 diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6259 --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
6260 +++ linux-patched/cluster/dlm/lowcomms.c 2004-06-25 18:31:07.000000000 +0800
6262 +/******************************************************************************
6263 +*******************************************************************************
6265 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6266 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6268 +** This copyrighted material is made available to anyone wishing to use,
6269 +** modify, copy, or redistribute it subject to the terms and conditions
6270 +** of the GNU General Public License v.2.
6272 +*******************************************************************************
6273 +******************************************************************************/
6278 + * This is the "low-level" comms layer.
6280 + * It is responsible for sending/receiving messages
6281 + * from other nodes in the cluster.
6283 + * Cluster nodes are referred to by their nodeids. nodeids are
6284 + * simply 32 bit numbers to the locking module - if they need to
6285 + * be expanded for the cluster infrastructure then that is its
6286 + * responsibility. It is this layer's
6287 + * responsibility to resolve these into IP address or
6288 + * whatever it needs for inter-node communication.
6290 + * The comms level is two kernel threads that deal mainly with
6291 + * the receiving of messages from other nodes and passing them
6292 + * up to the mid-level comms layer (which understands the
6293 + * message format) for execution by the locking core, and
6294 + * a send thread which does all the setting up of connections
6295 + * to remote nodes and the sending of data. Threads are not allowed
6296 + * to send their own data because it may cause them to wait in times
6297 + * of high load. Also, this way, the sending thread can collect together
6298 + * messages bound for one node and send them in one block.
6300 + * I don't see any problem with the recv thread executing the locking
6301 + * code on behalf of remote processes as the locking code is
6302 + * short, efficient and never waits.
6307 +#include <asm/ioctls.h>
6308 +#include <net/sock.h>
6309 +#include <net/tcp.h>
6310 +#include <linux/pagemap.h>
6311 +#include <cluster/cnxman.h>
6313 +#include "dlm_internal.h"
6314 +#include "lowcomms.h"
6315 +#include "midcomms.h"
6316 +#include "config.h"
6324 +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6325 +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6326 +#define CBUF_EMPTY(cb) ((cb)->len == 0)
6327 +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6328 +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6329 + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6330 +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6332 +struct connection {
6333 + struct socket *sock; /* NULL if not connected */
6334 + uint32_t nodeid; /* So we know who we are in the list */
6335 + struct rw_semaphore sock_sem; /* Stop connect races */
6336 + struct list_head read_list; /* On this list when ready for reading */
6337 + struct list_head write_list; /* On this list when ready for writing */
6338 + struct list_head state_list; /* On this list when ready to connect */
6339 + unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6340 +#define CF_READ_PENDING 1
6341 +#define CF_WRITE_PENDING 2
6342 +#define CF_CONNECT_PENDING 3
6343 +#define CF_IS_OTHERSOCK 4
6344 + struct list_head writequeue; /* List of outgoing writequeue_entries */
6345 + struct list_head listenlist; /* List of allocated listening sockets */
6346 + spinlock_t writequeue_lock;
6347 + int (*rx_action) (struct connection *); /* What to do when active */
6348 + struct page *rx_page;
6351 +#define MAX_CONNECT_RETRIES 3
6352 + struct connection *othersock;
6354 +#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6355 +#define nodeid2con(x) (&connections[(x)])
6357 +/* An entry waiting to be sent */
6358 +struct writequeue_entry {
6359 + struct list_head list;
6360 + struct page *page;
6365 + struct connection *con;
6368 +/* "Template" structure for IPv4 and IPv6 used to fill
6369 + * in the missing bits when converting between cman (which knows
6370 + * nothing about sockaddr structs) and real life where we actually
6371 + * have to connect to these addresses. Also one of these structs
6372 + * will hold the cached "us" address.
6374 + * It's an in6 sockaddr just so there's enough space for anything
6375 + * we're likely to see here.
6377 +static struct sockaddr_in6 local_addr;
6379 +/* Manage daemons */
6380 +static struct semaphore thread_lock;
6381 +static struct completion thread_completion;
6382 +static atomic_t send_run;
6383 +static atomic_t recv_run;
6385 +/* An array of connections, indexed by NODEID */
6386 +static struct connection *connections;
6387 +static int conn_array_size;
6388 +static atomic_t writequeue_length;
6389 +static atomic_t accepting;
6391 +static wait_queue_t lowcomms_send_waitq_head;
6392 +static wait_queue_head_t lowcomms_send_waitq;
6394 +static wait_queue_t lowcomms_recv_waitq_head;
6395 +static wait_queue_head_t lowcomms_recv_waitq;
6397 +/* List of sockets that have reads pending */
6398 +static struct list_head read_sockets;
6399 +static spinlock_t read_sockets_lock;
6401 +/* List of sockets which have writes pending */
6402 +static struct list_head write_sockets;
6403 +static spinlock_t write_sockets_lock;
6405 +/* List of sockets which have connects pending */
6406 +static struct list_head state_sockets;
6407 +static spinlock_t state_sockets_lock;
6409 +/* List of allocated listen sockets */
6410 +static struct list_head listen_sockets;
6412 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6413 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6416 +/* Data available on socket or listen socket received a connect */
6417 +static void lowcomms_data_ready(struct sock *sk, int count_unused)
6419 + struct connection *con = sock2con(sk);
6421 + if (test_and_set_bit(CF_READ_PENDING, &con->flags))
6424 + spin_lock_bh(&read_sockets_lock);
6425 + list_add_tail(&con->read_list, &read_sockets);
6426 + spin_unlock_bh(&read_sockets_lock);
6428 + wake_up_interruptible(&lowcomms_recv_waitq);
6431 +static void lowcomms_write_space(struct sock *sk)
6433 + struct connection *con = sock2con(sk);
6435 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
6438 + spin_lock_bh(&write_sockets_lock);
6439 + list_add_tail(&con->write_list, &write_sockets);
6440 + spin_unlock_bh(&write_sockets_lock);
6442 + wake_up_interruptible(&lowcomms_send_waitq);
6445 +static inline void lowcomms_connect_sock(struct connection *con)
6447 + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
6449 + if (!atomic_read(&accepting))
6452 + spin_lock_bh(&state_sockets_lock);
6453 + list_add_tail(&con->state_list, &state_sockets);
6454 + spin_unlock_bh(&state_sockets_lock);
6456 + wake_up_interruptible(&lowcomms_send_waitq);
6459 +static void lowcomms_state_change(struct sock *sk)
6461 +/* struct connection *con = sock2con(sk); */
6463 + switch (sk->sk_state) {
6464 + case TCP_ESTABLISHED:
6465 + lowcomms_write_space(sk);
6468 + case TCP_FIN_WAIT1:
6469 + case TCP_FIN_WAIT2:
6470 + case TCP_TIME_WAIT:
6472 + case TCP_CLOSE_WAIT:
6473 + case TCP_LAST_ACK:
6475 + /* FIXME: I think this causes more trouble than it solves.
6476 + lowcomms will reconnect anyway when there is something to
6477 + send. This just attempts reconnection if a node goes down!
6479 + /* lowcomms_connect_sock(con); */
6483 + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
6488 +/* Make a socket active */
6489 +static int add_sock(struct socket *sock, struct connection *con)
6493 + /* Install a data_ready callback */
6494 + con->sock->sk->sk_data_ready = lowcomms_data_ready;
6495 + con->sock->sk->sk_write_space = lowcomms_write_space;
6496 + con->sock->sk->sk_state_change = lowcomms_state_change;
6501 +/* Add the port number to an IP6 or 4 sockaddr and return the address
6503 +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
6506 + saddr->sin6_family = local_addr.sin6_family;
6507 + if (local_addr.sin6_family == AF_INET) {
6508 + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
6509 + in4_addr->sin_port = cpu_to_be16(port);
6510 + *addr_len = sizeof(struct sockaddr_in);
6513 + saddr->sin6_port = cpu_to_be16(port);
6514 + *addr_len = sizeof(struct sockaddr_in6);
6518 +/* Close a remote connection and tidy up */
6519 +static void close_connection(struct connection *con)
6521 + if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6524 + down_write(&con->sock_sem);
6527 + sock_release(con->sock);
6529 + if (con->othersock) {
6530 + down_write(&con->othersock->sock_sem);
6531 + sock_release(con->othersock->sock);
6532 + con->othersock->sock = NULL;
6533 + up_write(&con->othersock->sock_sem);
6534 + kfree(con->othersock);
6535 + con->othersock = NULL;
6538 + if (con->rx_page) {
6539 + __free_page(con->rx_page);
6540 + con->rx_page = NULL;
6542 + up_write(&con->sock_sem);
6545 +/* Data received from remote end */
6546 +static int receive_from_sock(struct connection *con)
6549 + struct msghdr msg;
6550 + struct iovec iov[2];
6554 + int call_again_soon = 0;
6556 + down_read(&con->sock_sem);
6558 + if (con->sock == NULL)
6560 + if (con->rx_page == NULL) {
6562 + * This doesn't need to be atomic, but I think it should
6563 + * improve performance if it is.
6565 + con->rx_page = alloc_page(GFP_ATOMIC);
6566 + if (con->rx_page == NULL)
6568 + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
6571 + * To avoid doing too many short reads, we will reschedule for
6572 + * another time if there are less than 32 bytes left in the buffer.
6574 + if (!CBUF_MAY_ADD(&con->cb, 32))
6577 + msg.msg_control = NULL;
6578 + msg.msg_controllen = 0;
6579 + msg.msg_iovlen = 1;
6580 + msg.msg_iov = iov;
6581 + msg.msg_name = NULL;
6582 + msg.msg_namelen = 0;
6583 + msg.msg_flags = 0;
6586 + * iov[0] is the bit of the circular buffer between the current end
6587 + * point (cb.base + cb.len) and the end of the buffer.
6589 + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
6590 + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
6591 + iov[1].iov_len = 0;
6594 + * iov[1] is the bit of the circular buffer between the start of the
6595 + * buffer and the start of the currently used section (cb.base)
6597 + if (CBUF_DATA(&con->cb) >= con->cb.base) {
6598 + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
6599 + iov[1].iov_len = con->cb.base;
6600 + iov[1].iov_base = page_address(con->rx_page);
6601 + msg.msg_iovlen = 2;
6603 + len = iov[0].iov_len + iov[1].iov_len;
6607 + r = ret = sock_recvmsg(con->sock, &msg, len,
6608 + MSG_DONTWAIT | MSG_NOSIGNAL);
6614 + call_again_soon = 1;
6615 + CBUF_ADD(&con->cb, ret);
6616 + ret = midcomms_process_incoming_buffer(con->nodeid,
6617 + page_address(con->rx_page),
6618 + con->cb.base, con->cb.len,
6620 + if (ret == -EBADMSG) {
6621 + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
6622 + "iov_len=%u, iov_base[0]=%p, read=%d\n",
6623 + page_address(con->rx_page), con->cb.base, con->cb.len,
6624 + len, iov[0].iov_base, r);
6628 + CBUF_EAT(&con->cb, ret);
6630 + if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
6631 + __free_page(con->rx_page);
6632 + con->rx_page = NULL;
6635 + if (call_again_soon)
6637 + up_read(&con->sock_sem);
6642 + lowcomms_data_ready(con->sock->sk, 0);
6643 + up_read(&con->sock_sem);
6648 + up_read(&con->sock_sem);
6649 + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6650 + close_connection(con);
6651 + lowcomms_connect_sock(con);
6658 +/* Listening socket is busy, accept a connection */
6659 +static int accept_from_sock(struct connection *con)
6662 + struct sockaddr_in6 peeraddr;
6663 + struct socket *newsock;
6666 + struct connection *newcon;
6668 + memset(&peeraddr, 0, sizeof(peeraddr));
6669 + newsock = sock_alloc();
6673 + down_read(&con->sock_sem);
6675 + result = -ENOTCONN;
6676 + if (con->sock == NULL)
6679 + newsock->type = con->sock->type;
6680 + newsock->ops = con->sock->ops;
6682 + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
6686 + /* Get the connected socket's peer */
6687 + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
6689 + result = -ECONNABORTED;
6693 + /* Get the new node's NODEID */
6694 + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
6695 + if (nodeid == 0) {
6696 + printk("dlm: connect from non cluster node\n");
6697 + sock_release(newsock);
6698 + up_read(&con->sock_sem);
6702 + log_print("got connection from %d", nodeid);
6704 + /* Check to see if we already have a connection to this node. This
6705 + * could happen if the two nodes initiate a connection at roughly
6706 + * the same time and the connections cross on the wire.
6708 + * In this case we store the incoming one in "othersock"
6710 + newcon = nodeid2con(nodeid);
6711 + down_write(&newcon->sock_sem);
6712 + if (newcon->sock) {
6713 + struct connection *othercon;
6715 + othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
6717 + printk("dlm: failed to allocate incoming socket\n");
6718 + sock_release(newsock);
6719 + up_write(&newcon->sock_sem);
6720 + up_read(&con->sock_sem);
6723 + memset(othercon, 0, sizeof(*othercon));
6724 + newcon->othersock = othercon;
6725 + othercon->nodeid = nodeid;
6726 + othercon->sock = newsock;
6727 + othercon->rx_action = receive_from_sock;
6728 + add_sock(newsock, othercon);
6729 + init_rwsem(&othercon->sock_sem);
6730 + set_bit(CF_IS_OTHERSOCK, &othercon->flags);
6731 + newsock->sk->sk_user_data = othercon;
6733 + up_write(&newcon->sock_sem);
6734 + lowcomms_data_ready(newsock->sk, 0);
6735 + up_read(&con->sock_sem);
6739 + newsock->sk->sk_user_data = newcon;
6740 + newcon->rx_action = receive_from_sock;
6741 + add_sock(newsock, newcon);
6742 + up_write(&newcon->sock_sem);
6745 + * Add it to the active queue in case we got data
6746 + * between processing the accept and adding the socket
6747 + * to the read_sockets list
6749 + lowcomms_data_ready(newsock->sk, 0);
6751 + up_read(&con->sock_sem);
6757 + up_read(&con->sock_sem);
6758 + sock_release(newsock);
6760 + printk("dlm: error accepting connection from node: %d\n", result);
6764 +/* Connect a new socket to its peer */
6765 +static int connect_to_sock(struct connection *con)
6767 + int result = -EHOSTUNREACH;
6768 + struct sockaddr_in6 saddr;
6770 + struct socket *sock;
6772 + if (con->nodeid == 0) {
6773 + log_print("attempt to connect sock 0 foiled");
6777 + down_write(&con->sock_sem);
6778 + if (con->retries++ > MAX_CONNECT_RETRIES)
6781 + // FIXME not sure this should happen, let alone like this.
6783 + sock_release(con->sock);
6787 + /* Create a socket to communicate with */
6788 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6792 + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
6795 + sock->sk->sk_user_data = con;
6796 + con->rx_action = receive_from_sock;
6798 + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
6800 + add_sock(sock, con);
6802 + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
6804 + if (result == -EINPROGRESS)
6810 + up_write(&con->sock_sem);
6812 + * Returning an error here means we've given up trying to connect to
6813 + * a remote node, otherwise we return 0 and reschedule the connection
6820 + sock_release(con->sock);
6824 + * Some errors are fatal and this list might need adjusting. For other
6825 + * errors we try again until the max number of retries is reached.
6827 + if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
6828 + result != -ENETDOWN && result != EINVAL
6829 + && result != -EPROTONOSUPPORT) {
6830 + lowcomms_connect_sock(con);
6836 +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
6838 + struct socket *sock = NULL;
6842 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
6844 + /* Create a socket to communicate with */
6845 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6847 + printk("dlm: Can't create listening comms socket\n");
6853 + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
6856 + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
6858 + sock->sk->sk_user_data = con;
6859 + con->rx_action = accept_from_sock;
6862 + /* Bind to our port */
6863 + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
6864 + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
6866 + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
6867 + sock_release(sock);
6875 + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
6878 + printk("dlm: Set keepalive failed: %d\n", result);
6881 + result = sock->ops->listen(sock, 5);
6883 + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
6884 + sock_release(sock);
6894 +/* Listen on all interfaces */
6895 +static int listen_for_all(void)
6899 + struct socket *sock = NULL;
6900 + struct list_head *addr_list;
6901 + struct connection *con = nodeid2con(0);
6902 + struct cluster_node_addr *node_addr;
6903 + char local_addr[sizeof(struct sockaddr_in6)];
6905 + /* This will also fill in local_addr */
6906 + nodeid = lowcomms_our_nodeid();
6908 + addr_list = kcl_get_node_addresses(nodeid);
6910 + printk("dlm: cannot initialise comms layer\n");
6911 + result = -ENOTCONN;
6915 + list_for_each_entry(node_addr, addr_list, list) {
6918 + con = kmalloc(sizeof(struct connection), GFP_KERNEL);
6920 + printk("dlm: failed to allocate listen socket\n");
6923 + memset(con, 0, sizeof(*con));
6924 + init_rwsem(&con->sock_sem);
6925 + spin_lock_init(&con->writequeue_lock);
6926 + INIT_LIST_HEAD(&con->writequeue);
6927 + set_bit(CF_IS_OTHERSOCK, &con->flags);
6930 + memcpy(local_addr, node_addr->addr, node_addr->addr_len);
6931 + sock = create_listen_sock(con, local_addr,
6932 + node_addr->addr_len);
6934 + add_sock(sock, con);
6940 + /* Keep a list of dynamically allocated listening sockets
6941 + so we can free them at shutdown */
6942 + if (test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6943 + list_add_tail(&con->listenlist, &listen_sockets);
6954 +static struct writequeue_entry *new_writequeue_entry(struct connection *con,
6957 + struct writequeue_entry *entry;
6959 + entry = kmalloc(sizeof(struct writequeue_entry), allocation);
6963 + entry->page = alloc_page(allocation);
6964 + if (!entry->page) {
6969 + entry->offset = 0;
6978 +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
6979 + int allocation, char **ppc)
6981 + struct connection *con = nodeid2con(nodeid);
6982 + struct writequeue_entry *e;
6986 + if (!atomic_read(&accepting))
6989 + spin_lock(&con->writequeue_lock);
6990 + e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
6991 + if (((struct list_head *) e == &con->writequeue) ||
6992 + (PAGE_CACHE_SIZE - e->end < len)) {
6997 + users = e->users++;
6999 + spin_unlock(&con->writequeue_lock);
7005 + *ppc = page_address(e->page) + offset;
7009 + e = new_writequeue_entry(con, allocation);
7011 + spin_lock(&con->writequeue_lock);
7014 + users = e->users++;
7015 + list_add_tail(&e->list, &con->writequeue);
7016 + spin_unlock(&con->writequeue_lock);
7017 + atomic_inc(&writequeue_length);
7023 +void lowcomms_commit_buffer(struct writequeue_entry *e)
7025 + struct connection *con = e->con;
7028 + if (!atomic_read(&accepting))
7031 + spin_lock(&con->writequeue_lock);
7032 + users = --e->users;
7035 + e->len = e->end - e->offset;
7037 + spin_unlock(&con->writequeue_lock);
7039 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7040 + spin_lock_bh(&write_sockets_lock);
7041 + list_add_tail(&con->write_list, &write_sockets);
7042 + spin_unlock_bh(&write_sockets_lock);
7044 + wake_up_interruptible(&lowcomms_send_waitq);
7049 + spin_unlock(&con->writequeue_lock);
7053 +static void free_entry(struct writequeue_entry *e)
7055 + __free_page(e->page);
7057 + atomic_dec(&writequeue_length);
7060 +/* Send a message */
7061 +static int send_to_sock(struct connection *con)
7064 + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7065 + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7066 + struct writequeue_entry *e;
7069 + down_read(&con->sock_sem);
7070 + if (con->sock == NULL)
7073 + sendpage = con->sock->ops->sendpage;
7075 + spin_lock(&con->writequeue_lock);
7077 + e = list_entry(con->writequeue.next, struct writequeue_entry,
7079 + if ((struct list_head *) e == &con->writequeue)
7083 + offset = e->offset;
7084 + BUG_ON(len == 0 && e->users == 0);
7085 + spin_unlock(&con->writequeue_lock);
7089 + ret = sendpage(con->sock, e->page, offset, len,
7091 + if (ret == -EAGAIN || ret == 0)
7097 + spin_lock(&con->writequeue_lock);
7101 + if (e->len == 0 && e->users == 0) {
7102 + list_del(&e->list);
7107 + spin_unlock(&con->writequeue_lock);
7109 + up_read(&con->sock_sem);
7113 + up_read(&con->sock_sem);
7114 + close_connection(con);
7115 + lowcomms_connect_sock(con);
7119 + up_read(&con->sock_sem);
7120 + lowcomms_connect_sock(con);
7124 +/* Called from recoverd when it knows that a node has
7125 + left the cluster */
7126 +int lowcomms_close(int nodeid)
7128 + struct connection *con;
7133 + con = nodeid2con(nodeid);
7135 + close_connection(con);
7143 +/* API send message call, may queue the request */
7144 +/* N.B. This is the old interface - use the new one for new calls */
7145 +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7147 + struct writequeue_entry *e;
7150 + GDLM_ASSERT(nodeid < dlm_config.max_connections,
7151 + printk("nodeid=%u\n", nodeid););
7153 + e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7155 + memcpy(b, buf, len);
7156 + lowcomms_commit_buffer(e);
7162 +/* Look for activity on active sockets */
7163 +static void process_sockets(void)
7165 + struct list_head *list;
7166 + struct list_head *temp;
7168 + spin_lock_bh(&read_sockets_lock);
7169 + list_for_each_safe(list, temp, &read_sockets) {
7170 + struct connection *con =
7171 + list_entry(list, struct connection, read_list);
7172 + list_del(&con->read_list);
7173 + clear_bit(CF_READ_PENDING, &con->flags);
7175 + spin_unlock_bh(&read_sockets_lock);
7177 + con->rx_action(con);
7179 + /* Don't starve out everyone else */
7181 + spin_lock_bh(&read_sockets_lock);
7183 + spin_unlock_bh(&read_sockets_lock);
7186 +/* Try to send any messages that are pending
7188 +static void process_output_queue(void)
7190 + struct list_head *list;
7191 + struct list_head *temp;
7194 + spin_lock_bh(&write_sockets_lock);
7195 + list_for_each_safe(list, temp, &write_sockets) {
7196 + struct connection *con =
7197 + list_entry(list, struct connection, write_list);
7198 + list_del(&con->write_list);
7199 + clear_bit(CF_WRITE_PENDING, &con->flags);
7201 + spin_unlock_bh(&write_sockets_lock);
7203 + ret = send_to_sock(con);
7206 + spin_lock_bh(&write_sockets_lock);
7208 + spin_unlock_bh(&write_sockets_lock);
7211 +static void process_state_queue(void)
7213 + struct list_head *list;
7214 + struct list_head *temp;
7217 + spin_lock_bh(&state_sockets_lock);
7218 + list_for_each_safe(list, temp, &state_sockets) {
7219 + struct connection *con =
7220 + list_entry(list, struct connection, state_list);
7221 + list_del(&con->state_list);
7222 + clear_bit(CF_CONNECT_PENDING, &con->flags);
7223 + spin_unlock_bh(&state_sockets_lock);
7225 + ret = connect_to_sock(con);
7228 + spin_lock_bh(&state_sockets_lock);
7230 + spin_unlock_bh(&state_sockets_lock);
7233 +/* Discard all entries on the write queues */
7234 +static void clean_writequeues(void)
7236 + struct list_head *list;
7237 + struct list_head *temp;
7240 + for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
7241 + struct connection *con = nodeid2con(nodeid);
7243 + spin_lock(&con->writequeue_lock);
7244 + list_for_each_safe(list, temp, &con->writequeue) {
7245 + struct writequeue_entry *e =
7246 + list_entry(list, struct writequeue_entry, list);
7247 + list_del(&e->list);
7250 + spin_unlock(&con->writequeue_lock);
7254 +static int read_list_empty(void)
7258 + spin_lock_bh(&read_sockets_lock);
7259 + status = list_empty(&read_sockets);
7260 + spin_unlock_bh(&read_sockets_lock);
7265 +/* DLM Transport comms receive daemon */
7266 +static int dlm_recvd(void *data)
7268 + daemonize("dlm_recvd");
7269 + atomic_set(&recv_run, 1);
7271 + init_waitqueue_head(&lowcomms_recv_waitq);
7272 + init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7273 + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7275 + complete(&thread_completion);
7277 + while (atomic_read(&recv_run)) {
7279 + set_task_state(current, TASK_INTERRUPTIBLE);
7281 + if (read_list_empty())
7284 + set_task_state(current, TASK_RUNNING);
7286 + process_sockets();
7289 + down(&thread_lock);
7292 + complete(&thread_completion);
7297 +static int write_and_state_lists_empty(void)
7301 + spin_lock_bh(&write_sockets_lock);
7302 + status = list_empty(&write_sockets);
7303 + spin_unlock_bh(&write_sockets_lock);
7305 + spin_lock_bh(&state_sockets_lock);
7306 + if (list_empty(&state_sockets) == 0)
7308 + spin_unlock_bh(&state_sockets_lock);
7313 +/* DLM Transport send daemon */
7314 +static int dlm_sendd(void *data)
7316 + daemonize("dlm_sendd");
7317 + atomic_set(&send_run, 1);
7319 + init_waitqueue_head(&lowcomms_send_waitq);
7320 + init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7321 + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7323 + complete(&thread_completion);
7325 + while (atomic_read(&send_run)) {
7327 + set_task_state(current, TASK_INTERRUPTIBLE);
7329 + if (write_and_state_lists_empty())
7332 + set_task_state(current, TASK_RUNNING);
7334 + process_state_queue();
7335 + process_output_queue();
7338 + down(&thread_lock);
7341 + complete(&thread_completion);
7346 +static void daemons_stop(void)
7348 + if (atomic_read(&recv_run)) {
7349 + down(&thread_lock);
7350 + atomic_set(&recv_run, 0);
7351 + wake_up_interruptible(&lowcomms_recv_waitq);
7353 + wait_for_completion(&thread_completion);
7356 + if (atomic_read(&send_run)) {
7357 + down(&thread_lock);
7358 + atomic_set(&send_run, 0);
7359 + wake_up_interruptible(&lowcomms_send_waitq);
7361 + wait_for_completion(&thread_completion);
7365 +static int daemons_start(void)
7369 + error = kernel_thread(dlm_recvd, NULL, 0);
7371 + log_print("can't start recvd thread: %d", error);
7374 + wait_for_completion(&thread_completion);
7376 + error = kernel_thread(dlm_sendd, NULL, 0);
7378 + log_print("can't start sendd thread: %d", error);
7382 + wait_for_completion(&thread_completion);
7390 + * Return the largest buffer size we can cope with.
7392 +int lowcomms_max_buffer_size(void)
7394 + return PAGE_CACHE_SIZE;
7397 +void lowcomms_stop(void)
7400 + struct connection *temp;
7401 + struct connection *lcon;
7403 + atomic_set(&accepting, 0);
7405 + /* Set all the activity flags to prevent any
7408 + for (i = 0; i < conn_array_size; i++) {
7409 + connections[i].flags = 0x7;
7412 + clean_writequeues();
7414 + for (i = 0; i < conn_array_size; i++) {
7415 + close_connection(nodeid2con(i));
7418 + kfree(connections);
7419 + connections = NULL;
7421 + /* Free up any dynamically allocated listening sockets */
7422 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
7423 + sock_release(lcon->sock);
7427 + kcl_releaseref_cluster();
7430 +/* This is quite likely to sleep... */
7431 +int lowcomms_start(void)
7436 + INIT_LIST_HEAD(&read_sockets);
7437 + INIT_LIST_HEAD(&write_sockets);
7438 + INIT_LIST_HEAD(&state_sockets);
7439 + INIT_LIST_HEAD(&listen_sockets);
7441 + spin_lock_init(&read_sockets_lock);
7442 + spin_lock_init(&write_sockets_lock);
7443 + spin_lock_init(&state_sockets_lock);
7445 + init_completion(&thread_completion);
7446 + init_MUTEX(&thread_lock);
7447 + atomic_set(&send_run, 0);
7448 + atomic_set(&recv_run, 0);
7450 + error = -ENOTCONN;
7451 + if (kcl_addref_cluster())
7455 + * Temporarily initialise the waitq head so that lowcomms_send_message
7456 + * doesn't crash if it gets called before the thread is fully
7459 + init_waitqueue_head(&lowcomms_send_waitq);
7463 + connections = kmalloc(sizeof(struct connection) *
7464 + dlm_config.max_connections, GFP_KERNEL);
7468 + memset(connections, 0,
7469 + sizeof(struct connection) * dlm_config.max_connections);
7470 + for (i = 0; i < dlm_config.max_connections; i++) {
7471 + connections[i].nodeid = i;
7472 + init_rwsem(&connections[i].sock_sem);
7473 + INIT_LIST_HEAD(&connections[i].writequeue);
7474 + spin_lock_init(&connections[i].writequeue_lock);
7476 + conn_array_size = dlm_config.max_connections;
7478 + /* Start listening */
7479 + error = listen_for_all();
7481 + goto fail_free_conn;
7483 + error = daemons_start();
7485 + goto fail_free_conn;
7487 + atomic_set(&accepting, 1);
7492 + kfree(connections);
7498 +/* Don't accept any more outgoing work */
7499 +void lowcomms_stop_accept()
7501 + atomic_set(&accepting, 0);
7504 +/* Cluster Manager interface functions for looking up
7505 + nodeids and IP addresses from each other
7508 +/* Return the IP address of a node given its NODEID */
7509 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
7511 + struct list_head *addrs;
7512 + struct cluster_node_addr *node_addr;
7513 + struct cluster_node_addr *current_addr = NULL;
7514 + struct sockaddr_in6 *saddr;
7518 + addrs = kcl_get_node_addresses(nodeid);
7522 + interface = kcl_get_current_interface();
7524 + /* Look for address number <interface> */
7525 + i=0; /* i/f numbers start at 1 */
7526 + list_for_each_entry(node_addr, addrs, list) {
7527 + if (interface == ++i) {
7528 + current_addr = node_addr;
7533 + /* If that failed then just use the first one */
7534 + if (!current_addr)
7535 + current_addr = (struct cluster_node_addr *)addrs->next;
7537 + saddr = (struct sockaddr_in6 *)current_addr->addr;
7539 + /* Extract the IP address */
7540 + if (saddr->sin6_family == AF_INET) {
7541 + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
7542 + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
7543 + ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
7546 + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
7547 + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
7553 +/* Return the NODEID for a node given its sockaddr */
7554 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
7556 + struct kcl_cluster_node node;
7557 + struct sockaddr_in6 ipv6_addr;
7558 + struct sockaddr_in ipv4_addr;
7560 + if (addr->sa_family == AF_INET) {
7561 + struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
7562 + memcpy(&ipv4_addr, &local_addr, addr_len);
7563 + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
7565 + addr = (struct sockaddr *)&ipv4_addr;
7568 + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
7569 + memcpy(&ipv6_addr, &local_addr, addr_len);
7570 + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
7572 + addr = (struct sockaddr *)&ipv6_addr;
7575 + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
7576 + return node.node_id;
7581 +int lowcomms_our_nodeid(void)
7583 + struct kcl_cluster_node node;
7584 + struct list_head *addrs;
7585 + struct cluster_node_addr *first_addr;
7586 + static int our_nodeid = 0;
7589 + return our_nodeid;
7591 + if (kcl_get_node_by_nodeid(0, &node) == -1)
7594 + our_nodeid = node.node_id;
7596 + /* Fill in the "template" structure */
7597 + addrs = kcl_get_node_addresses(our_nodeid);
7601 + first_addr = (struct cluster_node_addr *) addrs->next;
7602 + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
7604 + return node.node_id;
7607 + * Overrides for Emacs so that we follow Linus's tabbing style.
7608 + * Emacs will notice this stuff at the end of the file and automatically
7609 + * adjust the settings for this buffer only. This must remain at the end
7611 + * ---------------------------------------------------------------------------
7612 + * Local variables:
7613 + * c-file-style: "linux"
7616 diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
7617 --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
7618 +++ linux-patched/cluster/dlm/lowcomms.h 2004-06-25 18:31:07.000000000 +0800
7620 +/******************************************************************************
7621 +*******************************************************************************
7623 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7624 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7626 +** This copyrighted material is made available to anyone wishing to use,
7627 +** modify, copy, or redistribute it subject to the terms and conditions
7628 +** of the GNU General Public License v.2.
7630 +*******************************************************************************
7631 +******************************************************************************/
7633 +#ifndef __LOWCOMMS_DOT_H__
7634 +#define __LOWCOMMS_DOT_H__
7636 +/* The old interface */
7637 +int lowcomms_send_message(int csid, char *buf, int len, int allocation);
7639 +/* The new interface */
7640 +struct writequeue_entry;
7641 +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7642 + int allocation, char **ppc);
7643 +extern void lowcomms_commit_buffer(struct writequeue_entry *e);
7645 +int lowcomms_start(void);
7646 +void lowcomms_stop(void);
7647 +void lowcomms_stop_accept(void);
7648 +int lowcomms_close(int nodeid);
7649 +int lowcomms_max_buffer_size(void);
7651 +int lowcomms_our_nodeid(void);
7653 +#endif /* __LOWCOMMS_DOT_H__ */
7654 diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
7655 --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
7656 +++ linux-patched/cluster/dlm/main.c 2004-06-25 18:31:07.000000000 +0800
7658 +/******************************************************************************
7659 +*******************************************************************************
7661 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7662 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7664 +** This copyrighted material is made available to anyone wishing to use,
7665 +** modify, copy, or redistribute it subject to the terms and conditions
7666 +** of the GNU General Public License v.2.
7668 +*******************************************************************************
7669 +******************************************************************************/
7671 +#define EXPORT_SYMTAB
7673 +#include <linux/init.h>
7674 +#include <linux/proc_fs.h>
7675 +#include <linux/ctype.h>
7676 +#include <linux/seq_file.h>
7677 +#include <linux/module.h>
7678 +#include <net/sock.h>
7680 +#include <cluster/cnxman.h>
7682 +#include "dlm_internal.h"
7683 +#include "lockspace.h"
7684 +#include "recoverd.h"
7688 +#include "locking.h"
7689 +#include "config.h"
7690 +#include "memory.h"
7691 +#include "recover.h"
7692 +#include "lowcomms.h"
7694 +int dlm_device_init(void);
7695 +void dlm_device_exit(void);
7696 +void dlm_proc_init(void);
7697 +void dlm_proc_exit(void);
7700 +/* Cluster manager callbacks, we want to know if a node dies
7701 + N.B. this is independent of lockspace-specific event callbacks from SM */
7703 +static void cman_callback(kcl_callback_reason reason, long arg)
7705 + if (reason == DIED) {
7706 + lowcomms_close((int) arg);
7709 + /* This is unconditional, so do what we can to tidy up */
7710 + if (reason == LEAVING) {
7711 + dlm_emergency_shutdown();
7715 +int __init init_dlm(void)
7718 + dlm_lockspace_init();
7719 + dlm_recoverd_init();
7721 + dlm_device_init();
7722 + dlm_memory_init();
7723 + dlm_config_init();
7725 + kcl_add_callback(cman_callback);
7727 + printk("DLM %s (built %s %s) installed\n",
7728 + DLM_RELEASE_NAME, __DATE__, __TIME__);
7733 +void __exit exit_dlm(void)
7735 + kcl_remove_callback(cman_callback);
7737 + dlm_device_exit();
7738 + dlm_memory_exit();
7739 + dlm_config_exit();
7743 +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
7744 +MODULE_AUTHOR("Red Hat, Inc.");
7745 +MODULE_LICENSE("GPL");
7747 +module_init(init_dlm);
7748 +module_exit(exit_dlm);
7750 +EXPORT_SYMBOL(dlm_init);
7751 +EXPORT_SYMBOL(dlm_release);
7752 +EXPORT_SYMBOL(dlm_new_lockspace);
7753 +EXPORT_SYMBOL(dlm_release_lockspace);
7754 +EXPORT_SYMBOL(dlm_lock);
7755 +EXPORT_SYMBOL(dlm_unlock);
7756 diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
7757 --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
7758 +++ linux-patched/cluster/dlm/memory.c 2004-06-25 18:31:07.000000000 +0800
7760 +/******************************************************************************
7761 +*******************************************************************************
7763 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7764 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7766 +** This copyrighted material is made available to anyone wishing to use,
7767 +** modify, copy, or redistribute it subject to the terms and conditions
7768 +** of the GNU General Public License v.2.
7770 +*******************************************************************************
7771 +******************************************************************************/
7775 + * memory allocation routines
7779 +#include "dlm_internal.h"
7780 +#include "memory.h"
7781 +#include "config.h"
7783 +/* as the man says...Shouldn't this be in a header file somewhere? */
7784 +#define BYTES_PER_WORD sizeof(void *)
7786 +static kmem_cache_t *rsb_cache_small;
7787 +static kmem_cache_t *rsb_cache_large;
7788 +static kmem_cache_t *lkb_cache;
7789 +static kmem_cache_t *lvb_cache;
7790 +static kmem_cache_t *resdir_cache_large;
7791 +static kmem_cache_t *resdir_cache_small;
7793 +/* The thresholds above which we allocate large RSBs/resdatas rather than small
7794 + * ones. This must make the resultant structure end on a word boundary */
7795 +#define LARGE_RSB_NAME 28
7796 +#define LARGE_RES_NAME 28
7798 +int dlm_memory_init()
7800 + int ret = -ENOMEM;
7804 + kmem_cache_create("dlm_rsb(small)",
7805 + (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7806 + __alignof__(gd_res_t), 0, NULL, NULL);
7807 + if (!rsb_cache_small)
7811 + kmem_cache_create("dlm_rsb(large)",
7812 + sizeof(gd_res_t) + DLM_RESNAME_MAXLEN,
7813 + __alignof__(gd_res_t), 0, NULL, NULL);
7814 + if (!rsb_cache_large)
7815 + goto out_free_rsbs;
7817 + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t),
7818 + __alignof__(gd_lkb_t), 0, NULL, NULL);
7820 + goto out_free_rsbl;
7822 + resdir_cache_large =
7823 + kmem_cache_create("dlm_resdir(l)",
7824 + sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN,
7825 + __alignof__(gd_resdata_t), 0, NULL, NULL);
7826 + if (!resdir_cache_large)
7827 + goto out_free_lkb;
7829 + resdir_cache_small =
7830 + kmem_cache_create("dlm_resdir(s)",
7831 + (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7832 + __alignof__(gd_resdata_t), 0, NULL, NULL);
7833 + if (!resdir_cache_small)
7834 + goto out_free_resl;
7836 + /* LVB cache also holds ranges, so should be 64bit aligned */
7837 + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
7838 + __alignof__(uint64_t), 0, NULL, NULL);
7840 + goto out_free_ress;
7846 + kmem_cache_destroy(resdir_cache_small);
7849 + kmem_cache_destroy(resdir_cache_large);
7852 + kmem_cache_destroy(lkb_cache);
7855 + kmem_cache_destroy(rsb_cache_large);
7858 + kmem_cache_destroy(rsb_cache_small);
7864 +void dlm_memory_exit()
7866 + kmem_cache_destroy(rsb_cache_large);
7867 + kmem_cache_destroy(rsb_cache_small);
7868 + kmem_cache_destroy(lkb_cache);
7869 + kmem_cache_destroy(resdir_cache_small);
7870 + kmem_cache_destroy(resdir_cache_large);
7871 + kmem_cache_destroy(lvb_cache);
7874 +gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen)
7878 + GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7880 + if (namelen >= LARGE_RSB_NAME)
7881 + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
7883 + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
7886 + memset(r, 0, sizeof(gd_res_t) + namelen);
7891 +void free_rsb(gd_res_t *r)
7893 + int length = r->res_length;
7896 + memset(r, 0x55, sizeof(gd_res_t) + r->res_length);
7899 + if (length >= LARGE_RSB_NAME)
7900 + kmem_cache_free(rsb_cache_large, r);
7902 + kmem_cache_free(rsb_cache_small, r);
7905 +gd_lkb_t *allocate_lkb(gd_ls_t *ls)
7909 + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
7911 + memset(l, 0, sizeof(gd_lkb_t));
7916 +void free_lkb(gd_lkb_t *l)
7919 + memset(l, 0xAA, sizeof(gd_lkb_t));
7921 + kmem_cache_free(lkb_cache, l);
7924 +gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen)
7928 + GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7930 + if (namelen >= LARGE_RES_NAME)
7931 + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
7933 + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
7936 + memset(rd, 0, sizeof(gd_resdata_t));
7941 +void free_resdata(gd_resdata_t *rd)
7943 + if (rd->rd_length >= LARGE_RES_NAME)
7944 + kmem_cache_free(resdir_cache_large, rd);
7946 + kmem_cache_free(resdir_cache_small, rd);
7949 +char *allocate_lvb(gd_ls_t *ls)
7953 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7955 + memset(l, 0, DLM_LVB_LEN);
7960 +void free_lvb(char *l)
7962 + kmem_cache_free(lvb_cache, l);
7965 +/* Ranges are allocated from the LVB cache as they are the same size (4x64
7967 +uint64_t *allocate_range(gd_ls_t * ls)
7971 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7973 + memset(l, 0, DLM_LVB_LEN);
7978 +void free_range(uint64_t *l)
7980 + kmem_cache_free(lvb_cache, l);
7983 +gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls)
7987 + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
7989 + memset(rc, 0, dlm_config.buffer_size);
7994 +void free_rcom_buffer(gd_rcom_t *rc)
7998 diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
7999 --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
8000 +++ linux-patched/cluster/dlm/memory.h 2004-06-25 18:31:07.000000000 +0800
8002 +/******************************************************************************
8003 +*******************************************************************************
8005 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8006 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8008 +** This copyrighted material is made available to anyone wishing to use,
8009 +** modify, copy, or redistribute it subject to the terms and conditions
8010 +** of the GNU General Public License v.2.
8012 +*******************************************************************************
8013 +******************************************************************************/
8015 +#ifndef __MEMORY_DOT_H__
8016 +#define __MEMORY_DOT_H__
8018 +int dlm_memory_init(void);
8019 +void dlm_memory_exit(void);
8020 +gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen);
8021 +void free_rsb(gd_res_t * r);
8022 +gd_lkb_t *allocate_lkb(gd_ls_t * ls);
8023 +void free_lkb(gd_lkb_t * l);
8024 +gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen);
8025 +void free_resdata(gd_resdata_t * rd);
8026 +char *allocate_lvb(gd_ls_t * ls);
8027 +void free_lvb(char *l);
8028 +gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls);
8029 +void free_rcom_buffer(gd_rcom_t * rc);
8030 +uint64_t *allocate_range(gd_ls_t * ls);
8031 +void free_range(uint64_t * l);
8033 +#endif /* __MEMORY_DOT_H__ */
8034 diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8035 --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
8036 +++ linux-patched/cluster/dlm/midcomms.c 2004-06-25 18:31:07.000000000 +0800
8038 +/******************************************************************************
8039 +*******************************************************************************
8041 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8042 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8044 +** This copyrighted material is made available to anyone wishing to use,
8045 +** modify, copy, or redistribute it subject to the terms and conditions
8046 +** of the GNU General Public License v.2.
8048 +*******************************************************************************
8049 +******************************************************************************/
8054 + * This is the appallingly named "mid-level" comms layer.
8056 + * Its purpose is to take packets from the "real" comms layer,
8057 + * split them up into messages and pass them to the interested
8058 + * part of the locking mechanism.
8060 + * It also takes messages from the locking layer, formats them
8061 + * into packets and sends them to the comms layer.
8063 + * It knows the format of the mid-level messages used and nodeids
8064 + * but it does not know how to resolve a nodeid into an IP address
8065 + * or any of the comms channel details
8069 +#include "dlm_internal.h"
8070 +#include "lowcomms.h"
8071 +#include "midcomms.h"
8072 +#include "lockqueue.h"
8074 +#include "reccomms.h"
8075 +#include "config.h"
8077 +/* Byteorder routines */
8079 +static void host_to_network(void *msg)
8081 + struct gd_req_header *head = msg;
8082 + struct gd_remlockrequest *req = msg;
8083 + struct gd_remlockreply *reply = msg;
8084 + struct gd_remquery *query = msg;
8085 + struct gd_remqueryreply *queryrep = msg;
8086 + gd_rcom_t *rc = msg;
8088 + /* Force into network byte order */
8091 + * Do the common header first
8094 + head->rh_length = cpu_to_le16(head->rh_length);
8095 + head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8096 + /* Leave the lkid alone as it is transparent at the remote end */
8099 + * Do the fields in the remlockrequest or remlockreply structs
8102 + switch (req->rr_header.rh_cmd) {
8104 + case GDLM_REMCMD_LOCKREQUEST:
8105 + case GDLM_REMCMD_CONVREQUEST:
8106 + req->rr_range_start = cpu_to_le64(req->rr_range_start);
8107 + req->rr_range_end = cpu_to_le64(req->rr_range_end);
8108 + /* Deliberate fall through */
8109 + case GDLM_REMCMD_UNLOCKREQUEST:
8110 + case GDLM_REMCMD_LOOKUP:
8111 + case GDLM_REMCMD_LOCKGRANT:
8112 + case GDLM_REMCMD_SENDBAST:
8113 + case GDLM_REMCMD_SENDCAST:
8114 + case GDLM_REMCMD_REM_RESDATA:
8115 + req->rr_flags = cpu_to_le32(req->rr_flags);
8116 + req->rr_status = cpu_to_le32(req->rr_status);
8119 + case GDLM_REMCMD_LOCKREPLY:
8120 + reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate);
8121 + reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid);
8122 + reply->rl_status = cpu_to_le32(reply->rl_status);
8125 + case GDLM_REMCMD_RECOVERMESSAGE:
8126 + case GDLM_REMCMD_RECOVERREPLY:
8127 + rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8128 + rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8131 + case GDLM_REMCMD_QUERY:
8132 + query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid);
8133 + query->rq_query = cpu_to_le32(query->rq_query);
8134 + query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks);
8137 + case GDLM_REMCMD_QUERYREPLY:
8138 + queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks);
8139 + queryrep->rq_status = cpu_to_le32(queryrep->rq_status);
8140 + queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount);
8141 + queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount);
8142 + queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount);
8146 + printk("dlm: warning, unknown REMCMD type %u\n",
8147 + req->rr_header.rh_cmd);
8151 +static void network_to_host(void *msg)
8153 + struct gd_req_header *head = msg;
8154 + struct gd_remlockrequest *req = msg;
8155 + struct gd_remlockreply *reply = msg;
8156 + struct gd_remquery *query = msg;
8157 + struct gd_remqueryreply *queryrep = msg;
8158 + gd_rcom_t *rc = msg;
8160 + /* Force into host byte order */
8163 + * Do the common header first
8166 + head->rh_length = le16_to_cpu(head->rh_length);
8167 + head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8168 + /* Leave the lkid alone as it is transparent at the remote end */
8171 + * Do the fields in the remlockrequest or remlockreply structs
8174 + switch (req->rr_header.rh_cmd) {
8176 + case GDLM_REMCMD_LOCKREQUEST:
8177 + case GDLM_REMCMD_CONVREQUEST:
8178 + req->rr_range_start = le64_to_cpu(req->rr_range_start);
8179 + req->rr_range_end = le64_to_cpu(req->rr_range_end);
8180 + case GDLM_REMCMD_LOOKUP:
8181 + case GDLM_REMCMD_UNLOCKREQUEST:
8182 + case GDLM_REMCMD_LOCKGRANT:
8183 + case GDLM_REMCMD_SENDBAST:
8184 + case GDLM_REMCMD_SENDCAST:
8185 + case GDLM_REMCMD_REM_RESDATA:
8186 + /* Actually, not much to do here as the remote lock IDs are
8187 + * transparent too */
8188 + req->rr_flags = le32_to_cpu(req->rr_flags);
8189 + req->rr_status = le32_to_cpu(req->rr_status);
8192 + case GDLM_REMCMD_LOCKREPLY:
8193 + reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate);
8194 + reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid);
8195 + reply->rl_status = le32_to_cpu(reply->rl_status);
8198 + case GDLM_REMCMD_RECOVERMESSAGE:
8199 + case GDLM_REMCMD_RECOVERREPLY:
8200 + rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8201 + rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8205 + case GDLM_REMCMD_QUERY:
8206 + query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid);
8207 + query->rq_query = le32_to_cpu(query->rq_query);
8208 + query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks);
8211 + case GDLM_REMCMD_QUERYREPLY:
8212 + queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks);
8213 + queryrep->rq_status = le32_to_cpu(queryrep->rq_status);
8214 + queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount);
8215 + queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount);
8216 + queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount);
8220 + printk("dlm: warning, unknown REMCMD type %u\n",
8221 + req->rr_header.rh_cmd);
8225 +static void copy_from_cb(void *dst, const void *base, unsigned offset,
8226 + unsigned len, unsigned limit)
8228 + unsigned copy = len;
8230 + if ((copy + offset) > limit)
8231 + copy = limit - offset;
8232 + memcpy(dst, base + offset, copy);
8235 + memcpy(dst + copy, base, len);
8238 +static void khexdump(const unsigned char *c, int len)
8240 + while (len > 16) {
8242 + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8243 + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8244 + c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8248 + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8253 + printk(KERN_INFO "%02x\n", c[0]);
8259 + * Called from the low-level comms layer to process a buffer of
8262 + * Only complete messages are processed here, any "spare" bytes from
8263 + * the end of a buffer are saved and tacked onto the front of the next
8264 + * message that comes in. I doubt this will happen very often but we
8265 + * need to be able to cope with it and I don't want the task to be waiting
8266 + * for packets to come in when there is useful work to be done.
8269 +int midcomms_process_incoming_buffer(int nodeid, const void *base,
8270 + unsigned offset, unsigned len,
8273 + unsigned char __tmp[sizeof(struct gd_req_header) + 64];
8274 + struct gd_req_header *msg = (struct gd_req_header *) __tmp;
8280 + while (len > sizeof(struct gd_req_header)) {
8281 + /* Get message header and check it over */
8282 + copy_from_cb(msg, base, offset, sizeof(struct gd_req_header),
8284 + msglen = le16_to_cpu(msg->rh_length);
8285 + id = msg->rh_lkid;
8286 + space = msg->rh_lockspace;
8288 + /* Check message size */
8290 + if (msglen < sizeof(struct gd_req_header))
8293 + if (msglen > dlm_config.buffer_size) {
8294 + printk("dlm: message size too big %d\n", msglen);
8299 + /* Not enough in buffer yet? wait for some more */
8303 + /* Make sure our temp buffer is large enough */
8304 + if (msglen > sizeof(__tmp) &&
8305 + msg == (struct gd_req_header *) __tmp) {
8306 + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8311 + copy_from_cb(msg, base, offset, msglen, limit);
8312 + BUG_ON(id != msg->rh_lkid);
8313 + BUG_ON(space != msg->rh_lockspace);
8316 + offset &= (limit - 1);
8318 + network_to_host(msg);
8320 + if ((msg->rh_cmd > 32) ||
8321 + (msg->rh_cmd == 0) ||
8322 + (msg->rh_length < sizeof(struct gd_req_header)) ||
8323 + (msg->rh_length > dlm_config.buffer_size)) {
8325 + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8326 + "lkid=%u, lockspace=%u\n",
8327 + msg->rh_cmd, msg->rh_flags, msg->rh_length,
8328 + msg->rh_lkid, msg->rh_lockspace);
8330 + printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8331 + "ret=%u, limit=%08x newbuf=%d\n",
8332 + base, offset, len, ret, limit,
8333 + ((struct gd_req_header *) __tmp == msg));
8335 + khexdump((const unsigned char *) msg, msg->rh_length);
8340 + switch (msg->rh_cmd) {
8341 + case GDLM_REMCMD_RECOVERMESSAGE:
8342 + case GDLM_REMCMD_RECOVERREPLY:
8343 + process_recovery_comm(nodeid, msg);
8346 + process_cluster_request(nodeid, msg, FALSE);
8350 + if (msg != (struct gd_req_header *) __tmp)
8353 + return err ? err : ret;
8357 + * Send a lowcomms buffer
8360 +void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e)
8362 + host_to_network(msg);
8363 + lowcomms_commit_buffer(e);
8367 + * Make the message into network byte order and send it
8370 +int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg,
8373 + int len = msg->rh_length;
8375 + host_to_network(msg);
8378 + * Loopback. In fact, the locking code pretty much prevents this from
8379 + * being needed but it can happen when the directory node is also the
8383 + if (nodeid == our_nodeid())
8384 + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
8387 + return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
8389 diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
8390 --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
8391 +++ linux-patched/cluster/dlm/midcomms.h 2004-06-25 18:31:07.000000000 +0800
8393 +/******************************************************************************
8394 +*******************************************************************************
8396 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8397 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8399 +** This copyrighted material is made available to anyone wishing to use,
8400 +** modify, copy, or redistribute it subject to the terms and conditions
8401 +** of the GNU General Public License v.2.
8403 +*******************************************************************************
8404 +******************************************************************************/
8406 +#ifndef __MIDCOMMS_DOT_H__
8407 +#define __MIDCOMMS_DOT_H__
8409 +int midcomms_send_message(uint32_t csid, struct gd_req_header *msg,
8411 +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
8412 + unsigned len, unsigned limit);
8413 +void midcomms_send_buffer(struct gd_req_header *msg,
8414 + struct writequeue_entry *e);
8416 +#endif /* __MIDCOMMS_DOT_H__ */
8417 diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
8418 --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
8419 +++ linux-patched/cluster/dlm/nodes.c 2004-06-25 18:31:07.000000000 +0800
8421 +/******************************************************************************
8422 +*******************************************************************************
8424 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8425 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8427 +** This copyrighted material is made available to anyone wishing to use,
8428 +** modify, copy, or redistribute it subject to the terms and conditions
8429 +** of the GNU General Public License v.2.
8431 +*******************************************************************************
8432 +******************************************************************************/
8434 +#include <net/sock.h>
8435 +#include <cluster/cnxman.h>
8437 +#include "dlm_internal.h"
8438 +#include "lowcomms.h"
8440 +#include "recover.h"
8441 +#include "reccomms.h"
8444 +static struct list_head cluster_nodes;
8445 +static spinlock_t node_lock;
8446 +static uint32_t local_nodeid;
8447 +static struct semaphore local_init_lock;
8450 +void dlm_nodes_init(void)
8452 + INIT_LIST_HEAD(&cluster_nodes);
8453 + spin_lock_init(&node_lock);
8455 + init_MUTEX(&local_init_lock);
8458 +static gd_node_t *search_node(uint32_t nodeid)
8462 + list_for_each_entry(node, &cluster_nodes, gn_list) {
8463 + if (node->gn_nodeid == nodeid)
8471 +static void put_node(gd_node_t *node)
8473 + spin_lock(&node_lock);
8474 + node->gn_refcount--;
8475 + if (node->gn_refcount == 0) {
8476 + list_del(&node->gn_list);
8477 + spin_unlock(&node_lock);
8481 + spin_unlock(&node_lock);
8484 +static int get_node(uint32_t nodeid, gd_node_t **ndp)
8486 + gd_node_t *node, *node2;
8487 + int error = -ENOMEM;
8489 + spin_lock(&node_lock);
8490 + node = search_node(nodeid);
8492 + node->gn_refcount++;
8493 + spin_unlock(&node_lock);
8498 + node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL);
8502 + memset(node, 0, sizeof(gd_node_t));
8503 + node->gn_nodeid = nodeid;
8505 + spin_lock(&node_lock);
8506 + node2 = search_node(nodeid);
8508 + node2->gn_refcount++;
8509 + spin_unlock(&node_lock);
8515 + node->gn_refcount = 1;
8516 + list_add_tail(&node->gn_list, &cluster_nodes);
8517 + spin_unlock(&node_lock);
8527 +int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb)
8531 + int error = -ENOMEM;
8533 + csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL);
8537 + memset(csb, 0, sizeof(gd_csb_t));
8539 + error = get_node(nodeid, &node);
8543 + csb->csb_node = node;
8545 + down(&local_init_lock);
8547 + if (!local_nodeid) {
8548 + if (nodeid == our_nodeid()) {
8549 + local_nodeid = node->gn_nodeid;
8552 + up(&local_init_lock);
8563 +void release_csb(gd_csb_t *csb)
8565 + put_node(csb->csb_node);
8569 +uint32_t our_nodeid(void)
8571 + return lowcomms_our_nodeid();
8574 +int nodes_reconfig_wait(gd_ls_t *ls)
8578 + if (ls->ls_low_nodeid == our_nodeid()) {
8579 + error = gdlm_wait_status_all(ls, NODES_VALID);
8581 + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
8583 + /* Experimental: this delay should allow any final messages
8584 + * from the previous node to be received before beginning
8587 + if (ls->ls_num_nodes == 1) {
8588 + current->state = TASK_UNINTERRUPTIBLE;
8589 + schedule_timeout((2) * HZ);
8593 + error = gdlm_wait_status_low(ls, NODES_ALL_VALID);
8598 +static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new)
8600 + gd_csb_t *csb = NULL;
8601 + struct list_head *tmp;
8602 + struct list_head *newlist = &new->csb_list;
8603 + struct list_head *head = &ls->ls_nodes;
8605 + list_for_each(tmp, head) {
8606 + csb = list_entry(tmp, gd_csb_t, csb_list);
8608 + if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid)
8613 + list_add_tail(newlist, head);
8615 + /* FIXME: can use list macro here */
8616 + newlist->prev = tmp->prev;
8617 + newlist->next = tmp;
8618 + tmp->prev->next = newlist;
8619 + tmp->prev = newlist;
8623 +int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out)
8625 + gd_csb_t *csb, *safe;
8626 + int error, i, found, pos = 0, neg = 0;
8627 + uint32_t low = (uint32_t) (-1);
8630 + * Remove (and save) departed nodes from lockspace's nodes list
8633 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) {
8635 + for (i = 0; i < gr->gr_node_count; i++) {
8636 + if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8644 + csb->csb_gone_event = gr->gr_event_id;
8645 + list_del(&csb->csb_list);
8646 + list_add_tail(&csb->csb_list, &ls->ls_nodes_gone);
8647 + ls->ls_num_nodes--;
8648 + log_all(ls, "remove node %u", csb->csb_node->gn_nodeid);
8653 + * Add new nodes to lockspace's nodes list
8656 + for (i = 0; i < gr->gr_node_count; i++) {
8658 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8659 + if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8668 + error = init_new_csb(gr->gr_nodeids[i], &csb);
8669 + GDLM_ASSERT(!error,);
8671 + add_ordered_node(ls, csb);
8672 + ls->ls_num_nodes++;
8673 + log_all(ls, "add node %u", csb->csb_node->gn_nodeid);
8677 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8678 + if (csb->csb_node->gn_nodeid < low)
8679 + low = csb->csb_node->gn_nodeid;
8682 + rcom_log_clear(ls);
8683 + ls->ls_low_nodeid = low;
8684 + ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8685 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8688 + error = nodes_reconfig_wait(ls);
8690 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
8695 +int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr)
8699 + uint32_t low = (uint32_t) (-1);
8701 + log_all(ls, "add nodes");
8703 + for (i = 0; i < gr->gr_node_count; i++) {
8704 + error = init_new_csb(gr->gr_nodeids[i], &csb);
8708 + add_ordered_node(ls, csb);
8709 + ls->ls_num_nodes++;
8711 + if (csb->csb_node->gn_nodeid < low)
8712 + low = csb->csb_node->gn_nodeid;
8715 + ls->ls_low_nodeid = low;
8716 + ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8717 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8719 + error = nodes_reconfig_wait(ls);
8721 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
8726 + while (!list_empty(&ls->ls_nodes)) {
8727 + csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list);
8728 + list_del(&csb->csb_list);
8731 + ls->ls_num_nodes = 0;
8736 +int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid)
8740 + list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) {
8741 + if (csb->csb_node->gn_nodeid == nodeid)
8746 diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
8747 --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
8748 +++ linux-patched/cluster/dlm/nodes.h 2004-06-25 18:31:07.000000000 +0800
8750 +/******************************************************************************
8751 +*******************************************************************************
8753 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8754 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8756 +** This copyrighted material is made available to anyone wishing to use,
8757 +** modify, copy, or redistribute it subject to the terms and conditions
8758 +** of the GNU General Public License v.2.
8760 +*******************************************************************************
8761 +******************************************************************************/
8763 +#ifndef __NODES_DOT_H__
8764 +#define __NODES_DOT_H__
8766 +void dlm_nodes_init(void);
8767 +int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb);
8768 +void release_csb(gd_csb_t * csb);
8769 +uint32_t our_nodeid(void);
8770 +int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg);
8771 +int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr);
8772 +int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid);
8774 +#endif /* __NODES_DOT_H__ */
8775 diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
8776 --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
8777 +++ linux-patched/cluster/dlm/proc.c 2004-06-25 18:31:07.000000000 +0800
8779 +/******************************************************************************
8780 +*******************************************************************************
8782 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8783 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8785 +** This copyrighted material is made available to anyone wishing to use,
8786 +** modify, copy, or redistribute it subject to the terms and conditions
8787 +** of the GNU General Public License v.2.
8789 +*******************************************************************************
8790 +******************************************************************************/
8792 +#include <linux/init.h>
8793 +#include <linux/proc_fs.h>
8794 +#include <linux/ctype.h>
8795 +#include <linux/seq_file.h>
8796 +#include <linux/module.h>
8798 +#include "dlm_internal.h"
8799 +#include "lockspace.h"
8801 +#if defined(DLM_DEBUG)
8802 +#define DLM_DEBUG_SIZE (1024)
8803 +#define MAX_DEBUG_MSG_LEN (64)
8805 +#define DLM_DEBUG_SIZE (0)
8806 +#define MAX_DEBUG_MSG_LEN (0)
8809 +static char * debug_buf;
8810 +static unsigned int debug_size;
8811 +static unsigned int debug_point;
8812 +static int debug_wrap;
8813 +static spinlock_t debug_lock;
8814 +static struct proc_dir_entry * debug_proc_entry = NULL;
8815 +static struct proc_dir_entry * rcom_proc_entry = NULL;
8816 +static char proc_ls_name[255] = "";
8818 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
8819 +static struct proc_dir_entry * locks_proc_entry = NULL;
8820 +static struct seq_operations locks_info_op;
8823 +static int locks_open(struct inode *inode, struct file *file)
8825 + return seq_open(file, &locks_info_op);
8828 +/* Write simply sets the lockspace to use */
8829 +static ssize_t locks_write(struct file *file, const char *buf,
8830 + size_t count, loff_t * ppos)
8832 + if (count < sizeof(proc_ls_name)) {
8833 + copy_from_user(proc_ls_name, buf, count);
8834 + proc_ls_name[count] = '\0';
8836 + /* Remove any trailing LF so that lazy users
8837 + can just echo "lsname" > /proc/cluster/dlm_locks */
8838 + if (proc_ls_name[count - 1] == '\n')
8839 + proc_ls_name[count - 1] = '\0';
8846 +static struct file_operations locks_fops = {
8848 + write:locks_write,
8851 + release:seq_release,
8854 +struct ls_dumpinfo {
8856 + struct list_head *next;
8861 +static int print_resource(gd_res_t * res, struct seq_file *s);
8863 +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
8865 + read_lock(&di->ls->ls_reshash_lock);
8867 + /* Find the next non-empty hash bucket */
8868 + while (list_empty(&di->ls->ls_reshashtbl[di->entry]) &&
8869 + di->entry < di->ls->ls_hashsize) {
8872 + if (di->entry >= di->ls->ls_hashsize) {
8873 + read_unlock(&di->ls->ls_reshash_lock);
8874 + return NULL; /* End of hash list */
8877 + di->next = di->ls->ls_reshashtbl[di->entry].next;
8878 + } else { /* Find the next entry in the list */
8880 + di->next = di->next->next;
8881 + if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) {
8882 + /* End of list - move to next bucket */
8885 + read_unlock(&di->ls->ls_reshash_lock);
8887 + return next_rsb(di); /* do the top half of this conditional */
8890 + di->rsb = list_entry(di->next, gd_res_t, res_hashchain);
8891 + read_unlock(&di->ls->ls_reshash_lock);
8896 +static void *s_start(struct seq_file *m, loff_t * pos)
8898 + struct ls_dumpinfo *di;
8902 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
8906 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
8911 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
8917 + for (i = 0; i < *pos; i++)
8918 + if (next_rsb(di) == NULL)
8921 + return next_rsb(di);
8924 +static void *s_next(struct seq_file *m, void *p, loff_t * pos)
8926 + struct ls_dumpinfo *di = p;
8930 + return next_rsb(di);
8933 +static int s_show(struct seq_file *m, void *p)
8935 + struct ls_dumpinfo *di = p;
8936 + return print_resource(di->rsb, m);
8939 +static void s_stop(struct seq_file *m, void *p)
8944 +static struct seq_operations locks_info_op = {
8951 +static char *print_lockmode(int mode)
8973 +static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res)
8976 + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
8978 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8979 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
8980 + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
8982 + if (lkb->lkb_range) {
8983 + /* This warns on Alpha. Tough. Only I see it */
8984 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8985 + || lkb->lkb_status == GDLM_LKSTS_GRANTED)
8986 + seq_printf(s, " %" PRIx64 "-%" PRIx64,
8987 + lkb->lkb_range[GR_RANGE_START],
8988 + lkb->lkb_range[GR_RANGE_END]);
8989 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8990 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
8991 + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
8992 + lkb->lkb_range[RQ_RANGE_START],
8993 + lkb->lkb_range[RQ_RANGE_END]);
8996 + if (lkb->lkb_nodeid) {
8997 + if (lkb->lkb_nodeid != res->res_nodeid)
8998 + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
9001 + seq_printf(s, " Master: %08x", lkb->lkb_remid);
9004 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9005 + seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
9007 + seq_printf(s, "\n");
9010 +static int print_resource(gd_res_t *res, struct seq_file *s)
9013 + struct list_head *locklist;
9015 + seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9016 + res->res_parent, res->res_length);
9017 + for (i = 0; i < res->res_length; i++) {
9018 + if (isprint(res->res_name[i]))
9019 + seq_printf(s, "%c", res->res_name[i]);
9021 + seq_printf(s, "%c", '.');
9023 + if (res->res_nodeid)
9024 + seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
9027 + seq_printf(s, "\" \nMaster Copy\n");
9029 + /* Print the LVB: */
9030 + if (res->res_lvbptr) {
9031 + seq_printf(s, "LVB: ");
9032 + for (i = 0; i < DLM_LVB_LEN; i++) {
9033 + if (i == DLM_LVB_LEN / 2)
9034 + seq_printf(s, "\n ");
9035 + seq_printf(s, "%02x ",
9036 + (unsigned char) res->res_lvbptr[i]);
9038 + seq_printf(s, "\n");
9041 + /* Print the locks attached to this resource */
9042 + seq_printf(s, "Granted Queue\n");
9043 + list_for_each(locklist, &res->res_grantqueue) {
9044 + gd_lkb_t *this_lkb =
9045 + list_entry(locklist, gd_lkb_t, lkb_statequeue);
9046 + print_lock(s, this_lkb, res);
9049 + seq_printf(s, "Conversion Queue\n");
9050 + list_for_each(locklist, &res->res_convertqueue) {
9051 + gd_lkb_t *this_lkb =
9052 + list_entry(locklist, gd_lkb_t, lkb_statequeue);
9053 + print_lock(s, this_lkb, res);
9056 + seq_printf(s, "Waiting Queue\n");
9057 + list_for_each(locklist, &res->res_waitqueue) {
9058 + gd_lkb_t *this_lkb =
9059 + list_entry(locklist, gd_lkb_t, lkb_statequeue);
9060 + print_lock(s, this_lkb, res);
9064 +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9066 +void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...)
9069 + int i, n, size, len;
9070 + char buf[MAX_DEBUG_MSG_LEN+1];
9072 + spin_lock(&debug_lock);
9077 + size = MAX_DEBUG_MSG_LEN;
9078 + memset(buf, 0, size+1);
9080 + n = snprintf(buf, size, "%s ", ls->ls_name);
9083 + va_start(va, fmt);
9084 + vsnprintf(buf+n, size, fmt, va);
9087 + len = strlen(buf);
9088 + if (len > MAX_DEBUG_MSG_LEN-1)
9089 + len = MAX_DEBUG_MSG_LEN-1;
9091 + buf[len+1] = '\0';
9093 + for (i = 0; i < strlen(buf); i++) {
9094 + debug_buf[debug_point++] = buf[i];
9096 + if (debug_point == debug_size) {
9102 + spin_unlock(&debug_lock);
9105 +void dlm_debug_dump(void)
9109 + spin_lock(&debug_lock);
9111 + for (i = debug_point; i < debug_size; i++)
9112 + printk("%c", debug_buf[i]);
9114 + for (i = 0; i < debug_point; i++)
9115 + printk("%c", debug_buf[i]);
9116 + spin_unlock(&debug_lock);
9119 +void dlm_debug_setup(int size)
9123 + if (size > PAGE_SIZE)
9126 + b = kmalloc(size, GFP_KERNEL);
9128 + spin_lock(&debug_lock);
9133 + debug_size = size;
9137 + memset(debug_buf, 0, debug_size);
9139 + spin_unlock(&debug_lock);
9142 +static void dlm_debug_init(void)
9148 + spin_lock_init(&debug_lock);
9150 + dlm_debug_setup(DLM_DEBUG_SIZE);
9153 +#ifdef CONFIG_PROC_FS
9154 +int dlm_debug_info(char *b, char **start, off_t offset, int length)
9158 + spin_lock(&debug_lock);
9161 + for (i = debug_point; i < debug_size; i++)
9162 + n += sprintf(b + n, "%c", debug_buf[i]);
9164 + for (i = 0; i < debug_point; i++)
9165 + n += sprintf(b + n, "%c", debug_buf[i]);
9167 + spin_unlock(&debug_lock);
9172 +int dlm_rcom_info(char *b, char **start, off_t offset, int length)
9178 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9182 + n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
9183 + "names_recv_count names_recv_msgid "
9184 + "locks_send_count locks_send_msgid "
9185 + "locks_recv_count locks_recv_msgid\n");
9187 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
9188 + n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
9189 + csb->csb_node->gn_nodeid,
9190 + csb->csb_names_send_count,
9191 + csb->csb_names_send_msgid,
9192 + csb->csb_names_recv_count,
9193 + csb->csb_names_recv_msgid,
9194 + csb->csb_locks_send_count,
9195 + csb->csb_locks_send_msgid,
9196 + csb->csb_locks_recv_count,
9197 + csb->csb_locks_recv_msgid);
9203 +void dlm_proc_init(void)
9205 +#ifdef CONFIG_PROC_FS
9206 + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
9208 + if (!debug_proc_entry)
9211 + debug_proc_entry->get_info = &dlm_debug_info;
9213 + rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
9214 + if (!rcom_proc_entry)
9217 + rcom_proc_entry->get_info = &dlm_rcom_info;
9221 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9222 + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
9224 + NULL, NULL, NULL);
9225 + if (!locks_proc_entry)
9227 + locks_proc_entry->proc_fops = &locks_fops;
9231 +void dlm_proc_exit(void)
9233 +#ifdef CONFIG_PROC_FS
9234 + if (debug_proc_entry) {
9235 + remove_proc_entry("cluster/dlm_debug", NULL);
9236 + dlm_debug_setup(0);
9239 + if (rcom_proc_entry)
9240 + remove_proc_entry("cluster/dlm_rcom", NULL);
9243 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9244 + if (locks_proc_entry)
9245 + remove_proc_entry("cluster/dlm_locks", NULL);
9248 diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
9249 --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
9250 +++ linux-patched/cluster/dlm/queries.c 2004-06-25 18:31:07.000000000 +0800
9252 +/******************************************************************************
9253 +*******************************************************************************
9255 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9256 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9258 +** This copyrighted material is made available to anyone wishing to use,
9259 +** modify, copy, or redistribute it subject to the terms and conditions
9260 +** of the GNU General Public License v.2.
9262 +*******************************************************************************
9263 +******************************************************************************/
9268 + * This file provides the kernel query interface to the DLM.
9272 +#define EXPORT_SYMTAB
9273 +#include <linux/module.h>
9275 +#include "dlm_internal.h"
9276 +#include "lockqueue.h"
9277 +#include "locking.h"
9282 +#include "memory.h"
9283 +#include "lowcomms.h"
9284 +#include "midcomms.h"
9287 +static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo);
9288 +static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo);
9291 + * API entry point.
9293 +int dlm_query(void *lockspace,
9294 + struct dlm_lksb *lksb,
9296 + struct dlm_queryinfo *qinfo,
9297 + void (ast_routine(void *)),
9300 + int status = -EINVAL;
9301 + gd_lkb_t *target_lkb;
9302 + gd_lkb_t *query_lkb = NULL; /* Our temporary LKB */
9303 + gd_ls_t *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace);
9315 + if (!qinfo->gqi_lockinfo)
9316 + qinfo->gqi_locksize = 0;
9318 + /* Find the lkid */
9319 + target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
9323 + /* If the user wants a list of locks that are blocking or
9324 + not blocking this lock, then it must be waiting
9327 + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
9328 + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
9329 + target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
9332 + /* We now allocate an LKB for our own use (so we can hang
9333 + * things like the AST routine and the lksb from it) */
9334 + lksb->sb_status = -EBUSY;
9335 + query_lkb = create_lkb(ls);
9340 + query_lkb->lkb_astaddr = ast_routine;
9341 + query_lkb->lkb_astparam = (long)astarg;
9342 + query_lkb->lkb_resource = target_lkb->lkb_resource;
9343 + query_lkb->lkb_lksb = lksb;
9345 + /* Don't free the resource while we are querying it. This ref
9346 + * will be dropped when the LKB is freed */
9347 + hold_rsb(query_lkb->lkb_resource);
9349 + /* Fill in the stuff that's always local */
9350 + if (qinfo->gqi_resinfo) {
9351 + if (target_lkb->lkb_resource->res_nodeid)
9352 + qinfo->gqi_resinfo->rsi_masternode =
9353 + target_lkb->lkb_resource->res_nodeid;
9355 + qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
9356 + qinfo->gqi_resinfo->rsi_length =
9357 + target_lkb->lkb_resource->res_length;
9358 + memcpy(qinfo->gqi_resinfo->rsi_name,
9359 + target_lkb->lkb_resource->res_name,
9360 + qinfo->gqi_resinfo->rsi_length);
9363 + /* If the master is local (or the user doesn't want the overhead of a
9364 + * remote call) - fill in the details here */
9365 + if (target_lkb->lkb_resource->res_nodeid == 0 ||
9366 + (query & DLM_QUERY_LOCAL)) {
9369 + /* Resource info */
9370 + if (qinfo->gqi_resinfo) {
9371 + query_resource(target_lkb->lkb_resource,
9372 + qinfo->gqi_resinfo);
9376 + if (qinfo->gqi_lockinfo) {
9377 + status = query_locks(query, target_lkb, qinfo);
9380 + query_lkb->lkb_retstatus = status;
9381 + query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
9382 + queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
9385 + /* An AST will be delivered so we must return success here */
9390 + /* Remote master */
9391 + if (target_lkb->lkb_resource->res_nodeid != 0)
9393 + struct gd_remquery *remquery;
9394 + struct writequeue_entry *e;
9396 + /* Clear this cos the receiving end adds to it with
9397 + each incoming packet */
9398 + qinfo->gqi_lockcount = 0;
9400 + /* Squirrel a pointer to the query info struct
9401 + somewhere illegal */
9402 + query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo;
9404 + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
9405 + sizeof(struct gd_remquery),
9406 + ls->ls_allocation,
9407 + (char **) &remquery);
9409 + status = -ENOBUFS;
9413 + /* Build remote packet */
9414 + memset(remquery, 0, sizeof(struct gd_remquery));
9416 + remquery->rq_maxlocks = qinfo->gqi_locksize;
9417 + remquery->rq_query = query;
9418 + remquery->rq_mstlkid = target_lkb->lkb_remid;
9419 + if (qinfo->gqi_lockinfo)
9420 + remquery->rq_maxlocks = qinfo->gqi_locksize;
9422 + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
9423 + remquery->rq_header.rh_flags = 0;
9424 + remquery->rq_header.rh_length = sizeof(struct gd_remquery);
9425 + remquery->rq_header.rh_lkid = query_lkb->lkb_id;
9426 + remquery->rq_header.rh_lockspace = ls->ls_global_id;
9428 + midcomms_send_buffer(&remquery->rq_header, e);
9437 +static inline int valid_range(struct dlm_range *r)
9439 + if (r->ra_start != 0ULL ||
9440 + r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
9446 +static void put_int(int x, char *buf, int *offp)
9448 + x = cpu_to_le32(x);
9449 + memcpy(buf + *offp, &x, sizeof(int));
9450 + *offp += sizeof(int);
9453 +static void put_int64(uint64_t x, char *buf, int *offp)
9455 + x = cpu_to_le64(x);
9456 + memcpy(buf + *offp, &x, sizeof(uint64_t));
9457 + *offp += sizeof(uint64_t);
9460 +static int get_int(char *buf, int *offp)
9463 + memcpy(&value, buf + *offp, sizeof(int));
9464 + *offp += sizeof(int);
9465 + return le32_to_cpu(value);
9468 +static uint64_t get_int64(char *buf, int *offp)
9472 + memcpy(&value, buf + *offp, sizeof(uint64_t));
9473 + *offp += sizeof(uint64_t);
9474 + return le64_to_cpu(value);
9477 +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
9479 +/* Called from recvd to get lock info for a remote node */
9480 +int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9482 + struct gd_remquery *query = (struct gd_remquery *) msg;
9483 + struct gd_remqueryreply *reply;
9484 + struct dlm_resinfo resinfo;
9485 + struct dlm_queryinfo qinfo;
9486 + struct writequeue_entry *e;
9493 + int start_lock = 0;
9495 + lkb = find_lock_by_id(ls, query->rq_mstlkid);
9501 + qinfo.gqi_resinfo = &resinfo;
9502 + qinfo.gqi_locksize = query->rq_maxlocks;
9504 + /* Get the resource bits */
9505 + query_resource(lkb->lkb_resource, &resinfo);
9507 + /* Now get the locks if wanted */
9508 + if (query->rq_maxlocks) {
9509 + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
9511 + if (!qinfo.gqi_lockinfo) {
9516 + status = query_locks(query->rq_query, lkb, &qinfo);
9517 + if (status && status != -E2BIG) {
9518 + kfree(qinfo.gqi_lockinfo);
9523 + qinfo.gqi_lockinfo = NULL;
9524 + qinfo.gqi_lockcount = 0;
9527 + /* Send as many blocks as needed for all the locks */
9530 + int msg_len = sizeof(struct gd_remqueryreply);
9531 + int last_msg_len = msg_len; /* keeps compiler quiet */
9534 + /* First work out how many locks we can fit into a block */
9535 + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
9537 + last_msg_len = msg_len;
9539 + msg_len += LOCK_LEN;
9540 + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
9541 + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
9543 + msg_len += sizeof(uint64_t) * 4;
9547 + /* There must be a neater way of doing this... */
9548 + if (msg_len > PAGE_SIZE) {
9550 + msg_len = last_msg_len;
9556 + e = lowcomms_get_buffer(nodeid,
9558 + ls->ls_allocation,
9559 + (char **) &reply);
9561 + kfree(qinfo.gqi_lockinfo);
9562 + status = -ENOBUFS;
9566 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9567 + reply->rq_header.rh_length = msg_len;
9568 + reply->rq_header.rh_lkid = msg->rh_lkid;
9569 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
9571 + reply->rq_status = status;
9572 + reply->rq_startlock = cur_lock;
9573 + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
9574 + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
9575 + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
9576 + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
9578 + buf = (char *)reply;
9579 + bufidx = sizeof(struct gd_remqueryreply);
9581 + for (; cur_lock < last_lock; cur_lock++) {
9583 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
9584 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
9585 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
9586 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
9587 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
9588 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
9589 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
9591 + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
9592 + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
9594 + buf[bufidx++] = 1;
9595 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
9596 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
9597 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
9598 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
9601 + buf[bufidx++] = 0;
9605 + if (cur_lock == qinfo.gqi_lockcount) {
9606 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
9610 + reply->rq_header.rh_flags = 0;
9613 + reply->rq_numlocks = cur_lock - start_lock;
9614 + start_lock = cur_lock;
9616 + midcomms_send_buffer(&reply->rq_header, e);
9617 + } while (!finished);
9619 + kfree(qinfo.gqi_lockinfo);
9624 + e = lowcomms_get_buffer(nodeid,
9625 + sizeof(struct gd_remqueryreply),
9626 + ls->ls_allocation,
9627 + (char **) &reply);
9629 + status = -ENOBUFS;
9632 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9633 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
9634 + reply->rq_header.rh_length = sizeof(struct gd_remqueryreply);
9635 + reply->rq_header.rh_lkid = msg->rh_lkid;
9636 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
9637 + reply->rq_status = status;
9638 + reply->rq_numlocks = 0;
9639 + reply->rq_startlock = 0;
9640 + reply->rq_grantcount = 0;
9641 + reply->rq_convcount = 0;
9642 + reply->rq_waitcount = 0;
9644 + midcomms_send_buffer(&reply->rq_header, e);
9649 +/* Reply to a remote query */
9650 +int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9652 + gd_lkb_t *query_lkb;
9653 + struct dlm_queryinfo *qinfo;
9654 + struct gd_remqueryreply *reply;
9659 + query_lkb = find_lock_by_id(ls, msg->rh_lkid);
9663 + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
9664 + reply = (struct gd_remqueryreply *) msg;
9666 + /* Copy the easy bits first */
9667 + qinfo->gqi_lockcount += reply->rq_numlocks;
9668 + if (qinfo->gqi_resinfo) {
9669 + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
9670 + qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
9671 + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
9672 + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
9676 + /* Now unpack the locks */
9677 + bufidx = sizeof(struct gd_remqueryreply);
9678 + buf = (char *) msg;
9680 + GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
9681 + printk("start = %d, num + %d. Max= %d\n",
9682 + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
9684 + for (i = reply->rq_startlock;
9685 + i < reply->rq_startlock + reply->rq_numlocks; i++) {
9686 + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
9687 + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
9688 + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
9689 + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
9690 + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
9691 + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
9692 + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
9693 + if (buf[bufidx++]) {
9694 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
9695 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
9696 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
9697 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
9700 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
9701 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9702 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
9703 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9707 + /* If this was the last block then now tell the user */
9708 + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
9709 + query_lkb->lkb_retstatus = reply->rq_status;
9710 + query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
9711 + queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
9718 +/* Aggregate resource information */
9719 +static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo)
9721 + struct list_head *tmp;
9724 + if (rsb->res_lvbptr)
9725 + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
9727 + resinfo->rsi_grantcount = 0;
9728 + list_for_each(tmp, &rsb->res_grantqueue) {
9729 + resinfo->rsi_grantcount++;
9732 + resinfo->rsi_waitcount = 0;
9733 + list_for_each(tmp, &rsb->res_waitqueue) {
9734 + resinfo->rsi_waitcount++;
9737 + resinfo->rsi_convcount = 0;
9738 + list_for_each(tmp, &rsb->res_convertqueue) {
9739 + resinfo->rsi_convcount++;
9745 +static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9749 + /* Don't fill it in if the buffer is full */
9750 + if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
9753 + /* gqi_lockcount contains the number of locks we have returned */
9754 + entry = qinfo->gqi_lockcount++;
9756 + /* Fun with master copies */
9757 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
9758 + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
9759 + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
9762 + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
9763 + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
9766 + /* Also make sure we always have a valid nodeid in there, the
9767 + calling end may not know which node "0" is */
9768 + if (lkb->lkb_nodeid)
9769 + qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
9771 + qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
9773 + if (lkb->lkb_parent)
9774 + qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
9776 + qinfo->gqi_lockinfo[entry].lki_parent = 0;
9778 + qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
9779 + qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
9780 + qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
9782 + if (lkb->lkb_range) {
9783 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
9784 + lkb->lkb_range[GR_RANGE_START];
9785 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
9786 + lkb->lkb_range[GR_RANGE_END];
9787 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
9788 + lkb->lkb_range[RQ_RANGE_START];
9789 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
9790 + lkb->lkb_range[RQ_RANGE_END];
9792 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
9793 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0xffffffffffffffffULL;
9794 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
9795 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0xffffffffffffffffULL;
9800 +static int query_lkb_queue(struct list_head *queue, int query,
9801 + struct dlm_queryinfo *qinfo)
9803 + struct list_head *tmp;
9805 + int mode = query & DLM_QUERY_MODE_MASK;
9807 + list_for_each(tmp, queue) {
9808 + gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9811 + if (query & DLM_QUERY_RQMODE)
9812 + lkmode = lkb->lkb_rqmode;
9814 + lkmode = lkb->lkb_grmode;
9816 + /* Add the LKB info to the list if it matches the criteria in
9817 + * the query bitmap */
9818 + switch (query & DLM_QUERY_MASK) {
9819 + case DLM_QUERY_LOCKS_ALL:
9820 + status = add_lock(lkb, qinfo);
9823 + case DLM_QUERY_LOCKS_HIGHER:
9824 + if (lkmode > mode)
9825 + status = add_lock(lkb, qinfo);
9828 + case DLM_QUERY_LOCKS_EQUAL:
9829 + if (lkmode == mode)
9830 + status = add_lock(lkb, qinfo);
9833 + case DLM_QUERY_LOCKS_LOWER:
9834 + if (lkmode < mode)
9835 + status = add_lock(lkb, qinfo);
9843 + * Return 1 if the locks' ranges overlap
9844 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
9846 +static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
9848 + if (!lkb1->lkb_range || !lkb2->lkb_range)
9851 + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
9852 + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
9857 +extern const int __dlm_compat_matrix[8][8];
9860 +static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9862 + struct list_head *tmp;
9865 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9866 + gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9868 + if (ranges_overlap(lkb, qlkb) &&
9869 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
9870 + status = add_lock(lkb, qinfo);
9876 +static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9878 + struct list_head *tmp;
9881 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9882 + gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9884 + if (!(ranges_overlap(lkb, qlkb) &&
9885 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
9886 + status = add_lock(lkb, qinfo);
9892 +/* Gather a list of appropriate locks */
9893 +static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9898 + /* Mask in the actual granted/requsted mode of the lock if LOCK_THIS
9899 + * was requested as the mode
9901 + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
9902 + query &= ~DLM_QUERY_MODE_MASK;
9903 + if (query & DLM_QUERY_RQMODE)
9904 + query |= lkb->lkb_rqmode;
9906 + query |= lkb->lkb_grmode;
9909 + qinfo->gqi_lockcount = 0;
9911 + /* BLOCKING/NOTBLOCK only look at the granted queue */
9912 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
9913 + return get_blocking_locks(lkb, qinfo);
9915 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
9916 + return get_nonblocking_locks(lkb, qinfo);
9918 + /* Do the lock queues that were requested */
9919 + if (query & DLM_QUERY_QUEUE_GRANT) {
9920 + status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
9924 + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
9925 + status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
9929 + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
9930 + status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
9938 +EXPORT_SYMBOL(dlm_query);
9940 + * Overrides for Emacs so that we follow Linus's tabbing style.
9941 + * Emacs will notice this stuff at the end of the file and automatically
9942 + * adjust the settings for this buffer only. This must remain at the end
9944 + * ---------------------------------------------------------------------------
9945 + * Local variables:
9946 + * c-file-style: "linux"
9949 diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
9950 --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
9951 +++ linux-patched/cluster/dlm/queries.h 2004-06-25 18:31:07.000000000 +0800
9953 +/******************************************************************************
9954 +*******************************************************************************
9956 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9957 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9959 +** This copyrighted material is made available to anyone wishing to use,
9960 +** modify, copy, or redistribute it subject to the terms and conditions
9961 +** of the GNU General Public License v.2.
9963 +*******************************************************************************
9964 +******************************************************************************/
9966 +#ifndef __QUERIES_DOT_H__
9967 +#define __QUERIES_DOT_H__
9969 +extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9970 +extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9972 +#endif /* __QUERIES_DOT_H__ */
9973 diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
9974 --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
9975 +++ linux-patched/cluster/dlm/rebuild.c 2004-06-25 18:31:07.000000000 +0800
9977 +/******************************************************************************
9978 +*******************************************************************************
9980 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9981 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9983 +** This copyrighted material is made available to anyone wishing to use,
9984 +** modify, copy, or redistribute it subject to the terms and conditions
9985 +** of the GNU General Public License v.2.
9987 +*******************************************************************************
9988 +******************************************************************************/
9991 + * Rebuild RSB's on new masters. Functions for transferring locks and
9992 + * subresources to new RSB masters during recovery.
9995 +#include "dlm_internal.h"
9996 +#include "reccomms.h"
10000 +#include "config.h"
10001 +#include "memory.h"
10002 +#include "recover.h"
10005 +/* Types of entity serialised in remastering messages */
10006 +#define REMASTER_ROOTRSB 1
10007 +#define REMASTER_RSB 2
10008 +#define REMASTER_LKB 3
10010 +struct rcom_fill {
10011 + char * outbuf; /* Beginning of data */
10012 + int offset; /* Current offset into outbuf */
10013 + int maxlen; /* Max value of offset */
10017 + gd_res_t * subrsb;
10019 + struct list_head * lkbqueue;
10022 +typedef struct rcom_fill rcom_fill_t;
10025 +struct rebuild_node {
10026 + struct list_head list;
10028 + gd_res_t * rootrsb;
10030 +typedef struct rebuild_node rebuild_node_t;
10034 + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10035 + * master. The rsb will be "done" with recovery when the new master has
10036 + * replied with all the new remote lockid's for this rsb's lkb's.
10039 +void expect_new_lkids(gd_res_t *rsb)
10041 + rsb->res_newlkid_expect = 0;
10042 + recover_list_add(rsb);
10046 + * This function is called on root rsb or subrsb when another lkb is being sent
10047 + * to the new master for which we expect to receive a corresponding remote lkid
10050 +void need_new_lkid(gd_res_t *rsb)
10052 + gd_res_t *root = rsb;
10054 + if (rsb->res_parent)
10055 + root = rsb->res_root;
10057 + if (!root->res_newlkid_expect)
10058 + recover_list_add(root);
10060 + GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10062 + root->res_newlkid_expect++;
10066 + * This function is called for each lkb for which a new remote lkid is
10067 + * received. Decrement the expected number of remote lkids expected for the
10071 +void have_new_lkid(gd_lkb_t *lkb)
10073 + gd_res_t *root = lkb->lkb_resource;
10075 + if (root->res_parent)
10076 + root = root->res_root;
10078 + down_write(&root->res_lock);
10080 + GDLM_ASSERT(root->res_newlkid_expect,
10081 + printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10083 + root->res_newlkid_expect--;
10085 + if (!root->res_newlkid_expect) {
10086 + clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10087 + recover_list_del(root);
10089 + up_write(&root->res_lock);
10093 + * Return the rebuild struct for a node - will create an entry on the rootrsb
10094 + * list if necessary.
10096 + * Currently no locking is needed here as it all happens in the gdlm_recvd
10100 +static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid)
10102 + rebuild_node_t *node = NULL;
10104 + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10105 + if (node->nodeid == nodeid)
10109 + /* Not found, add one */
10110 + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10114 + node->nodeid = nodeid;
10115 + node->rootrsb = NULL;
10116 + list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10122 + * Tidy up after a rebuild run. Called when all recovery has finished
10125 +void rebuild_freemem(gd_ls_t *ls)
10127 + rebuild_node_t *node = NULL, *s;
10129 + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10130 + list_del(&node->list);
10135 +static void put_int(int x, char *buf, int *offp)
10137 + x = cpu_to_le32(x);
10138 + memcpy(buf + *offp, &x, sizeof(int));
10139 + *offp += sizeof(int);
10142 +static void put_int64(uint64_t x, char *buf, int *offp)
10144 + x = cpu_to_le64(x);
10145 + memcpy(buf + *offp, &x, sizeof(uint64_t));
10146 + *offp += sizeof(uint64_t);
10149 +static void put_bytes(char *x, int len, char *buf, int *offp)
10151 + put_int(len, buf, offp);
10152 + memcpy(buf + *offp, x, len);
10156 +static void put_char(char x, char *buf, int *offp)
10162 +static int get_int(char *buf, int *offp)
10165 + memcpy(&value, buf + *offp, sizeof(int));
10166 + *offp += sizeof(int);
10167 + return le32_to_cpu(value);
10170 +static uint64_t get_int64(char *buf, int *offp)
10174 + memcpy(&value, buf + *offp, sizeof(uint64_t));
10175 + *offp += sizeof(uint64_t);
10176 + return le64_to_cpu(value);
10179 +static char get_char(char *buf, int *offp)
10181 + char x = buf[*offp];
10187 +static void get_bytes(char *bytes, int *len, char *buf, int *offp)
10189 + *len = get_int(buf, offp);
10190 + memcpy(bytes, buf + *offp, *len);
10194 +static int lkb_length(gd_lkb_t *lkb)
10198 + len += sizeof(int); /* lkb_id */
10199 +	len += sizeof(int);	/* lkb_resource->res_remasterid */
10200 + len += sizeof(int); /* lkb_flags */
10201 + len += sizeof(int); /* lkb_status */
10202 + len += sizeof(char); /* lkb_rqmode */
10203 + len += sizeof(char); /* lkb_grmode */
10204 + len += sizeof(int); /* lkb_childcnt */
10205 + len += sizeof(int); /* lkb_parent->lkb_id */
10206 + len += sizeof(int); /* lkb_bastaddr */
10208 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10209 + len += sizeof(int); /* number of lvb bytes */
10210 + len += DLM_LVB_LEN;
10213 + if (lkb->lkb_range) {
10214 + len += sizeof(uint64_t);
10215 + len += sizeof(uint64_t);
10216 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
10217 + len += sizeof(uint64_t);
10218 + len += sizeof(uint64_t);
10226 + * It's up to the caller to be sure there's enough space in the buffer.
10229 +static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp)
10233 + /* Need to tell the remote end if we have a range */
10234 + flags = lkb->lkb_flags;
10235 + if (lkb->lkb_range)
10236 + flags |= GDLM_LKFLG_RANGE;
10239 + * See lkb_length()
10240 + * Total: 30 (no lvb) or 66 (with lvb) bytes
10243 + put_int(lkb->lkb_id, buf, offp);
10244 + put_int(lkb->lkb_resource->res_remasterid, buf, offp);
10245 + put_int(flags, buf, offp);
10246 + put_int(lkb->lkb_status, buf, offp);
10247 + put_char(lkb->lkb_rqmode, buf, offp);
10248 + put_char(lkb->lkb_grmode, buf, offp);
10249 + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
10251 + if (lkb->lkb_parent)
10252 + put_int(lkb->lkb_parent->lkb_id, buf, offp);
10254 + put_int(0, buf, offp);
10256 + if (lkb->lkb_bastaddr)
10257 + put_int(1, buf, offp);
10259 + put_int(0, buf, offp);
10261 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10262 + GDLM_ASSERT(lkb->lkb_lvbptr,);
10263 + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
10266 + /* Only send the range we actually need */
10267 + if (lkb->lkb_range) {
10268 + switch (lkb->lkb_status) {
10269 + case GDLM_LKSTS_CONVERT:
10270 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10271 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10272 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10273 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10275 + case GDLM_LKSTS_WAITING:
10276 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10277 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10279 + case GDLM_LKSTS_GRANTED:
10280 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10281 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10289 +static int rsb_length(gd_res_t *rsb)
10293 + len += sizeof(int); /* number of res_name bytes */
10294 + len += rsb->res_length; /* res_name */
10295 + len += sizeof(int); /* res_remasterid */
10296 + len += sizeof(int); /* res_parent->res_remasterid */
10301 +static inline gd_res_t *next_subrsb(gd_res_t *subrsb)
10303 + struct list_head *tmp;
10306 + tmp = subrsb->res_subreslist.next;
10307 + r = list_entry(tmp, gd_res_t, res_subreslist);
10312 +static inline int last_in_list(gd_res_t *r, struct list_head *head)
10314 + gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist);
10322 + * Used to decide if an rsb should be rebuilt on a new master. An rsb only
10323 + * needs to be rebuild if we have lkb's queued on it. NOREBUILD lkb's on the
10324 + * wait queue are not rebuilt.
10327 +static int lkbs_to_remaster(gd_res_t *r)
10332 + if (!list_empty(&r->res_grantqueue) ||
10333 + !list_empty(&r->res_convertqueue))
10336 + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
10337 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10342 + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
10343 + if (!list_empty(&sub->res_grantqueue) ||
10344 + !list_empty(&sub->res_convertqueue))
10347 + list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
10348 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10357 +static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp)
10360 + * See rsb_length()
10361 + * Total: 36 bytes (4 + 24 + 4 + 4)
10364 + put_bytes(rsb->res_name, rsb->res_length, buf, offp);
10365 + put_int(rsb->res_remasterid, buf, offp);
10367 + if (rsb->res_parent)
10368 + put_int(rsb->res_parent->res_remasterid, buf, offp);
10370 + put_int(0, buf, offp);
10372 + GDLM_ASSERT(!rsb->res_lvbptr,);
10376 + * Flatten an LKB into a buffer for sending to the new RSB master. As a
10377 + * side-effect the nodeid of the lock is set to the nodeid of the new RSB
10381 +static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill)
10383 + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
10386 + lkb->lkb_nodeid = r->res_nodeid;
10388 + put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
10389 + serialise_lkb(lkb, fill->outbuf, &fill->offset);
10392 + need_new_lkid(r);
10400 + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
10403 +static int pack_lkb_queue(gd_res_t *r, struct list_head *queue,
10404 + rcom_fill_t *fill)
10409 + list_for_each_entry(lkb, queue, lkb_statequeue) {
10410 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10413 + error = pack_one_lkb(r, lkb, fill);
10422 + fill->lkbqueue = queue;
10427 +static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill)
10431 + error = pack_lkb_queue(r, &r->res_grantqueue, fill);
10435 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10439 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10446 + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
10447 + * queue and full lkb queues.
10450 +static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill)
10452 + struct list_head *tmp, *start, *end;
10457 + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
10460 + error = pack_one_lkb(r, fill->lkb, fill);
10464 + start = fill->lkb->lkb_statequeue.next;
10465 + end = fill->lkbqueue;
10467 + for (tmp = start; tmp != end; tmp = tmp->next) {
10468 + lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
10470 + error = pack_one_lkb(r, lkb, fill);
10478 + * Pack all lkb's on r's queues following fill->lkbqueue.
10481 + if (fill->lkbqueue == &r->res_waitqueue)
10483 + if (fill->lkbqueue == &r->res_convertqueue)
10486 + GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
10488 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10492 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10498 +static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill)
10502 + down_write(&subrsb->res_lock);
10504 + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
10507 + subrsb->res_nodeid = rsb->res_nodeid;
10508 + subrsb->res_remasterid = ++fill->remasterid;
10510 + put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
10511 + serialise_rsb(subrsb, fill->outbuf, &fill->offset);
10513 + error = pack_lkb_queues(subrsb, fill);
10517 + up_write(&subrsb->res_lock);
10522 + up_write(&subrsb->res_lock);
10523 + fill->subrsb = subrsb;
10528 +static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill)
10530 + gd_res_t *subrsb;
10534 + * When an initial subrsb is given, we know it needs to be packed.
10535 + * When no initial subrsb is given, begin with the first (if any exist).
10538 + if (!in_subrsb) {
10539 + if (list_empty(&rsb->res_subreslist))
10542 + subrsb = list_entry(rsb->res_subreslist.next, gd_res_t,
10545 + subrsb = in_subrsb;
10548 + error = pack_one_subrsb(rsb, subrsb, fill);
10552 + if (last_in_list(subrsb, &rsb->res_subreslist))
10555 + subrsb = next_subrsb(subrsb);
10563 + * Finish packing whatever is left in an rsb tree. If space runs out while
10564 + * finishing, save subrsb/lkb and this will be called again for the same rsb.
10566 + * !subrsb && lkb, we left off part way through root rsb's lkbs.
10567 + * subrsb && !lkb, we left off just before starting a new subrsb.
10568 + * subrsb && lkb, we left off part way through a subrsb's lkbs.
10569 + * !subrsb && !lkb, we shouldn't be in this function, but starting
10570 + * a new rsb in pack_rsb_tree().
10573 +static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb,
10574 + rcom_fill_t *fill)
10576 + gd_res_t *subrsb = NULL;
10579 + if (!fill->subrsb && fill->lkb) {
10580 + error = pack_lkb_remaining(rsb, fill);
10584 + error = pack_subrsbs(rsb, NULL, fill);
10589 + else if (fill->subrsb && !fill->lkb) {
10590 + error = pack_subrsbs(rsb, fill->subrsb, fill);
10595 + else if (fill->subrsb && fill->lkb) {
10596 + error = pack_lkb_remaining(fill->subrsb, fill);
10600 + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
10603 + subrsb = next_subrsb(fill->subrsb);
10605 + error = pack_subrsbs(rsb, subrsb, fill);
10610 + fill->subrsb = NULL;
10611 + fill->lkb = NULL;
10618 + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
10619 + * buffer. When the buffer runs out of space, save the place to restart (the
10620 + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
10623 +static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill)
10625 + int error = -ENOSPC;
10627 + fill->remasterid = 0;
10630 + * Pack the root rsb itself. A 1 byte type precedes the serialised
10631 + * rsb. Then pack the lkb's for the root rsb.
10634 + down_write(&rsb->res_lock);
10636 + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
10639 + rsb->res_remasterid = ++fill->remasterid;
10640 + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
10641 + serialise_rsb(rsb, fill->outbuf, &fill->offset);
10643 + error = pack_lkb_queues(rsb, fill);
10647 + up_write(&rsb->res_lock);
10650 + * Pack subrsb/lkb's under the root rsb.
10653 + error = pack_subrsbs(rsb, NULL, fill);
10658 + up_write(&rsb->res_lock);
10663 + * Given an RSB, return the next RSB that should be sent to a new master.
10666 +static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb)
10668 + struct list_head *tmp, *start, *end;
10672 + start = ls->ls_rootres.next;
10674 + start = rsb->res_rootlist.next;
10676 + end = &ls->ls_rootres;
10678 + for (tmp = start; tmp != end; tmp = tmp->next) {
10679 + r = list_entry(tmp, gd_res_t, res_rootlist);
10681 + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
10682 + if (r->res_nodeid && lkbs_to_remaster(r)) {
10683 + expect_new_lkids(r);
10686 + clear_bit(RESFL_NEW_MASTER, &r->res_flags);
10694 + * Given an rcom buffer, fill it with RSB's that need to be sent to a single
10695 + * new master node. In the case where all the data to send to one node
10696 + * requires multiple messages, this function needs to resume filling each
10697 + * successive buffer from the point where it left off when the previous buffer
10701 +static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid)
10703 + gd_res_t *rsb, *prev_rsb = fill->rsb;
10706 + fill->offset = 0;
10711 + * The first time this function is called.
10714 + rsb = next_remastered_rsb(ls, NULL);
10718 + } else if (fill->subrsb || fill->lkb) {
10721 + * Continue packing an rsb tree that was partially packed last
10722 + * time (fill->subrsb/lkb indicates where packing of last block
10727 + *nodeid = rsb->res_nodeid;
10729 + error = pack_rsb_tree_remaining(ls, rsb, fill);
10730 + if (error == -ENOSPC)
10733 + rsb = next_remastered_rsb(ls, prev_rsb);
10737 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
10744 + * Pack rsb trees into the buffer until we run out of space, run out of
10745 + * new rsb's or hit a new nodeid.
10748 + *nodeid = rsb->res_nodeid;
10751 + error = pack_rsb_tree(ls, rsb, fill);
10752 + if (error == -ENOSPC)
10757 + rsb = next_remastered_rsb(ls, prev_rsb);
10761 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
10775 + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
10778 +int rebuild_rsbs_send(gd_ls_t *ls)
10781 + rcom_fill_t fill;
10785 + GDLM_ASSERT(recover_list_empty(ls),);
10787 + log_all(ls, "rebuild locks");
10790 + rc = allocate_rcom_buffer(ls);
10795 + memset(&fill, 0, sizeof(rcom_fill_t));
10796 + fill.outbuf = rc->rc_buf;
10797 + fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
10800 + fill_rcom_buffer(ls, &fill, &nodeid);
10801 + if (!fill.offset)
10804 + rc->rc_datalen = fill.offset;
10805 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
10810 + error = gdlm_recovery_stopped(ls);
10814 + while (fill.more);
10816 + error = gdlm_wait_function(ls, &recover_list_empty);
10818 + log_all(ls, "rebuilt %d locks", fill.count);
10821 + rebuild_freemem(ls);
10822 + free_rcom_buffer(rc);
10828 +static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid,
10829 + gd_res_t *rootrsb)
10833 + GDLM_ASSERT(rootrsb,);
10835 + if (rootrsb->res_remasterid == remasterid) {
10840 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10841 + if (rsb->res_remasterid == remasterid)
10851 + * Search a queue for the given remote lock id (remlkid).
10854 +static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid,
10859 + list_for_each_entry(lkb, statequeue, lkb_statequeue) {
10860 + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
10869 + * Given a remote lock ID (and a parent resource), return the local LKB for it
10870 + * Hopefully we don't need to do this too often on deep lock trees.  This is
10871 + * VERY suboptimal for anything but the smallest lock trees. It searches the
10872 + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
10873 + * returns the LKB address. OPTIMISATION: we should keep a list of these while
10874 + * we are building up the remastered LKBs
10877 +static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid)
10882 + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
10886 + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
10890 + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
10894 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10895 + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
10899 + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
10903 + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
10914 + * Unpack an LKB from a remaster operation
10917 +static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb,
10918 + char *buf, int *ptr, char *outbuf, int *outoffp)
10922 + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
10924 + remote_lkid = get_int(buf, ptr);
10926 + rsb_rmid = get_int(buf, ptr);
10927 + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
10928 + GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
10931 + * We could have received this lkb already from a previous recovery
10932 + * that was interrupted. If so, just return the lkid to the remote
10935 + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
10939 + lkb = create_lkb(rsb->res_ls);
10943 + lkb->lkb_remid = remote_lkid;
10944 + lkb->lkb_flags = get_int(buf, ptr);
10945 + status = get_int(buf, ptr);
10946 + lkb->lkb_rqmode = get_char(buf, ptr);
10947 + lkb->lkb_grmode = get_char(buf, ptr);
10948 + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
10950 + parentid = get_int(buf, ptr);
10951 + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
10953 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10954 + lkb->lkb_lvbptr = allocate_lvb(ls);
10955 + if (!lkb->lkb_lvbptr)
10957 + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
10960 + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
10961 + uint64_t start, end;
10963 + /* Don't need to keep the range flag, for comms use only */
10964 + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
10965 + start = get_int64(buf, ptr);
10966 + end = get_int64(buf, ptr);
10968 + lkb->lkb_range = allocate_range(rsb->res_ls);
10969 + if (!lkb->lkb_range)
10972 + switch (status) {
10973 + case GDLM_LKSTS_CONVERT:
10974 + lkb->lkb_range[RQ_RANGE_START] = start;
10975 + lkb->lkb_range[RQ_RANGE_END] = end;
10976 + start = get_int64(buf, ptr);
10977 + end = get_int64(buf, ptr);
10978 + lkb->lkb_range[GR_RANGE_START] = start;
10979 + lkb->lkb_range[GR_RANGE_END] = end;
10981 + case GDLM_LKSTS_WAITING:
10982 + lkb->lkb_range[RQ_RANGE_START] = start;
10983 + lkb->lkb_range[RQ_RANGE_END] = end;
10986 + case GDLM_LKSTS_GRANTED:
10987 + lkb->lkb_range[GR_RANGE_START] = start;
10988 + lkb->lkb_range[GR_RANGE_END] = end;
10995 + /* Resolve local lock LKB address from parent ID */
10997 + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
11000 + atomic_inc(&rsb->res_ref);
11001 + lkb->lkb_resource = rsb;
11003 + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11004 + lkb->lkb_nodeid = rem_nodeid;
11007 + * Put the lkb on an RSB queue. An lkb that's in the midst of a
11008 + * conversion request (on the requesting node's lockqueue and has
11009 + * LQCONVERT set) should be put on the granted queue. The convert
11010 + * request will be resent by the requesting node.
11013 + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11014 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
11015 + GDLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11016 + printk("status=%d\n", status););
11017 + lkb->lkb_rqmode = DLM_LOCK_IV;
11018 + status = GDLM_LKSTS_GRANTED;
11021 + lkb_enqueue(rsb, lkb, status);
11024 + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11027 + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11028 + && lkb->lkb_grmode > DLM_LOCK_NL) {
11029 + if (!rsb->res_lvbptr)
11030 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
11031 + if (!rsb->res_lvbptr)
11033 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11037 + * Clear flags that may have been sent over that are only relevant in
11038 + * the context of the sender.
11041 + lkb->lkb_flags &= ~(GDLM_LKFLG_DELAST | GDLM_LKFLG_DELETED |
11042 + GDLM_LKFLG_LQRESEND | GDLM_LKFLG_NOREBUILD |
11043 + GDLM_LKFLG_DEMOTED);
11046 + /* Return the new LKID to the caller's buffer */
11047 + put_int(lkb->lkb_id, outbuf, outoffp);
11048 + put_int(lkb->lkb_remid, outbuf, outoffp);
11055 +static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb,
11056 + char *buf, int *ptr)
11060 + int parent_remasterid;
11061 + char name[DLM_RESNAME_MAXLEN];
11063 + gd_res_t *parent = NULL;
11066 + get_bytes(name, &length, buf, ptr);
11067 + remasterid = get_int(buf, ptr);
11068 + parent_remasterid = get_int(buf, ptr);
11070 + if (parent_remasterid)
11071 + parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11074 + * The rsb reference from this find_or_create_rsb() will keep the rsb
11075 + * around while we add new lkb's to it from deserialise_lkb. Each of
11076 + * the lkb's will add an rsb reference. The reference added here is
11077 + * removed by release_rsb() after all lkb's are added.
11080 + error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
11081 + GDLM_ASSERT(!error,);
11083 + /* There is a case where the above needs to create the RSB. */
11084 + if (rsb->res_nodeid == -1)
11085 + rsb->res_nodeid = our_nodeid();
11087 + rsb->res_remasterid = remasterid;
11093 + * Processing at the receiving end of a NEWLOCKS message from a node in
11094 + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11095 + * node whose locks we are now mastering. For a reply we need to send back the
11096 + * new lockids of the remastered locks so that remote ops can find them.
11099 +int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11102 + gd_res_t *rsb = NULL;
11103 + rebuild_node_t *rnode;
11105 + int outptr, ptr = 0, error = -ENOMEM;
11107 + rnode = find_rebuild_root(ls, nodeid);
11112 + * Allocate a buffer for the reply message which is a list of remote
11113 + * lock IDs and their (new) local lock ids. It will always be big
11114 + * enough to fit <n> ID pairs if it already fit <n> LKBs.
11117 + rc = allocate_rcom_buffer(ls);
11120 + outbuf = rc->rc_buf;
11124 + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
11125 + * created. Each deserialise_rsb adds an rsb reference that must be
11126 + * removed with release_rsb once all new lkb's for an rsb have been
11130 + while (ptr < len) {
11133 + type = get_char(buf, &ptr);
11136 + case REMASTER_ROOTRSB:
11138 + release_rsb(rsb);
11139 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11141 + rnode->rootrsb = rsb;
11144 + case REMASTER_RSB:
11146 + release_rsb(rsb);
11147 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11151 + case REMASTER_LKB:
11152 + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
11153 + outbuf, &outptr);
11157 + GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
11158 + "len=%d\n", type, nodeid, ptr,
11164 + release_rsb(rsb);
11167 + * Reply with the new lock IDs.
11170 + rc->rc_datalen = outptr;
11171 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
11173 + free_rcom_buffer(rc);
11180 + * Processing for a NEWLOCKIDS message. Called when we get the reply from the
11181 + * new master telling us what the new remote lock IDs are for the remastered
11185 +int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11192 + while (offset < len) {
11197 + if (offset + 8 > len) {
11198 + log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
11199 + "length nodeid=%d offset=%d len=%d",
11200 + nodeid, offset, len);
11204 + remote_id = get_int(buf, &offset);
11205 + local_id = get_int(buf, &offset);
11207 + lkb = find_lock_by_id(ls, local_id);
11209 + lkb->lkb_remid = remote_id;
11210 + have_new_lkid(lkb);
11212 + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
11213 + "nodeid=%d id=%x remid=%x offset=%d len=%d",
11214 + nodeid, local_id, remote_id, offset, len);
11218 + if (recover_list_empty(ls))
11219 + wake_up(&ls->ls_wait_general);
11223 diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
11224 --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
11225 +++ linux-patched/cluster/dlm/rebuild.h 2004-06-25 18:31:07.000000000 +0800
11227 +/******************************************************************************
11228 +*******************************************************************************
11230 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11231 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11233 +** This copyrighted material is made available to anyone wishing to use,
11234 +** modify, copy, or redistribute it subject to the terms and conditions
11235 +** of the GNU General Public License v.2.
11237 +*******************************************************************************
11238 +******************************************************************************/
11240 +#ifndef __REBUILD_DOT_H__
11241 +#define __REBUILD_DOT_H__
11243 +int rebuild_rsbs_send(gd_ls_t * ls);
11244 +int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
11245 +int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
11246 +void rebuild_freemem(gd_ls_t * ls);
11248 +#endif /* __REBUILD_DOT_H__ */
11249 diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
11250 --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
11251 +++ linux-patched/cluster/dlm/reccomms.c 2004-06-25 18:31:07.000000000 +0800
11253 +/******************************************************************************
11254 +*******************************************************************************
11256 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11257 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11259 +** This copyrighted material is made available to anyone wishing to use,
11260 +** modify, copy, or redistribute it subject to the terms and conditions
11261 +** of the GNU General Public License v.2.
11263 +*******************************************************************************
11264 +******************************************************************************/
11266 +#include "dlm_internal.h"
11267 +#include "lowcomms.h"
11268 +#include "midcomms.h"
11269 +#include "reccomms.h"
11270 +#include "nodes.h"
11271 +#include "lockspace.h"
11272 +#include "recover.h"
11274 +#include "config.h"
11275 +#include "rebuild.h"
11276 +#include "memory.h"
11278 +/* Running on the basis that only a single recovery communication will be done
11279 + * at a time per lockspace */
11281 +static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc);
11284 + * Track per-node progress/stats during recovery to help debugging.
11287 +void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send)
11292 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11293 + if (csb->csb_node->gn_nodeid == nodeid) {
11302 + if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
11304 + csb->csb_names_send_count++;
11305 + csb->csb_names_send_msgid = rc->rc_msgid;
11307 + csb->csb_names_recv_count++;
11308 + csb->csb_names_recv_msgid = rc->rc_msgid;
11310 + } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
11312 + csb->csb_locks_send_count++;
11313 + csb->csb_locks_send_msgid = rc->rc_msgid;
11315 + csb->csb_locks_recv_count++;
11316 + csb->csb_locks_recv_msgid = rc->rc_msgid;
11321 +void rcom_log_clear(gd_ls_t *ls)
11325 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11326 + csb->csb_names_send_count = 0;
11327 + csb->csb_names_send_msgid = 0;
11328 + csb->csb_names_recv_count = 0;
11329 + csb->csb_names_recv_msgid = 0;
11330 + csb->csb_locks_send_count = 0;
11331 + csb->csb_locks_send_msgid = 0;
11332 + csb->csb_locks_recv_count = 0;
11333 + csb->csb_locks_recv_msgid = 0;
11337 +static int rcom_response(gd_ls_t *ls)
11339 + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11343 + * rcom_send_message - send or request recovery data
11344 + * @ls: the lockspace
11345 + * @nodeid: node to which the message is sent
11346 + * @type: type of recovery message
11347 + * @rc: the rc buffer to send
11348 + * @need_reply: wait for reply if this is set
11350 + * Using this interface
11351 + * i) Allocate an rc buffer:
11352 + * rc = allocate_rcom_buffer(ls);
11353 + * ii) Copy data to send beginning at rc->rc_buf:
11354 + * memcpy(rc->rc_buf, mybuf, mylen);
11355 + * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
11356 + * rc->rc_datalen = mylen
11357 + * iv) Submit the rc to this function:
11358 + * rcom_send_message(rc);
11360 + * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t). If
11361 + * more data must be passed in one send, use rcom_expand_buffer() which
11362 + * incrementally increases the size of the rc buffer by dlm_config.buffer_size
11365 + * Any data returned for the message (when need_reply is set) will saved in
11366 + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
11367 + * number of bytes copied into rc->rc_buf.
11369 + * Returns: 0 on success, -EXXX on failure
11372 +int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc,
11377 + if (!rc->rc_datalen)
11378 + rc->rc_datalen = 1;
11381 + * Fill in the header.
11384 + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
11385 + rc->rc_header.rh_lockspace = ls->ls_global_id;
11386 + rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1;
11387 + rc->rc_subcmd = type;
11388 + rc->rc_msgid = ++ls->ls_rcom_msgid;
11390 + rcom_log(ls, nodeid, rc, 1);
11393 + * When a reply is received, the reply data goes back into this buffer.
11394 + * Synchronous rcom requests (need_reply=1) are serialised because of
11395 + * the single ls_rcom.
11398 + if (need_reply) {
11399 + down(&ls->ls_rcom_lock);
11400 + ls->ls_rcom = rc;
11404 + * After sending the message we'll wait at the end of this function to
11405 + * get a reply. The READY flag will be set when the reply has been
11406 + * received and requested data has been copied into
11407 + * ls->ls_rcom->rc_buf;
11410 + GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
11413 + * The WAIT bit indicates that we're waiting for and willing to accept a
11414 + * reply. Any replies are ignored unless this bit is set.
11417 + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11420 + * Process the message locally.
11423 + if (nodeid == our_nodeid()) {
11424 + rcom_process_message(ls, nodeid, rc);
11429 + * Send the message.
11432 + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
11434 + error = midcomms_send_message(nodeid, (struct gd_req_header *) rc,
11436 + GDLM_ASSERT(error >= 0, printk("error = %d\n", error););
11440 + * Wait for a reply. Once a reply is processed from midcomms, the
11441 + * READY bit will be set and we'll be awoken (gdlm_wait_function will
11445 + if (need_reply) {
11446 + error = gdlm_wait_function(ls, &rcom_response);
11448 + log_debug(ls, "rcom wait error %d", error);
11452 + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11453 + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11456 + up(&ls->ls_rcom_lock);
11462 + * Runs in same context as midcomms.
11465 +static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc)
11467 + gd_rcom_t rc_stack;
11468 + gd_rcom_t *reply = NULL;
11469 + gd_resdata_t *rd;
11470 + int status, datalen, maxlen;
11471 + uint32_t be_nodeid;
11476 + rcom_log(ls, nodeid, rc, 0);
11478 + if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
11479 + log_error(ls, "ignoring recovery message %x from %u",
11480 + rc->rc_subcmd, nodeid);
11484 + switch (rc->rc_subcmd) {
11486 + case RECCOMM_STATUS:
11488 + memset(&rc_stack, 0, sizeof(gd_rcom_t));
11489 + reply = &rc_stack;
11491 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11492 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11493 + reply->rc_subcmd = rc->rc_subcmd;
11494 + reply->rc_msgid = rc->rc_msgid;
11495 + reply->rc_buf[0] = 0;
11497 + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
11498 + reply->rc_buf[0] |= RESDIR_VALID;
11500 + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
11501 + reply->rc_buf[0] |= RESDIR_ALL_VALID;
11503 + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
11504 + reply->rc_buf[0] |= NODES_VALID;
11506 + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
11507 + reply->rc_buf[0] |= NODES_ALL_VALID;
11509 + reply->rc_datalen = 1;
11510 + reply->rc_header.rh_length =
11511 + sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11513 + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
11516 + case RECCOMM_RECOVERNAMES:
11518 + reply = allocate_rcom_buffer(ls);
11519 + GDLM_ASSERT(reply,);
11520 + maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
11522 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11523 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11524 + reply->rc_subcmd = rc->rc_subcmd;
11525 + reply->rc_msgid = rc->rc_msgid;
11528 + * The other node wants a bunch of resource names. The name of
11529 + * the resource to begin with is in rc->rc_buf.
11532 + datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
11533 + reply->rc_buf, maxlen, nodeid);
11535 + reply->rc_datalen = datalen;
11536 + reply->rc_header.rh_length =
11537 + sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11539 + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
11540 + reply->rc_msgid);
11543 + case RECCOMM_GETMASTER:
11545 + reply = allocate_rcom_buffer(ls);
11546 + GDLM_ASSERT(reply,);
11548 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11549 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11550 + reply->rc_subcmd = rc->rc_subcmd;
11551 + reply->rc_msgid = rc->rc_msgid;
11554 + * The other node wants to know the master of a named resource.
11557 + status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen,
11559 + if (status != 0) {
11560 + free_rcom_buffer(reply);
11564 + be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
11565 + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
11566 + reply->rc_datalen = sizeof(uint32_t);
11567 + reply->rc_header.rh_length =
11568 + sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11571 + case RECCOMM_BULKLOOKUP:
11573 + reply = allocate_rcom_buffer(ls);
11574 + GDLM_ASSERT(reply,);
11576 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11577 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11578 + reply->rc_subcmd = rc->rc_subcmd;
11579 + reply->rc_msgid = rc->rc_msgid;
11582 + * This is a bulk version of the above and just returns a
11583 + * buffer full of node ids to match the resources
11586 + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
11587 + rc->rc_datalen, reply->rc_buf);
11588 + if (datalen < 0) {
11589 + free_rcom_buffer(reply);
11594 + reply->rc_datalen = datalen;
11595 + reply->rc_header.rh_length =
11596 + sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11600 + * These RECCOMM messages don't need replies.
11603 + case RECCOMM_NEWLOCKS:
11604 + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11607 + case RECCOMM_NEWLOCKIDS:
11608 + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11611 + case RECCOMM_REMRESDATA:
11612 + remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1);
11616 + GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
11620 + if (nodeid == our_nodeid()) {
11621 + GDLM_ASSERT(rc == ls->ls_rcom,);
11622 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11623 + rc->rc_datalen = reply->rc_datalen;
11625 + midcomms_send_message(nodeid,
11626 + (struct gd_req_header *) reply,
11630 + if (reply != &rc_stack)
11631 + free_rcom_buffer(reply);
11635 +static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11637 + gd_rcom_t *rc = ls->ls_rcom;
11639 + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
11640 + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
11644 + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
11645 + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
11646 + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
11650 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11651 + rc->rc_datalen = reply->rc_datalen;
11654 + * Tell the thread waiting in rcom_send_message() that it can go ahead.
11657 + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11658 + wake_up(&ls->ls_wait_general);
11661 +static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11663 + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
11664 + reply->rc_msgid);
11668 + * Runs in same context as midcomms.
11671 +static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11673 + if (gdlm_recovery_stopped(ls)) {
11674 + log_error(ls, "ignoring recovery reply %x from %u",
11675 + reply->rc_subcmd, nodeid);
11679 + switch (reply->rc_subcmd) {
11680 + case RECCOMM_GETMASTER:
11681 + process_reply_async(ls, nodeid, reply);
11683 + case RECCOMM_STATUS:
11684 + case RECCOMM_NEWLOCKS:
11685 + case RECCOMM_NEWLOCKIDS:
11686 + case RECCOMM_RECOVERNAMES:
11687 + process_reply_sync(ls, nodeid, reply);
11690 + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
11691 + reply->rc_subcmd, nodeid);
11696 +static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header)
11698 + struct writequeue_entry *wq;
11699 + gd_rcom_t *rc = (gd_rcom_t *) header;
11700 + gd_rcom_t *reply;
11702 + wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL,
11703 + (char **)&reply);
11707 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11708 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11709 + reply->rc_subcmd = rc->rc_subcmd;
11710 + reply->rc_msgid = rc->rc_msgid;
11711 + reply->rc_buf[0] = 0;
11713 + reply->rc_datalen = 1;
11714 + reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11716 + midcomms_send_buffer((struct gd_req_header *)reply, wq);
11722 + * Runs in same context as midcomms. Both recovery requests and recovery
11723 + * replies come through this function.
11726 +void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header)
11728 + gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace);
11729 + gd_rcom_t *rc = (gd_rcom_t *) header;
11731 + /* If the lockspace doesn't exist then still send a status message
11732 + back, it's possible that it just doesn't have its global_id
11735 + send_ls_not_ready(nodeid, header);
11739 + switch (header->rh_cmd) {
11740 + case GDLM_REMCMD_RECOVERMESSAGE:
11741 + down_read(&ls->ls_rec_rsblist);
11742 + rcom_process_message(ls, nodeid, rc);
11743 + up_read(&ls->ls_rec_rsblist);
11746 + case GDLM_REMCMD_RECOVERREPLY:
11747 + rcom_process_reply(ls, nodeid, rc);
11751 + GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
11755 diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
11756 --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
11757 +++ linux-patched/cluster/dlm/reccomms.h 2004-06-25 18:31:07.000000000 +0800
11759 +/******************************************************************************
11760 +*******************************************************************************
11762 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11763 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11765 +** This copyrighted material is made available to anyone wishing to use,
11766 +** modify, copy, or redistribute it subject to the terms and conditions
11767 +** of the GNU General Public License v.2.
11769 +*******************************************************************************
11770 +******************************************************************************/
11772 +#ifndef __RECCOMMS_DOT_H__
11773 +#define __RECCOMMS_DOT_H__
11777 +#define RESDIR_VALID (1)
11778 +#define RESDIR_ALL_VALID (2)
11779 +#define NODES_VALID (4)
11780 +#define NODES_ALL_VALID (8)
11782 +#define RECCOMM_STATUS (1)
11783 +#define RECCOMM_RECOVERNAMES (2)
11784 +#define RECCOMM_GETMASTER (3)
11785 +#define RECCOMM_BULKLOOKUP (4)
11786 +#define RECCOMM_NEWLOCKS (5)
11787 +#define RECCOMM_NEWLOCKIDS (6)
11788 +#define RECCOMM_REMRESDATA (7)
11790 +int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc,
11792 +void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header);
11793 +void rcom_log_clear(gd_ls_t *ls);
11796 diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
11797 --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
11798 +++ linux-patched/cluster/dlm/recover.c 2004-06-25 18:31:07.000000000 +0800
11800 +/******************************************************************************
11801 +*******************************************************************************
11803 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11804 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11806 +** This copyrighted material is made available to anyone wishing to use,
11807 +** modify, copy, or redistribute it subject to the terms and conditions
11808 +** of the GNU General Public License v.2.
11810 +*******************************************************************************
11811 +******************************************************************************/
11813 +#include "dlm_internal.h"
11814 +#include "reccomms.h"
11816 +#include "locking.h"
11818 +#include "lockspace.h"
11820 +#include "nodes.h"
11821 +#include "config.h"
11823 +#include "memory.h"
11826 + * Called in recovery routines to check whether the recovery process has been
11827 + * interrupted/stopped by another transition. A recovery in-process will abort
11828 + * if the lockspace is "stopped" so that a new recovery process can start from
11829 + * the beginning when the lockspace is "started" again.
11832 +int gdlm_recovery_stopped(gd_ls_t *ls)
11834 + return test_bit(LSFL_LS_STOP, &ls->ls_flags);
11837 +static void gdlm_wait_timer_fn(unsigned long data)
11839 + gd_ls_t *ls = (gd_ls_t *) data;
11841 + wake_up(&ls->ls_wait_general);
11845 + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
11846 + * set due to failure of a node in ls_nodes). When another function thinks it
11847 + * could have completed the waited-on task, they should wake up ls_wait_general
11848 + * to get an immediate response rather than waiting for the timer to detect the
11849 + * result. A timer wakes us up periodically while waiting to see if we should
11850 + * abort due to a node failure.
11853 +int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls))
11855 + struct timer_list timer;
11858 + init_timer(&timer);
11859 + timer.function = gdlm_wait_timer_fn;
11860 + timer.data = (long) ls;
11863 + mod_timer(&timer, jiffies + (5 * HZ));
11865 + wchan_cond_sleep_intr(ls->ls_wait_general,
11867 + !test_bit(LSFL_LS_STOP, &ls->ls_flags));
11869 + if (timer_pending(&timer))
11870 + del_timer(&timer);
11875 + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
11884 +int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status)
11886 + gd_rcom_t rc_stack, *rc;
11891 + memset(&rc_stack, 0, sizeof(gd_rcom_t));
11893 + rc->rc_datalen = 0;
11895 + list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11897 + error = gdlm_recovery_stopped(ls);
11901 + error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
11902 + RECCOMM_STATUS, rc, 1);
11906 + status = rc->rc_buf[0];
11907 + if (status & wait_status)
11910 + set_current_state(TASK_INTERRUPTIBLE);
11911 + schedule_timeout(HZ >> 1);
11920 +int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status)
11922 + gd_rcom_t rc_stack, *rc;
11923 + uint32_t nodeid = ls->ls_low_nodeid;
11927 + memset(&rc_stack, 0, sizeof(gd_rcom_t));
11929 + rc->rc_datalen = 0;
11932 + error = gdlm_recovery_stopped(ls);
11936 + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
11940 + status = rc->rc_buf[0];
11941 + if (status & wait_status)
11944 + set_current_state(TASK_INTERRUPTIBLE);
11945 + schedule_timeout(HZ >> 1);
11953 +static int purge_queue(gd_ls_t *ls, struct list_head *queue)
11955 + gd_lkb_t *lkb, *safe;
11959 + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
11960 + if (!lkb->lkb_nodeid)
11963 + GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
11965 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
11966 + list_del(&lkb->lkb_statequeue);
11968 + rsb = lkb->lkb_resource;
11969 + lkb->lkb_status = 0;
11971 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
11972 + && &lkb->lkb_duetime)
11973 + remove_from_deadlockqueue(lkb);
11975 + release_lkb(ls, lkb);
11976 + release_rsb(rsb);
11985 + * Go through local restbl and for each rsb we're master of, clear out any
11986 + * lkb's held by departed nodes.
11989 +int restbl_lkb_purge(gd_ls_t *ls)
11991 + struct list_head *tmp2, *safe2;
11993 + gd_res_t *rootrsb, *safe, *rsb;
11995 + log_all(ls, "purge locks of departed nodes");
11997 + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
11999 + rootrsb->res_resdir_seq = 1;
12001 + if (rootrsb->res_nodeid)
12004 + hold_rsb(rootrsb);
12005 + down_write(&rootrsb->res_lock);
12007 + /* This traverses the subreslist in reverse order so we purge
12008 + * the children before their parents. */
12010 + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12011 + tmp2 != &rootrsb->res_subreslist;
12012 + tmp2 = safe2, safe2 = safe2->prev) {
12013 + rsb = list_entry(tmp2, gd_res_t, res_subreslist);
12016 + purge_queue(ls, &rsb->res_grantqueue);
12017 + purge_queue(ls, &rsb->res_convertqueue);
12018 + purge_queue(ls, &rsb->res_waitqueue);
12019 + release_rsb(rsb);
12021 + count += purge_queue(ls, &rootrsb->res_grantqueue);
12022 + count += purge_queue(ls, &rootrsb->res_convertqueue);
12023 + count += purge_queue(ls, &rootrsb->res_waitqueue);
12025 + up_write(&rootrsb->res_lock);
12026 + release_rsb(rootrsb);
12029 + log_all(ls, "purged %d locks", count);
12035 + * Grant any locks that have become grantable after a purge
12038 +int restbl_grant_after_purge(gd_ls_t *ls)
12040 + gd_res_t *root, *rsb, *safe;
12043 + down_write(&ls->ls_gap_rsblist);
12045 + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12046 + /* only the rsb master grants locks */
12047 + if (root->res_nodeid)
12050 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12051 + log_debug(ls, "restbl_grant_after_purge aborted");
12053 + up_write(&ls->ls_gap_rsblist);
12057 + down_write(&root->res_lock);
12058 + grant_pending_locks(root);
12059 + up_write(&root->res_lock);
12061 + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12062 + down_write(&rsb->res_lock);
12063 + grant_pending_locks(rsb);
12064 + up_write(&rsb->res_lock);
12067 + up_write(&ls->ls_gap_rsblist);
12074 + * Set the lock master for all LKBs in a lock queue
12077 +static void set_lock_master(struct list_head *queue, int nodeid)
12081 + list_for_each_entry(lkb, queue, lkb_statequeue) {
12082 + /* Don't muck around with pre-existing sublocks */
12083 + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12084 + lkb->lkb_nodeid = nodeid;
12088 +static void set_master_lkbs(gd_res_t *rsb)
12090 + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12091 + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12092 + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12096 + * This rsb struct is now the master so it is responsible for keeping the
12097 + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to
12098 + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
12099 + * this rsb in deserialise_lkb.
12102 +static void set_rsb_lvb(gd_res_t *rsb)
12106 + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12108 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12109 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12110 + (lkb->lkb_grmode > DLM_LOCK_NL))
12112 + if (!rsb->res_lvbptr)
12113 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12115 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12120 + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12122 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12123 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12124 + (lkb->lkb_grmode > DLM_LOCK_NL))
12126 + if (!rsb->res_lvbptr)
12127 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12129 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12136 + * Propagate the new master nodeid to locks, subrsbs, sublocks.
12137 + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12140 +static void set_new_master(gd_res_t *rsb)
12142 + gd_res_t *subrsb;
12144 + down_write(&rsb->res_lock);
12146 + if (rsb->res_nodeid == our_nodeid()) {
12147 + rsb->res_nodeid = 0;
12148 + set_rsb_lvb(rsb);
12151 + set_master_lkbs(rsb);
12153 + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
12154 + subrsb->res_nodeid = rsb->res_nodeid;
12155 + set_master_lkbs(subrsb);
12158 + up_write(&rsb->res_lock);
12160 + set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
12164 + * The recover_list contains all the rsb's for which we've requested the new
12165 + * master nodeid. As replies are returned from the resource directories the
12166 + * rsb's are removed from the list. When the list is empty we're done.
12168 + * The recover_list is later similarly used for all rsb's for which we've sent
12169 + * new lkb's and need to receive new corresponding lkid's.
12172 +int recover_list_empty(gd_ls_t *ls)
12176 + spin_lock(&ls->ls_recover_list_lock);
12177 + empty = list_empty(&ls->ls_recover_list);
12178 + spin_unlock(&ls->ls_recover_list_lock);
12183 +int recover_list_count(gd_ls_t *ls)
12187 + spin_lock(&ls->ls_recover_list_lock);
12188 + count = ls->ls_recover_list_count;
12189 + spin_unlock(&ls->ls_recover_list_lock);
12194 +void recover_list_add(gd_res_t *rsb)
12196 + gd_ls_t *ls = rsb->res_ls;
12198 + spin_lock(&ls->ls_recover_list_lock);
12199 + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
12200 + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
12201 + ls->ls_recover_list_count++;
12204 + spin_unlock(&ls->ls_recover_list_lock);
12207 +void recover_list_del(gd_res_t *rsb)
12209 + gd_ls_t *ls = rsb->res_ls;
12211 + spin_lock(&ls->ls_recover_list_lock);
12212 + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
12213 + list_del(&rsb->res_recover_list);
12214 + ls->ls_recover_list_count--;
12215 + spin_unlock(&ls->ls_recover_list_lock);
12217 + release_rsb(rsb);
12220 +static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid)
12222 + gd_res_t *rsb = NULL;
12224 + spin_lock(&ls->ls_recover_list_lock);
12226 + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
12227 + if (rsb->res_recover_msgid == msgid)
12233 + spin_unlock(&ls->ls_recover_list_lock);
12238 +static void recover_list_clear(gd_ls_t *ls)
12243 + spin_lock(&ls->ls_recover_list_lock);
12245 + while (!list_empty(&ls->ls_recover_list)) {
12246 + rsb = list_entry(ls->ls_recover_list.next, gd_res_t,
12247 + res_recover_list);
12248 + list_del(&rsb->res_recover_list);
12249 + ls->ls_recover_list_count--;
12251 + spin_unlock(&ls->ls_recover_list_lock);
12257 +void recover_list_dump(gd_ls_t *ls)
12259 + struct list_head *tmp;
12262 + spin_lock(&ls->ls_recover_list_lock);
12264 + printk("recover_list_count=%d\n", ls->ls_recover_list_count);
12266 + list_for_each(tmp, &ls->ls_recover_list) {
12267 + rsb = list_entry(tmp, gd_res_t, res_recover_list);
12268 + gdlm_res_dbprint(rsb);
12270 + spin_unlock(&ls->ls_recover_list_lock);
12274 +static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc)
12276 + gd_ls_t *ls = rsb->res_ls;
12277 + gd_resdata_t *rd;
12278 + uint32_t dir_nodeid;
12281 + dir_nodeid = get_directory_nodeid(rsb);
12283 + if (dir_nodeid == our_nodeid()) {
12284 + error = get_resdata(ls, dir_nodeid, rsb->res_name,
12285 + rsb->res_length, &rd, 1);
12289 + rsb->res_nodeid = rd->rd_master_nodeid;
12290 + set_new_master(rsb);
12292 + /* As we are the only thread doing recovery this
12293 + should be safe. if not then we need to use a different
12294 + ID somehow. We must set it in the RSB before rcom_send_msg
12295 + completes cos we may get a reply quite quickly.
12297 + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
12299 + recover_list_add(rsb);
12301 + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
12302 + rc->rc_datalen = rsb->res_length;
12304 + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
12315 + * Go through local root resources and for each rsb which has a master which
12316 + * has departed, get the new master nodeid from the resdir. The resdir will
12317 + * assign mastery to the first node to look up the new master. That means
12318 + * we'll discover in this lookup if we're the new master of any rsb's.
12320 + * We fire off all the resdir requests individually and asynchronously to the
12321 + * correct resdir node. The replies are processed in rsb_master_recv().
12324 +int restbl_rsb_update(gd_ls_t *ls)
12326 + gd_res_t *rsb, *safe;
12328 + int error = -ENOMEM;
12331 + log_all(ls, "update remastered resources");
12333 + rc = allocate_rcom_buffer(ls);
12337 + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
12338 + if (!rsb->res_nodeid)
12341 + error = gdlm_recovery_stopped(ls);
12345 + if (in_nodes_gone(ls, rsb->res_nodeid)) {
12346 + error = rsb_master_lookup(rsb, rc);
12353 + error = gdlm_wait_function(ls, &recover_list_empty);
12355 + log_all(ls, "updated %d resources", count);
12358 + free_rcom_buffer(rc);
12364 +int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length,
12368 + uint32_t be_nodeid;
12370 + rsb = recover_list_find(ls, msgid);
12372 + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
12376 + memcpy(&be_nodeid, buf, sizeof(uint32_t));
12377 + rsb->res_nodeid = be32_to_cpu(be_nodeid);
12378 + set_new_master(rsb);
12379 + recover_list_del(rsb);
12381 + if (recover_list_empty(ls))
12382 + wake_up(&ls->ls_wait_general);
12389 + * This function not used any longer.
12392 +int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen,
12395 + char *inbufptr, *outbufptr;
12398 + * The other node wants nodeids matching the resource names in inbuf.
12399 + * The resource names are packed into inbuf as
12400 + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
12401 + * lenX bytes. Matching nodeids are packed into outbuf in order
12402 + * [nodeid1][nodeid2]...
12405 + inbufptr = inbuf;
12406 + outbufptr = outbuf;
12408 + while (inbufptr < inbuf + inlen) {
12409 + gd_resdata_t *rd;
12410 + uint32_t be_nodeid;
12413 + status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr,
12418 + inbufptr += *inbufptr + 1;
12420 + be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
12421 + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
12422 + outbufptr += sizeof(uint32_t);
12424 + /* add assertion that outbufptr - outbuf is not > than ... */
12427 + return (outbufptr - outbuf);
12432 diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
12433 --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
12434 +++ linux-patched/cluster/dlm/recover.h 2004-06-25 18:31:07.000000000 +0800
12436 +/******************************************************************************
12437 +*******************************************************************************
12439 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12440 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12442 +** This copyrighted material is made available to anyone wishing to use,
12443 +** modify, copy, or redistribute it subject to the terms and conditions
12444 +** of the GNU General Public License v.2.
12446 +*******************************************************************************
12447 +******************************************************************************/
12449 +#ifndef __RECOVER_DOT_H__
12450 +#define __RECOVER_DOT_H__
12452 +int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls));
12453 +int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status);
12454 +int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status);
12455 +int gdlm_recovery_stopped(gd_ls_t * ls);
12456 +int recover_list_empty(gd_ls_t * ls);
12457 +int recover_list_count(gd_ls_t * ls);
12458 +void recover_list_add(gd_res_t * rsb);
12459 +void recover_list_del(gd_res_t * rsb);
12460 +void recover_list_dump(gd_ls_t * ls);
12461 +int restbl_lkb_purge(gd_ls_t * ls);
12462 +void restbl_grant_after_purge(gd_ls_t * ls);
12463 +int restbl_rsb_update(gd_ls_t * ls);
12464 +int restbl_rsb_update_recv(gd_ls_t * ls, int nodeid, char *buf, int len,
12466 +int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen,
12469 +#endif /* __RECOVER_DOT_H__ */
12470 diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
12471 --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
12472 +++ linux-patched/cluster/dlm/recoverd.c 2004-06-25 18:31:07.000000000 +0800
12474 +/******************************************************************************
12475 +*******************************************************************************
12477 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12478 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12480 +** This copyrighted material is made available to anyone wishing to use,
12481 +** modify, copy, or redistribute it subject to the terms and conditions
12482 +** of the GNU General Public License v.2.
12484 +*******************************************************************************
12485 +******************************************************************************/
12487 +#include "dlm_internal.h"
12488 +#include "nodes.h"
12491 +#include "recover.h"
12492 +#include "lockspace.h"
12493 +#include "lowcomms.h"
12494 +#include "lockqueue.h"
12496 +#include "rebuild.h"
12499 + * next_move actions
12502 +#define DO_STOP (1)
12503 +#define DO_START (2)
12504 +#define DO_FINISH (3)
12505 +#define DO_FINISH_STOP (4)
12506 +#define DO_FINISH_START (5)
12509 + * recoverd_flags for thread
12512 +#define THREAD_STOP (0)
12515 + * local thread variables
12518 +static unsigned long recoverd_flags;
12519 +static struct completion recoverd_run;
12520 +static wait_queue_head_t recoverd_wait;
12521 +static struct task_struct *recoverd_task;
12524 + * Queue of lockspaces (gr_recover_t structs) which need to be
12525 + * started/recovered
12528 +static struct list_head recoverd_start_queue;
12529 +static atomic_t recoverd_start_count;
12531 +extern struct list_head lslist;
12532 +extern spinlock_t lslist_lock;
12534 +void dlm_recoverd_init(void)
12536 + INIT_LIST_HEAD(&recoverd_start_queue);
12537 + atomic_set(&recoverd_start_count, 0);
12539 + init_completion(&recoverd_run);
12540 + init_waitqueue_head(&recoverd_wait);
12541 + memset(&recoverd_flags, 0, sizeof(unsigned long));
12544 +static int enable_locking(gd_ls_t *ls, int event_id)
12548 + spin_lock(&ls->ls_recover_lock);
12549 + if (ls->ls_last_stop < event_id) {
12550 + set_bit(LSFL_LS_RUN, &ls->ls_flags);
12551 + up_write(&ls->ls_in_recovery);
12554 + log_debug(ls, "enable_locking: abort %d", event_id);
12556 + spin_unlock(&ls->ls_recover_lock);
12560 +static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr)
12564 + log_all(ls, "recover event %u (first)", gr->gr_event_id);
12566 + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
12568 + error = ls_nodes_init(ls, gr);
12570 + log_error(ls, "nodes_init failed %d", error);
12574 + error = resdir_rebuild_local(ls);
12576 + log_error(ls, "resdir_rebuild_local failed %d", error);
12580 + error = resdir_rebuild_wait(ls);
12582 + log_error(ls, "resdir_rebuild_wait failed %d", error);
12586 + log_all(ls, "recover event %u done", gr->gr_event_id);
12587 + kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12594 + * We are given here a new group of nodes which are in the lockspace. We first
12595 + * figure out the differences in ls membership from when we were last running.
12596 + * If nodes from before are gone, then there will be some lock recovery to do.
12597 + * If there are only nodes which have joined, then there's no lock recovery.
11599 + * note: cman requires an rc to finish starting on a revent (where nodes die)
12600 + * before it allows an sevent (where nodes join) to be processed. This means
12601 + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
12605 +static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr)
12607 + int error, neg = 0;
12609 + log_all(ls, "recover event %u", gr->gr_event_id);
12612 + * Add or remove nodes from the lockspace's ls_nodes list.
12615 + error = ls_nodes_reconfig(ls, gr, &neg);
12617 + log_error(ls, "nodes_reconfig failed %d", error);
12622 + * Rebuild our own share of the resdir by collecting from all other
12623 + * nodes rsb name/master pairs for which the name hashes to us.
12626 + error = resdir_rebuild_local(ls);
12628 + log_error(ls, "resdir_rebuild_local failed %d", error);
12633 + * Purge resdir-related requests that are being held in requestqueue.
12634 + * All resdir requests from before recovery started are invalid now due
12635 + * to the resdir rebuild and will be resent by the requesting nodes.
12638 + purge_requestqueue(ls);
12639 + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12642 + * Wait for all nodes to complete resdir rebuild.
12645 + error = resdir_rebuild_wait(ls);
12647 + log_error(ls, "resdir_rebuild_wait failed %d", error);
12652 + * Mark our own lkb's waiting in the lockqueue for remote replies from
12653 + * nodes that are now departed. These will be resent to the new
12654 + * masters in resend_cluster_requests. Also mark resdir lookup
12655 + * requests for resending.
12658 + lockqueue_lkb_mark(ls);
12660 + error = gdlm_recovery_stopped(ls);
12666 + * Clear lkb's for departed nodes. This can't fail since it
12667 + * doesn't involve communicating with other nodes.
12670 + down_write(&ls->ls_rec_rsblist);
12671 + restbl_lkb_purge(ls);
12672 + up_write(&ls->ls_rec_rsblist);
12674 + down_read(&ls->ls_rec_rsblist);
12677 + * Get new master id's for rsb's of departed nodes. This fails
12678 + * if we can't communicate with other nodes.
12681 + error = restbl_rsb_update(ls);
12683 + log_error(ls, "restbl_rsb_update failed %d", error);
12688 + * Send our lkb info to new masters. This fails if we can't
12689 + * communicate with a node.
12692 + error = rebuild_rsbs_send(ls);
12694 + log_error(ls, "rebuild_rsbs_send failed %d", error);
12697 + up_read(&ls->ls_rec_rsblist);
12700 + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12702 + log_all(ls, "recover event %u done", gr->gr_event_id);
12703 + kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12707 + up_read(&ls->ls_rec_rsblist);
12709 + log_all(ls, "recover event %d error %d", gr->gr_event_id, error);
12713 +static void clear_finished_nodes(gd_ls_t *ls, int finish_event)
12715 + gd_csb_t *csb, *safe;
12717 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) {
12718 + if (csb->csb_gone_event <= finish_event) {
12719 + list_del(&csb->csb_list);
12720 + release_csb(csb);
12726 + * Between calls to this routine for a ls, there can be multiple stop/start
12727 + * events from cman where every start but the latest is cancelled by stops.
12728 + * There can only be a single finish from cman because every finish requires us
12729 + * to call start_done. A single finish event could be followed by multiple
12730 + * stop/start events. This routine takes any combination of events from cman
12731 + * and boils them down to one course of action.
12734 +int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out)
12736 + LIST_HEAD(events);
12737 + unsigned int cmd = 0, stop, start, finish;
12738 + unsigned int last_stop, last_start, last_finish;
12739 + gd_recover_t *gr = NULL, *start_gr = NULL;
12742 + * Grab the current state of cman/sm events.
12745 + spin_lock(&ls->ls_recover_lock);
12747 + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
12748 + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
12749 + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
12751 + last_stop = ls->ls_last_stop;
12752 + last_start = ls->ls_last_start;
12753 + last_finish = ls->ls_last_finish;
12755 + while (!list_empty(&ls->ls_recover)) {
12756 + gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list);
12757 + list_del(&gr->gr_list);
12758 + list_add_tail(&gr->gr_list, &events);
12760 + spin_unlock(&ls->ls_recover_lock);
12762 + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
12763 + last_stop, last_start, last_finish);
12766 + * Toss start events which have since been cancelled.
12769 + while (!list_empty(&events)) {
12770 + GDLM_ASSERT(start,);
12771 + gr = list_entry(events.next, gd_recover_t, gr_list);
12772 + list_del(&gr->gr_list);
12774 + if (gr->gr_event_id <= last_stop) {
12775 + log_debug(ls, "move skip event %u", gr->gr_event_id);
12776 + kfree(gr->gr_nodeids);
12777 + free_dlm_recover(gr);
12780 + log_debug(ls, "move use event %u", gr->gr_event_id);
12781 + GDLM_ASSERT(!start_gr,);
12787 + * Eight possible combinations of events.
12791 + if (!stop && !start && !finish) {
12792 + GDLM_ASSERT(!start_gr,);
12798 + if (!stop && !start && finish) {
12799 + GDLM_ASSERT(!start_gr,);
12800 + GDLM_ASSERT(last_start > last_stop,);
12801 + GDLM_ASSERT(last_finish == last_start,);
12803 + *finish_out = last_finish;
12808 + if (!stop && start && !finish) {
12809 + GDLM_ASSERT(start_gr,);
12810 + GDLM_ASSERT(last_start > last_stop,);
12812 + *gr_out = start_gr;
12817 + if (!stop && start && finish) {
12818 + GDLM_ASSERT(0, printk("finish and start with no stop\n"););
12822 + if (stop && !start && !finish) {
12823 + GDLM_ASSERT(!start_gr,);
12824 + GDLM_ASSERT(last_start == last_stop,);
12830 + if (stop && !start && finish) {
12831 + GDLM_ASSERT(!start_gr,);
12832 + GDLM_ASSERT(last_finish == last_start,);
12833 + GDLM_ASSERT(last_stop == last_start,);
12834 + cmd = DO_FINISH_STOP;
12835 + *finish_out = last_finish;
12840 + if (stop && start && !finish) {
12842 + GDLM_ASSERT(last_start > last_stop,);
12844 + *gr_out = start_gr;
12846 + GDLM_ASSERT(last_stop == last_start,);
12853 + if (stop && start && finish) {
12855 + GDLM_ASSERT(last_start > last_stop,);
12856 + GDLM_ASSERT(last_start > last_finish,);
12857 + cmd = DO_FINISH_START;
12858 + *finish_out = last_finish;
12859 + *gr_out = start_gr;
12861 + GDLM_ASSERT(last_start == last_stop,);
12862 + GDLM_ASSERT(last_start > last_finish,);
12863 + cmd = DO_FINISH_STOP;
12864 + *finish_out = last_finish;
12874 + * This function decides what to do given every combination of current
12875 + * lockspace state and next lockspace state.
12878 +static void do_ls_recovery(gd_ls_t *ls)
12880 + gd_recover_t *gr = NULL;
12881 + int error, cur_state, next_state = 0, do_now, finish_event = 0;
12883 + do_now = next_move(ls, &gr, &finish_event);
12887 + cur_state = ls->ls_state;
12890 + GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
12891 + log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
12894 + * LSST_CLEAR - we're not in any recovery state. We can get a stop or
12895 + * a stop and start which equates with a START.
12898 + if (cur_state == LSST_CLEAR) {
12899 + switch (do_now) {
12901 + next_state = LSST_WAIT_START;
12905 + error = ls_reconfig(ls, gr);
12907 + next_state = LSST_WAIT_START;
12909 + next_state = LSST_RECONFIG_DONE;
12912 + case DO_FINISH: /* invalid */
12913 + case DO_FINISH_STOP: /* invalid */
12914 + case DO_FINISH_START: /* invalid */
12922 + * LSST_WAIT_START - we're not running because of getting a stop or
12923 + * failing a start. We wait in this state for another stop/start or
12924 + * just the next start to begin another reconfig attempt.
12927 + if (cur_state == LSST_WAIT_START) {
12928 + switch (do_now) {
12933 + error = ls_reconfig(ls, gr);
12935 + next_state = LSST_WAIT_START;
12937 + next_state = LSST_RECONFIG_DONE;
12940 + case DO_FINISH: /* invalid */
12941 + case DO_FINISH_STOP: /* invalid */
12942 + case DO_FINISH_START: /* invalid */
12950 + * LSST_RECONFIG_DONE - we entered this state after successfully
12951 + * completing ls_reconfig and calling kcl_start_done. We expect to get
12952 + * a finish if everything goes ok. A finish could be followed by stop
12953 + * or stop/start before we get here to check it. Or a finish may never
12954 + * happen, only stop or stop/start.
12957 + if (cur_state == LSST_RECONFIG_DONE) {
12958 + switch (do_now) {
12960 + clear_finished_nodes(ls, finish_event);
12961 + next_state = LSST_CLEAR;
12963 + error = enable_locking(ls, finish_event);
12967 + error = process_requestqueue(ls);
12971 + error = resend_cluster_requests(ls);
12975 + restbl_grant_after_purge(ls);
12977 + log_all(ls, "recover event %u finished", finish_event);
12981 + next_state = LSST_WAIT_START;
12984 + case DO_FINISH_STOP:
12985 + clear_finished_nodes(ls, finish_event);
12986 + next_state = LSST_WAIT_START;
12989 + case DO_FINISH_START:
12990 + clear_finished_nodes(ls, finish_event);
12991 + /* fall into DO_START */
12994 + error = ls_reconfig(ls, gr);
12996 + next_state = LSST_WAIT_START;
12998 + next_state = LSST_RECONFIG_DONE;
13008 + * LSST_INIT - state after ls is created and before it has been
13009 + * started. A start operation will cause the ls to be started for the
13010 + * first time. A failed start will cause to just wait in INIT for
13011 + * another stop/start.
13014 + if (cur_state == LSST_INIT) {
13015 + switch (do_now) {
13017 + error = ls_first_start(ls, gr);
13019 + next_state = LSST_INIT_DONE;
13025 + case DO_FINISH: /* invalid */
13026 + case DO_FINISH_STOP: /* invalid */
13027 + case DO_FINISH_START: /* invalid */
13035 + * LSST_INIT_DONE - after the first start operation is completed
13036 + * successfully and kcl_start_done() called. If there are no errors, a
13037 + * finish will arrive next and we'll move to LSST_CLEAR.
13040 + if (cur_state == LSST_INIT_DONE) {
13041 + switch (do_now) {
13043 + case DO_FINISH_STOP:
13044 + next_state = LSST_WAIT_START;
13048 + case DO_FINISH_START:
13049 + error = ls_reconfig(ls, gr);
13051 + next_state = LSST_WAIT_START;
13053 + next_state = LSST_RECONFIG_DONE;
13057 + next_state = LSST_CLEAR;
13058 + enable_locking(ls, finish_event);
13059 + log_all(ls, "recover event %u finished", finish_event);
13070 + ls->ls_state = next_state;
13073 + kfree(gr->gr_nodeids);
13074 + free_dlm_recover(gr);
13078 +static __inline__ gd_ls_t *get_work(int clear)
13082 + spin_lock(&lslist_lock);
13084 + list_for_each_entry(ls, &lslist, ls_list) {
13086 + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
13090 + if (test_bit(LSFL_WORK, &ls->ls_flags))
13097 + spin_unlock(&lslist_lock);
13103 + * Thread which does recovery for all lockspaces.
13106 +static int dlm_recoverd(void *arg)
13110 + daemonize("dlm_recoverd");
13111 + recoverd_task = current;
13112 + complete(&recoverd_run);
13114 + while (!test_bit(THREAD_STOP, &recoverd_flags)) {
13115 + wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
13116 + if ((ls = get_work(1)))
13117 + do_ls_recovery(ls);
13120 + complete(&recoverd_run);
13125 + * Mark a specific lockspace as needing work and wake up the thread to do it.
13128 +void recoverd_kick(gd_ls_t *ls)
13130 + set_bit(LSFL_WORK, &ls->ls_flags);
13131 + wake_up(&recoverd_wait);
13135 + * Start the recoverd thread when gdlm is started (before any lockspaces).
13138 +int recoverd_start(void)
13142 + clear_bit(THREAD_STOP, &recoverd_flags);
13143 + error = kernel_thread(dlm_recoverd, NULL, 0);
13148 + wait_for_completion(&recoverd_run);
13155 + * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone).
13158 +int recoverd_stop(void)
13160 + set_bit(THREAD_STOP, &recoverd_flags);
13161 + wake_up(&recoverd_wait);
13162 + wait_for_completion(&recoverd_run);
13166 diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
13167 --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
13168 +++ linux-patched/cluster/dlm/recoverd.h 2004-06-25 18:31:07.000000000 +0800
13170 +/******************************************************************************
13171 +*******************************************************************************
13173 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13174 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13176 +** This copyrighted material is made available to anyone wishing to use,
13177 +** modify, copy, or redistribute it subject to the terms and conditions
13178 +** of the GNU General Public License v.2.
13180 +*******************************************************************************
13181 +******************************************************************************/
13183 +#ifndef __RECOVERD_DOT_H__
13184 +#define __RECOVERD_DOT_H__
13186 +void dlm_recoverd_init(void);
13187 +void recoverd_kick(gd_ls_t * ls);
13188 +int recoverd_start(void);
13189 +int recoverd_stop(void);
13191 +#endif /* __RECOVERD_DOT_H__ */
13192 diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
13193 --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
13194 +++ linux-patched/cluster/dlm/rsb.c 2004-06-25 18:31:07.000000000 +0800
13196 +/******************************************************************************
13197 +*******************************************************************************
13199 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13200 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13202 +** This copyrighted material is made available to anyone wishing to use,
13203 +** modify, copy, or redistribute it subject to the terms and conditions
13204 +** of the GNU General Public License v.2.
13206 +*******************************************************************************
13207 +******************************************************************************/
13209 +#include "dlm_internal.h"
13210 +#include "locking.h"
13211 +#include "memory.h"
13212 +#include "lockqueue.h"
13213 +#include "nodes.h"
13217 +static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent,
13218 + char *name, int namelen)
13222 + list_for_each_entry(r, head, res_hashchain) {
13223 + if ((parent == r->res_parent) && (namelen == r->res_length) &&
13224 + (memcmp(name, r->res_name, namelen) == 0)) {
13225 + atomic_inc(&r->res_ref);
13234 + * A way to arbitrarily hold onto an rsb which we already have a reference to
13235 + * to make sure it doesn't go away. Opposite of release_rsb().
13238 +void hold_rsb(gd_res_t *r)
13240 + atomic_inc(&r->res_ref);
13244 + * release_rsb() - Decrement reference count on rsb struct. Free the rsb
13245 + * struct when there are zero references. Every lkb for the rsb adds a
13246 + * reference. When ref is zero there can be no more lkb's for the rsb, on the
13247 + * queues or anywhere else.
13250 +void release_rsb(gd_res_t *r)
13252 + gd_ls_t *ls = r->res_ls;
13253 + int removed = FALSE;
13255 + write_lock(&ls->ls_reshash_lock);
13256 + atomic_dec(&r->res_ref);
13258 + if (!atomic_read(&r->res_ref)) {
13259 + GDLM_ASSERT(list_empty(&r->res_grantqueue),);
13260 + GDLM_ASSERT(list_empty(&r->res_waitqueue),);
13261 + GDLM_ASSERT(list_empty(&r->res_convertqueue),);
13263 + list_del(&r->res_hashchain);
13265 + write_unlock(&ls->ls_reshash_lock);
13268 + down_read(&ls->ls_gap_rsblist);
13269 + if (r->res_parent)
13270 + list_del(&r->res_subreslist);
13272 + list_del(&r->res_rootlist);
13273 + up_read(&ls->ls_gap_rsblist);
13276 + * Remove resdir entry if this was a locally mastered root rsb.
13278 + if (!r->res_parent && !r->res_nodeid) {
13279 + if (get_directory_nodeid(r) != our_nodeid())
13280 + remote_remove_resdata(r->res_ls,
13281 + get_directory_nodeid(r),
13284 + r->res_resdir_seq);
13286 + remove_resdata(r->res_ls, our_nodeid(),
13287 + r->res_name, r->res_length,
13288 + r->res_resdir_seq);
13291 + if (r->res_lvbptr)
13292 + free_lvb(r->res_lvbptr);
13299 + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
13300 + * If the rsb exists, its ref count is incremented by this function. If it
13301 + * doesn't exist, it's created with a ref count of one.
13304 +int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen,
13305 + int create, gd_res_t **rp)
13308 + gd_res_t *r, *tmp;
13309 + int error = -ENOMEM;
13311 + GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
13313 + hash = gdlm_hash(name, namelen);
13314 + hash &= ls->ls_hashmask;
13316 + read_lock(&ls->ls_reshash_lock);
13317 + r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13318 + read_unlock(&ls->ls_reshash_lock);
13327 + r = allocate_rsb(ls, namelen);
13331 + INIT_LIST_HEAD(&r->res_subreslist);
13332 + INIT_LIST_HEAD(&r->res_grantqueue);
13333 + INIT_LIST_HEAD(&r->res_convertqueue);
13334 + INIT_LIST_HEAD(&r->res_waitqueue);
13336 + memcpy(r->res_name, name, namelen);
13337 + r->res_length = namelen;
13339 + init_rwsem(&r->res_lock);
13340 + atomic_set(&r->res_ref, 1);
13343 + r->res_parent = parent;
13344 + r->res_depth = parent->res_depth + 1;
13345 + r->res_root = parent->res_root;
13346 + r->res_nodeid = parent->res_nodeid;
13348 + r->res_parent = NULL;
13349 + r->res_depth = 1;
13351 + r->res_nodeid = -1;
13354 + write_lock(&ls->ls_reshash_lock);
13355 + tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13357 + write_unlock(&ls->ls_reshash_lock);
13361 + list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]);
13362 + write_unlock(&ls->ls_reshash_lock);
13364 + down_read(&ls->ls_gap_rsblist);
13366 + list_add_tail(&r->res_subreslist,
13367 + &r->res_root->res_subreslist);
13369 + list_add(&r->res_rootlist, &ls->ls_rootres);
13370 + up_read(&ls->ls_gap_rsblist);
13384 + * Add an LKB to a resource's grant/convert/wait queue, in order.
13387 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
13389 + gd_lkb_t *lkb = NULL;
13391 + list_for_each_entry(lkb, head, lkb_statequeue) {
13392 + if (lkb->lkb_rqmode < mode)
13397 + /* No entries in the queue, we are alone */
13398 + list_add_tail(new, head);
13400 + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
13405 + * The rsb res_lock must be held in write when this function is called.
13408 +void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13411 + GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status););
13413 + lkb->lkb_status = type;
13416 + case GDLM_LKSTS_WAITING:
13417 + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
13420 + case GDLM_LKSTS_GRANTED:
13421 + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
13422 + lkb->lkb_grmode);
13425 + case GDLM_LKSTS_CONVERT:
13426 + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
13427 + list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
13430 + if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
13431 + list_add_tail(&lkb->lkb_statequeue,
13432 + &r->res_convertqueue);
13434 + lkb_add_ordered(&lkb->lkb_statequeue,
13435 + &r->res_convertqueue, lkb->lkb_rqmode);
13443 +void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13445 + down_write(&r->res_lock);
13446 + lkb_enqueue(r, lkb, type);
13447 + up_write(&r->res_lock);
13451 + * The rsb res_lock must be held in write when this function is called.
13454 +int lkb_dequeue(gd_lkb_t *lkb)
13456 + int status = lkb->lkb_status;
13461 + lkb->lkb_status = 0;
13462 + list_del(&lkb->lkb_statequeue);
13468 +int res_lkb_dequeue(gd_lkb_t *lkb)
13472 + down_write(&lkb->lkb_resource->res_lock);
13473 + status = lkb_dequeue(lkb);
13474 + up_write(&lkb->lkb_resource->res_lock);
13480 + * The rsb res_lock must be held in write when this function is called.
13483 +int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13487 + status = lkb_dequeue(lkb);
13488 + lkb_enqueue(r, lkb, type);
13493 +int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13497 + down_write(&r->res_lock);
13498 + status = lkb_swqueue(r, lkb, type);
13499 + up_write(&r->res_lock);
13503 diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
13504 --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
13505 +++ linux-patched/cluster/dlm/rsb.h 2004-06-25 18:31:07.000000000 +0800
13507 +/******************************************************************************
13508 +*******************************************************************************
13510 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13511 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13513 +** This copyrighted material is made available to anyone wishing to use,
13514 +** modify, copy, or redistribute it subject to the terms and conditions
13515 +** of the GNU General Public License v.2.
13517 +*******************************************************************************
13518 +******************************************************************************/
13520 +#ifndef __RSB_DOT_H__
13521 +#define __RSB_DOT_H__
13523 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
13524 +void _release_rsb(gd_res_t * r);
13525 +void release_rsb(gd_res_t * r);
13526 +void hold_rsb(gd_res_t * r);
13527 +int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen,
13528 + int create, gd_res_t ** rp);
13529 +void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13530 +void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13531 +int lkb_dequeue(gd_lkb_t * lkb);
13532 +int res_lkb_dequeue(gd_lkb_t * lkb);
13533 +int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13534 +int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13536 +#endif /* __RSB_DOT_H__ */
13537 diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
13538 --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
13539 +++ linux-patched/cluster/dlm/util.c 2004-06-25 18:31:07.000000000 +0800
13541 +/******************************************************************************
13542 +*******************************************************************************
13544 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13545 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13547 +** This copyrighted material is made available to anyone wishing to use,
13548 +** modify, copy, or redistribute it subject to the terms and conditions
13549 +** of the GNU General Public License v.2.
13551 +*******************************************************************************
13552 +******************************************************************************/
13554 +#include "dlm_internal.h"
13556 +static const uint32_t crc_32_tab[] = {
13557 + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
13558 + 0xe963a535, 0x9e6495a3,
13559 + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
13560 + 0xe7b82d07, 0x90bf1d91,
13561 + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
13562 + 0xf4d4b551, 0x83d385c7,
13563 + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
13564 + 0xfa0f3d63, 0x8d080df5,
13565 + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
13566 + 0xd20d85fd, 0xa50ab56b,
13567 + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
13568 + 0xdcd60dcf, 0xabd13d59,
13569 + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
13570 + 0xcfba9599, 0xb8bda50f,
13571 + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
13572 + 0xc1611dab, 0xb6662d3d,
13573 + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
13574 + 0x9fbfe4a5, 0xe8b8d433,
13575 + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
13576 + 0x91646c97, 0xe6635c01,
13577 + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
13578 + 0x8208f4c1, 0xf50fc457,
13579 + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
13580 + 0x8cd37cf3, 0xfbd44c65,
13581 + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
13582 + 0xa4d1c46d, 0xd3d6f4fb,
13583 + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
13584 + 0xaa0a4c5f, 0xdd0d7cc9,
13585 + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
13586 + 0xb966d409, 0xce61e49f,
13587 + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
13588 + 0xb7bd5c3b, 0xc0ba6cad,
13589 + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
13590 + 0x04db2615, 0x73dc1683,
13591 + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
13592 + 0x0a00ae27, 0x7d079eb1,
13593 + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
13594 + 0x196c3671, 0x6e6b06e7,
13595 + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
13596 + 0x17b7be43, 0x60b08ed5,
13597 + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
13598 + 0x3fb506dd, 0x48b2364b,
13599 + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
13600 + 0x316e8eef, 0x4669be79,
13601 + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
13602 + 0x220216b9, 0x5505262f,
13603 + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
13604 + 0x2cd99e8b, 0x5bdeae1d,
13605 + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
13606 + 0x72076785, 0x05005713,
13607 + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
13608 + 0x7cdcefb7, 0x0bdbdf21,
13609 + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
13610 + 0x6fb077e1, 0x18b74777,
13611 + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
13612 + 0x616bffd3, 0x166ccf45,
13613 + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
13614 + 0x4969474d, 0x3e6e77db,
13615 + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
13616 + 0x47b2cf7f, 0x30b5ffe9,
13617 + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
13618 + 0x54de5729, 0x23d967bf,
13619 + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
13620 + 0x5a05df1b, 0x2d02ef8d
13624 + * gdlm_hash - hash an array of data
13625 + * @data: the data to be hashed
13626 + * @len: the length of data to be hashed
13628 + * Copied from GFS.
13630 + * Take some data and convert it to a 32-bit hash.
13632 + * The hash function is a 32-bit CRC of the data. The algorithm uses
13633 + * the crc_32_tab table above.
13635 + * This may not be the fastest hash function, but it does a fair bit better
13636 + * at providing uniform results than the others I've looked at. That's
13637 + * really important for efficient directories.
13639 + * Returns: the hash
13642 +uint32_t gdlm_hash(const char *data, int len)
13644 + uint32_t hash = 0xFFFFFFFF;
13646 + for (; len--; data++)
13647 + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
13654 +uint32_t gdlm_next_power2(uint32_t val)
13658 + for (x = 1; x < val; x <<= 1) ;
13663 +void print_lkb(gd_lkb_t *lkb)
13665 + printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d "
13666 + "nodeid=%u lqstate=%x lqflags=%x\n",
13667 + lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status,
13668 + lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid,
13669 + lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags);
13671 diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
13672 --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
13673 +++ linux-patched/cluster/dlm/util.h 2004-06-25 18:31:07.000000000 +0800
13675 +/******************************************************************************
13676 +*******************************************************************************
13678 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13679 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13681 +** This copyrighted material is made available to anyone wishing to use,
13682 +** modify, copy, or redistribute it subject to the terms and conditions
13683 +** of the GNU General Public License v.2.
13685 +*******************************************************************************
13686 +******************************************************************************/
13688 +#ifndef __UTIL_DOT_H__
13689 +#define __UTIL_DOT_H__
13691 +uint32_t gdlm_hash(const char *data, int len);
13692 +uint32_t gdlm_next_power2(uint32_t val);
13694 +void print_lkb(gd_lkb_t *lkb);
13697 diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
13698 --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
13699 +++ linux-patched/include/cluster/dlm.h 2004-06-25 18:31:07.000000000 +0800
13701 +/******************************************************************************
13702 +*******************************************************************************
13704 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13705 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13707 +** This copyrighted material is made available to anyone wishing to use,
13708 +** modify, copy, or redistribute it subject to the terms and conditions
13709 +** of the GNU General Public License v.2.
13711 +*******************************************************************************
13712 +******************************************************************************/
13714 +#ifndef __DLM_DOT_H__
13715 +#define __DLM_DOT_H__
13718 + * Interface to DLM - routines and structures to use DLM lockspaces.
13725 +#define DLM_LOCK_IV (-1) /* invalid */
13726 +#define DLM_LOCK_NL (0) /* null */
13727 +#define DLM_LOCK_CR (1) /* concurrent read */
13728 +#define DLM_LOCK_CW (2) /* concurrent write */
13729 +#define DLM_LOCK_PR (3) /* protected read */
13730 +#define DLM_LOCK_PW (4) /* protected write */
13731 +#define DLM_LOCK_EX (5) /* exclusive */
13734 + * Maximum size in bytes of a dlm_lock name
13737 +#define DLM_RESNAME_MAXLEN (64)
13740 + * Size in bytes of Lock Value Block
13743 +#define DLM_LVB_LEN (32)
13746 + * Flags to dlm_new_lockspace
13748 + * DLM_LSF_NOTIMERS
13750 + * Do not subject locks in this lockspace to time-outs.
13754 +#define DLM_LSF_NOTIMERS (1)
13757 + * Flags to dlm_lock
13759 + * DLM_LKF_NOQUEUE
13761 + * Do not queue the lock request on the wait queue if it cannot be granted
13762 + * immediately. If the lock cannot be granted because of this flag, DLM will
13763 + * either return -EAGAIN from the dlm_lock call or will return 0 from
13764 + * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
13766 + * DLM_LKF_CONVERT
13768 + * Indicates a lock conversion request. For conversions the name and namelen
13769 + * are ignored and the lock ID in the LKSB is used to identify the lock.
13773 + * Requests DLM to return the current contents of the lock value block in the
13774 + * lock status block. When this flag is set in a lock conversion from PW or EX
13775 + * modes, DLM assigns the value specified in the lock status block to the lock
13776 + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
13777 + * containing application-specific information.
13781 + * Force a conversion lock request to the back of the convert queue. All other
13782 + * conversion requests ahead of it must be granted before it can be granted.
13783 + * This enforces a FIFO ordering on the convert queue. When this flag is set,
13784 + * indefinite postponement is averted. This flag is allowed only when
13785 + * converting a lock to a more restrictive mode.
13789 + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
13790 + * previously granted mode.
13792 + * DLM_LKF_IVVALBLK
13794 + * Invalidate/clear the lock value block.
13796 + * DLM_LKF_CONVDEADLK
13798 + * The granted mode of a lock being converted (from a non-NL mode) can be
13799 + * changed to NL in the process of acquiring the requested mode to avoid
13800 + * conversion deadlock.
13802 + * DLM_LKF_PERSISTENT
13804 + * Only relevant to locks originating in userspace. Signals to the ioctl.c code
13805 + * that this lock should not be unlocked when the process exits.
13807 + * DLM_LKF_NODLCKWT
13809 + * This lock is not to be checked for conversion deadlocks.
13811 + * DLM_LKF_NODLCKBLK
13813 + * not yet implemented
13815 + * DLM_LKF_EXPEDITE
13817 + * If this lock conversion cannot be granted immediately it is to go to the
13818 + * head of the conversion queue regardless of its requested lock mode.
13820 + * DLM_LKF_NOQUEUEBAST
13822 + * Send blocking AST's before returning -EAGAIN to the caller. It is only
13823 + * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
13824 + * NOQUEUE requests otherwise.
13828 +#define DLM_LKF_NOQUEUE (0x00000001)
13829 +#define DLM_LKF_CANCEL (0x00000002)
13830 +#define DLM_LKF_CONVERT (0x00000004)
13831 +#define DLM_LKF_VALBLK (0x00000008)
13832 +#define DLM_LKF_QUECVT (0x00000010)
13833 +#define DLM_LKF_IVVALBLK (0x00000020)
13834 +#define DLM_LKF_CONVDEADLK (0x00000040)
13835 +#define DLM_LKF_PERSISTENT (0x00000080)
13836 +#define DLM_LKF_NODLCKWT (0x00000100)
13837 +#define DLM_LKF_NODLCKBLK (0x00000200)
13838 +#define DLM_LKF_EXPEDITE (0x00000400)
13839 +#define DLM_LKF_NOQUEUEBAST (0x00000800)
13842 + * Some return codes that are not in errno.h
13845 +#define DLM_ECANCEL (0x10001)
13846 +#define DLM_EUNLOCK (0x10002)
13848 +typedef void dlm_lockspace_t;
13851 + * Lock range structure
13854 +struct dlm_range {
13855 + uint64_t ra_start;
13860 + * Lock status block
13862 + * Use this structure to specify the contents of the lock value block. For a
13863 + * conversion request, this structure is used to specify the lock ID of the
13864 + * lock. DLM writes the status of the lock request and the lock ID assigned
13865 + * to the request in the lock status block.
13867 + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
13868 + * It is available when dlm_lock returns.
13870 + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
13871 + * shown for the DLM_LKF_VALBLK flag.
13873 + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
13874 + * it was first demoted to NL to avoid conversion deadlock.
13876 + * sb_status: the returned status of the lock request set prior to AST
13877 + * execution. Possible return values:
13879 + * 0 if lock request was successful
13880 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
13881 + * -ENOMEM if there is no memory to process request
13882 + * -EINVAL if there are invalid parameters
13883 + * -DLM_EUNLOCK if unlock request was successful
13887 +#define DLM_SBF_DEMOTED (0x01)
13891 + uint32_t sb_lkid;
13893 + char * sb_lvbptr;
13897 + * These defines are the bits that make up the
13901 +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
13902 + * dlm.h. Ignored for DLM_QUERY_LOCKS_ALL */
13903 +#define DLM_LOCK_THIS 0x0007
13904 +#define DLM_QUERY_MODE_MASK 0x0007
13906 +/* Bits 3, 4, 5 bitmap of queue(s) to query */
13907 +#define DLM_QUERY_QUEUE_WAIT 0x0008
13908 +#define DLM_QUERY_QUEUE_CONVERT 0x0010
13909 +#define DLM_QUERY_QUEUE_GRANT 0x0020
13910 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
13911 +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
13913 +/* Bit 6, Return only the information that can be established without a network
13914 + * round-trip. The caller must be aware of the implications of this. Useful for
13915 + * just getting the master node id or resource name. */
13916 +#define DLM_QUERY_LOCAL 0x0040
13918 +/* Bits 8 up, query type */
13919 +#define DLM_QUERY_LOCKS_HIGHER 0x0100
13920 +#define DLM_QUERY_LOCKS_LOWER 0x0200
13921 +#define DLM_QUERY_LOCKS_EQUAL 0x0300
13922 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400
13923 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
13924 +#define DLM_QUERY_LOCKS_ALL 0x0600
13925 +#define DLM_QUERY_MASK 0x0F00
13927 +/* GRMODE is the default for mode comparisons,
13928 + RQMODE might also be handy */
13929 +#define DLM_QUERY_GRMODE 0x0000
13930 +#define DLM_QUERY_RQMODE 0x1000
13932 +/* Structures passed into and out of the query */
13934 +struct dlm_lockinfo {
13935 + int lki_lkid; /* Lock ID on originating node */
13936 + int lki_mstlkid; /* Lock ID on master node */
13938 + int lki_node; /* Originating node (not master) */
13939 + uint8_t lki_state; /* Queue the lock is on */
13940 + uint8_t lki_grmode; /* Granted mode */
13941 + uint8_t lki_rqmode; /* Requested mode */
13942 + struct dlm_range lki_grrange; /* Granted range, if applicable */
13943 + struct dlm_range lki_rqrange; /* Requested range, if applicable */
13946 +struct dlm_resinfo {
13948 + int rsi_grantcount; /* No. of nodes on grant queue */
13949 + int rsi_convcount; /* No. of nodes on convert queue */
13950 + int rsi_waitcount; /* No. of nodes on wait queue */
13951 + int rsi_masternode; /* Master for this resource */
13952 + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
13953 + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable
13957 +struct dlm_queryinfo {
13958 + struct dlm_resinfo *gqi_resinfo;
13959 + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
13961 + int gqi_locksize; /* input */
13962 + int gqi_lockcount; /* output */
13969 + * Starts and initializes DLM threads and structures. Creation of the first
13970 + * lockspace will call this if it has not been called already.
13972 + * Returns: 0 if successful, -EXXX on error
13975 +int dlm_init(void);
13980 + * Stops DLM threads.
13982 + * Returns: 0 if successful, -EXXX on error
13985 +int dlm_release(void);
13988 + * dlm_new_lockspace
13990 + * Starts a lockspace with the given name. If the named lockspace exists in
13991 + * the cluster, the calling node joins it.
13994 +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
13998 + * dlm_release_lockspace
14000 + * Stop a lockspace.
14003 +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14008 + * Make an asynchronous request to acquire or convert a lock on a named
14011 + * lockspace: context for the request
14012 + * mode: the requested mode of the lock (DLM_LOCK_)
14013 + * lksb: lock status block for input and async return values
14014 + * flags: input flags (DLM_LKF_)
14015 + * name: name of the resource to lock, can be binary
14016 + * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN)
14017 + * parent: the lock ID of a parent lock or 0 if none
14018 + * lockast: function DLM executes when it completes processing the request
14019 + * astarg: argument passed to lockast and bast functions
14020 + * bast: function DLM executes when this lock later blocks another request
14023 + * 0 if request is successfully queued for processing
14024 + * -EINVAL if any input parameters are invalid
14025 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14026 + * -ENOMEM if there is no memory to process request
14027 + * -ENOTCONN if there is a communication error
14029 + * If the call to dlm_lock returns an error then the operation has failed and
14030 + * the AST routine will not be called. If dlm_lock returns 0 it is still
14031 + * possible that the lock operation will fail. The AST routine will be called
14032 + * when the locking is complete and the status is returned in the lksb.
14034 + * If the AST routines or parameter are passed to a conversion operation then
14035 + * they will overwrite those values that were passed to a previous dlm_lock
14038 + * AST routines should not block (at least not for long), but may make
14039 + * any locking calls they please.
14042 +int dlm_lock(dlm_lockspace_t *lockspace,
14044 + struct dlm_lksb *lksb,
14047 + unsigned int namelen,
14049 + void (*lockast) (void *astarg),
14051 + void (*bast) (void *astarg, int mode),
14052 + struct dlm_range *range);
14057 + * Asynchronously release a lock on a resource. The AST routine is called
14058 + * when the resource is successfully unlocked.
14060 + * lockspace: context for the request
14061 + * lkid: the lock ID as returned in the lksb
14062 + * flags: input flags (DLM_LKF_)
14063 + * lksb: if NULL the lksb parameter passed to last lock request is used
14064 + * astarg: if NULL, astarg in last lock request is used
14067 + * 0 if request is successfully queued for processing
14068 + * -EINVAL if any input parameters are invalid
14069 + * -ENOTEMPTY if the lock still has sublocks
14070 + * -EBUSY if the lock is waiting for a remote lock operation
14071 + * -ENOTCONN if there is a communication error
14074 +extern int dlm_unlock(dlm_lockspace_t *lockspace,
14077 + struct dlm_lksb *lksb,
14080 +/* Query interface
14082 + * Query the other holders of a resource, given a known lock ID
14084 + * lockspace: context for the request
14085 + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
14086 + * on the resource. sb_status will contain the status
14087 + * of the request on completion.
14088 + * query: query bitmap see DLM_QUERY_* above
14089 + * qinfo: pointer to dlm_queryinfo structure
14090 + * ast_routine: AST routine to call on completion
14091 + * astarg: argument to AST routine. It is "traditional"
14092 + * to put the qinfo pointer into lksb->sb_lvbptr
14093 + * and pass the lksb in here.
14095 +extern int dlm_query(dlm_lockspace_t *lockspace,
14096 + struct dlm_lksb *lksb,
14098 + struct dlm_queryinfo *qinfo,
14099 + void (ast_routine(void *)),
14102 +#endif /* __KERNEL__ */
14104 +#endif /* __DLM_DOT_H__ */
14105 diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
14106 --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
14107 +++ linux-patched/include/cluster/dlm_device.h 2004-06-25 18:31:07.000000000 +0800
14109 +/******************************************************************************
14110 +*******************************************************************************
14112 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14113 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14115 +** This copyrighted material is made available to anyone wishing to use,
14116 +** modify, copy, or redistribute it subject to the terms and conditions
14117 +** of the GNU General Public License v.2.
14119 +*******************************************************************************
14120 +******************************************************************************/
14122 +/* This is the device interface for dlm, most users will use a library
14126 +/* Version of the device interface */
14127 +#define DLM_DEVICE_VERSION_MAJOR 2
14128 +#define DLM_DEVICE_VERSION_MINOR 0
14129 +#define DLM_DEVICE_VERSION_PATCH 0
14131 +/* struct passed to the lock write */
14132 +struct dlm_lock_params {
14133 + uint32_t version[3];
14139 + struct dlm_range range;
14144 + struct dlm_lksb *lksb;
14149 +/* struct read from the "device" fd,
14150 + consists mainly of userspace pointers for the library to use */
14151 +struct dlm_lock_result {
14154 + void (*astaddr)(void *astparam);
14155 + struct dlm_lksb *user_lksb;
14156 + struct dlm_lksb lksb; /* But this has real data in it */
14157 + uint8_t bast_mode; /* Not yet used */
14160 +/* commands passed to the device */
14161 +#define DLM_USER_LOCK 1
14162 +#define DLM_USER_UNLOCK 2
14163 +#define DLM_USER_QUERY 3
14165 +/* Arbitrary length restriction */
14166 +#define MAX_LS_NAME_LEN 64
14168 +/* ioctls on the device */
14169 +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
14170 +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
14171 +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)