4bf12011 1# Add DLM to the build system
2diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
3--- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
4+++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
5@@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
8
9+config CLUSTER_DLM
10+ tristate "Distributed Lock Manager"
11+ depends on CLUSTER
12+ ---help---
13+ A fully distributed lock manager, providing cluster-wide locking services
14+ and protected lock namespaces for kernel and userland applications.
15+
16+config CLUSTER_DLM_PROCLOCKS
17+ boolean "/proc/locks support for DLM"
18+ depends on CLUSTER_DLM
19+ depends on PROC_FS
20+ ---help---
21+	  If this option is enabled, a file will appear at /proc/cluster/dlm_locks.
22+	  Write into this "file" the name of a lockspace known to the DLM and then
23+	  read out a list of all the resources and locks in that lockspace that are
24+	  known to the local node. Note that because the DLM is distributed, this
25+	  may not be the full lock picture.
26+
27 endmenu
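
The /proc/cluster/dlm_locks interface described in the help text above is driven by writing a lockspace name and then reading the dump back. The following is only a rough userspace sketch of that workflow: the lockspace name "mylockspace" is an example, and the exact read/write semantics belong to proc.c later in this patch, so treat it as an illustration rather than a tested client.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/proc/cluster/dlm_locks", O_RDWR);

	if (fd < 0) {
		perror("open /proc/cluster/dlm_locks");
		return 1;
	}

	/* select the lockspace to dump; the name is only an example */
	if (write(fd, "mylockspace", strlen("mylockspace")) < 0) {
		perror("write lockspace name");
		return 1;
	}

	/* read back this node's view of that lockspace */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(fd);
	return 0;
}
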
28diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
29--- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
30+++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
31@@ -1,3 +1,4 @@
32 obj-y := nocluster.o
33
34 obj-$(CONFIG_CLUSTER) += cman/
35+obj-$(CONFIG_CLUSTER_DLM) += dlm/
36diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37--- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38+++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
39@@ -0,0 +1,23 @@
40+dlm-objs := ast.o \
41+ config.o \
42+ device.o \
43+ dir.o \
44+ lkb.o \
45+ locking.o \
46+ lockqueue.o \
47+ lockspace.o \
48+ lowcomms.o \
49+ main.o \
50+ memory.o \
51+ midcomms.o \
52+ nodes.o \
53+ proc.o \
54+ queries.o \
55+ rebuild.o \
56+ reccomms.o \
57+ recover.o \
58+ recoverd.o \
59+ rsb.o \
60+ util.o \
61+
62+obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64--- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65+++ linux-patched/cluster/dlm/ast.c 2004-06-29 20:01:19.000000000 +0800
66@@ -0,0 +1,560 @@
4bf12011 67+/******************************************************************************
68+*******************************************************************************
69+**
70+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
72+**
73+** This copyrighted material is made available to anyone wishing to use,
74+** modify, copy, or redistribute it subject to the terms and conditions
75+** of the GNU General Public License v.2.
76+**
77+*******************************************************************************
78+******************************************************************************/
79+
80+/*
81+ * This delivers ASTs and checks for dead remote requests and deadlocks.
82+ */
83+
84+#include <linux/timer.h>
85+
86+#include "dlm_internal.h"
87+#include "rsb.h"
88+#include "lockqueue.h"
89+#include "dir.h"
90+#include "locking.h"
91+#include "lkb.h"
92+#include "lowcomms.h"
93+#include "midcomms.h"
94+#include "ast.h"
95+#include "nodes.h"
96+#include "config.h"
97+
98+/* Wake up flags for astd */
99+#define GDLMD_WAKE_ASTS 1
100+#define GDLMD_WAKE_TIMER 2
101+
102+static struct list_head _deadlockqueue;
103+static struct semaphore _deadlockqueue_lock;
104+static struct list_head _lockqueue;
105+static struct semaphore _lockqueue_lock;
106+static struct timer_list _lockqueue_timer;
107+static struct list_head _ast_queue;
108+static struct semaphore _ast_queue_lock;
109+static wait_queue_head_t _astd_waitchan;
110+static atomic_t _astd_running;
111+static long _astd_pid;
112+static unsigned long _astd_wakeflags;
113+static struct completion _astd_done;
114+
115+void add_to_lockqueue(gd_lkb_t *lkb)
116+{
117+ /* Time stamp the entry so we know if it's been waiting too long */
118+ lkb->lkb_lockqueue_time = jiffies;
119+
120+ down(&_lockqueue_lock);
121+ list_add(&lkb->lkb_lockqueue, &_lockqueue);
122+ up(&_lockqueue_lock);
123+}
124+
125+void remove_from_lockqueue(gd_lkb_t *lkb)
126+{
127+ down(&_lockqueue_lock);
128+ list_del(&lkb->lkb_lockqueue);
129+ up(&_lockqueue_lock);
130+}
131+
132+void add_to_deadlockqueue(gd_lkb_t *lkb)
133+{
134+ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
135+ return;
136+ lkb->lkb_duetime = jiffies;
137+ down(&_deadlockqueue_lock);
138+ list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
139+ up(&_deadlockqueue_lock);
140+}
141+
142+void remove_from_deadlockqueue(gd_lkb_t *lkb)
143+{
144+ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
145+ return;
146+
147+ down(&_deadlockqueue_lock);
148+ list_del(&lkb->lkb_deadlockq);
149+ up(&_deadlockqueue_lock);
150+
151+ /* Invalidate the due time */
152+ memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
153+}
154+
4bf12011 155+/*
5cdbd17b 156+ * deliver an AST to a user
4bf12011 157+ */
158+
5cdbd17b 159+static void deliver_ast(gd_lkb_t *lkb, uint16_t ast_type)
4bf12011 160+{
161+ void (*cast) (long param) = lkb->lkb_astaddr;
162+ void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
163+
164+ if (ast_type == AST_BAST) {
165+ if (!bast)
166+ return;
167+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
168+ return;
4bf12011 169+ bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
170+ } else {
171+ if (!cast)
172+ return;
173+ cast(lkb->lkb_astparam);
4bf12011 174+ }
4bf12011 175+}
176+
177+/*
178+ * Queue an AST for delivery. This only deals with
179+ * kernel ASTs; the usermode API will piggyback on top of this.
180+ *
181+ * This can be called in either the user or DLM context.
182+ * ASTs are queued EVEN IF we are already running in gdlm_astd
183+ * context, as we don't know what other locks are held (e.g. we
184+ * could be called from a lock operation that was itself called
185+ * from another AST).
186+ * If the AST is to be queued remotely then a message is sent to
187+ * the target system via midcomms.
188+ */
189+
5cdbd17b 190+void queue_ast(gd_lkb_t *lkb, uint16_t flags, uint8_t rqmode)
4bf12011 191+{
192+ struct gd_remlockrequest req;
193+
194+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
195+ /*
196+ * Send a message to have an ast queued remotely. Note: we do
197+ * not send remote completion asts, they are handled as part of
198+ * remote lock granting.
199+ */
5cdbd17b 200+ if (flags & AST_BAST) {
4bf12011 201+ req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
202+ req.rr_header.rh_length = sizeof(req);
203+ req.rr_header.rh_flags = 0;
204+ req.rr_header.rh_lkid = lkb->lkb_id;
205+ req.rr_header.rh_lockspace =
206+ lkb->lkb_resource->res_ls->ls_global_id;
207+ req.rr_status = lkb->lkb_retstatus;
208+ req.rr_remlkid = lkb->lkb_remid;
209+ req.rr_rqmode = rqmode;
210+
211+ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
5cdbd17b 212+ lkb->lkb_resource->res_ls->ls_allocation);
4bf12011 213+ } else if (lkb->lkb_retstatus == -EDEADLOCK) {
214+ /*
215+ * We only queue remote Completion ASTs here for error
216+ * completions that happen out of band.
217+ * DEADLOCK is one such.
218+ */
4bf12011 219+ req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
220+ req.rr_header.rh_length = sizeof(req);
221+ req.rr_header.rh_flags = 0;
222+ req.rr_header.rh_lkid = lkb->lkb_id;
223+ req.rr_header.rh_lockspace =
224+ lkb->lkb_resource->res_ls->ls_global_id;
225+ req.rr_status = lkb->lkb_retstatus;
226+ req.rr_remlkid = lkb->lkb_remid;
227+ req.rr_rqmode = rqmode;
228+
229+ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
5cdbd17b 230+ lkb->lkb_resource->res_ls->ls_allocation);
4bf12011 231+ }
232+ } else {
233+ /*
5cdbd17b 234+ * Prepare info that will be returned in ast/bast.
4bf12011 235+ */
236+
5cdbd17b 237+ if (flags & AST_BAST) {
4bf12011 238+ lkb->lkb_bastmode = rqmode;
239+ } else {
240+ lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
241+
242+ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
243+ lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
244+ else
245+ lkb->lkb_lksb->sb_flags = 0;
246+ }
247+
4bf12011 248+ down(&_ast_queue_lock);
249+ if (lkb->lkb_astflags & AST_DEL)
250+ log_print("queue_ast on deleted lkb %x ast %x pid %u",
251+ lkb->lkb_id, lkb->lkb_astflags, current->pid);
252+ if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
4bf12011 253+ list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
5cdbd17b 254+ lkb->lkb_astflags |= flags;
4bf12011 255+ up(&_ast_queue_lock);
256+
257+ /* It is the responsibility of the caller to call wake_astd()
258+	 * after it has finished any other locking operations, so that
259+	 * the ASTs queued here are delivered afterwards */
260+ }
261+}
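
The comment above leaves AST delivery to the caller. A minimal hypothetical sketch of that contract is shown here; example_grant_and_notify is not part of the patch and only illustrates the queue-then-wake pattern.

static void example_grant_and_notify(gd_lkb_t *lkb)
{
	lkb->lkb_retstatus = 0;
	queue_ast(lkb, AST_COMP, 0);	/* only queues; nothing is delivered yet */

	/* ... any further locking work for this operation goes here ... */

	wake_astd();			/* now let dlm_astd drain the AST queue */
}
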
262+
263+/*
5cdbd17b 264+ * Process any LKBs on the AST queue.
4bf12011 265+ */
266+
267+static void process_asts(void)
268+{
269+ gd_lkb_t *lkb;
270+ uint16_t flags;
4bf12011 271+
272+ for (;;) {
273+ down(&_ast_queue_lock);
274+ if (list_empty(&_ast_queue)) {
275+ up(&_ast_queue_lock);
276+ break;
277+ }
278+
279+ lkb = list_entry(_ast_queue.next, gd_lkb_t, lkb_astqueue);
280+ list_del(&lkb->lkb_astqueue);
281+ flags = lkb->lkb_astflags;
282+ lkb->lkb_astflags = 0;
283+ up(&_ast_queue_lock);
4bf12011 284+
285+ if (flags & AST_COMP)
286+ deliver_ast(lkb, AST_COMP);
4bf12011 287+
288+ if (flags & AST_BAST) {
289+ if (flags & AST_DEL)
290+ log_print("skip bast on %x", lkb->lkb_id);
291+ else
292+ deliver_ast(lkb, AST_BAST);
293+ }
4bf12011 294+
295+ if (flags & AST_DEL) {
296+ gd_res_t *rsb = lkb->lkb_resource;
297+ gd_ls_t *ls = rsb->res_ls;
4bf12011 298+
299+ GDLM_ASSERT(lkb->lkb_astflags == 0,
300+ printk("%x %x\n", lkb->lkb_id, lkb->lkb_astflags););
4bf12011 301+
302+ down_read(&ls->ls_in_recovery);
303+ release_lkb(ls, lkb);
304+ release_rsb(rsb);
305+ up_read(&ls->ls_in_recovery);
306+ }
307+
308+ schedule();
4bf12011 309+ }
4bf12011 310+}
311+
312+void lockqueue_lkb_mark(gd_ls_t *ls)
313+{
314+ gd_lkb_t *lkb, *safe;
315+ int count = 0;
316+
317+ log_all(ls, "mark waiting requests");
318+
319+ down(&_lockqueue_lock);
320+
321+ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
322+
323+ if (lkb->lkb_resource->res_ls != ls)
324+ continue;
325+
326+ /*
327+ * These lkb's are new and the master is being looked up. Mark
328+ * the lkb request to be resent. Even if the destination node
329+ * for the request is still living and has our request, it will
330+ * purge all resdir requests in purge_requestqueue. If there's
331+ * a reply to the LOOKUP request in our requestqueue (the reply
332+ * arrived after ls_stop), it is invalid and will be discarded
333+ * in purge_requestqueue, too.
334+ */
335+
336+ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
337+ GDLM_ASSERT(lkb->lkb_nodeid == -1,
338+ log_error(ls, "nodeid=%d\n",
339+ lkb->lkb_nodeid););
340+
341+ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
342+ count++;
343+ continue;
344+ }
345+
346+ /*
347+ * These lkb's have an outstanding request to a bygone node.
348+ * The request will be redirected to the new master node in
349+ * resend_cluster_requests(). Don't mark the request for
350+ * resending if there's a reply for it saved in the
351+ * requestqueue.
352+ */
353+
354+ if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
355+ !reply_in_requestqueue(ls, lkb->lkb_id)) {
356+
357+ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
358+
359+ /*
360+ * Don't rebuild this lkb on a new rsb in
361+ * rebuild_rsbs_send().
362+ */
363+
364+ if (lkb->lkb_lockqueue_state ==
365+ GDLM_LQSTATE_WAIT_CONDGRANT) {
366+ GDLM_ASSERT(lkb->lkb_status ==
367+ GDLM_LKSTS_WAITING, );
368+ lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
369+ }
370+
371+ /*
372+ * This flag indicates to the new master that his lkb
373+ * is in the midst of a convert request and should be
374+ * placed on the granted queue rather than the convert
375+ * queue. We will resend this convert request to the
376+ * new master.
377+ */
378+
379+ else if (lkb->lkb_lockqueue_state ==
380+ GDLM_LQSTATE_WAIT_CONVERT) {
381+ GDLM_ASSERT(lkb->lkb_status ==
382+ GDLM_LKSTS_CONVERT, );
383+ lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
384+ }
385+
386+ count++;
387+ }
388+ }
389+ up(&_lockqueue_lock);
390+
391+ log_all(ls, "marked %d requests", count);
392+}
393+
394+int resend_cluster_requests(gd_ls_t *ls)
395+{
396+ gd_lkb_t *lkb, *safe;
397+ int error = 0, state, count = 0;
398+
399+ log_all(ls, "resend marked requests");
400+
401+ down(&_lockqueue_lock);
402+
403+ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
404+
405+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
406+ log_debug(ls, "resend_cluster_requests: aborted");
407+ error = -EINTR;
408+ break;
409+ }
410+
411+ if (lkb->lkb_resource->res_ls != ls)
412+ continue;
413+
414+ log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
415+ "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
416+ lkb->lkb_lockqueue_state, lkb->lkb_flags);
417+
418+ /*
419+		 * Resend/process the lockqueue lkb's (in-progress requests)
420+ * that were flagged at the start of recovery in
421+ * lockqueue_lkb_mark().
422+ */
423+
424+ if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
425+ lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
426+ lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
427+ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
428+
429+ if (lkb->lkb_nodeid == -1) {
430+ /*
431+ * Send lookup to new resdir node.
432+ */
433+ lkb->lkb_lockqueue_time = jiffies;
434+ send_cluster_request(lkb,
435+ lkb->lkb_lockqueue_state);
436+ }
437+
438+ else if (lkb->lkb_nodeid != 0) {
439+ /*
440+ * There's a new RSB master (that's not us.)
441+ */
442+ lkb->lkb_lockqueue_time = jiffies;
443+ send_cluster_request(lkb,
444+ lkb->lkb_lockqueue_state);
445+ }
446+
447+ else {
448+ /*
449+ * We are the new RSB master for this lkb
450+ * request.
451+ */
452+ state = lkb->lkb_lockqueue_state;
453+ lkb->lkb_lockqueue_state = 0;
454+ /* list_del equals remove_from_lockqueue() */
455+ list_del(&lkb->lkb_lockqueue);
456+ process_remastered_lkb(lkb, state);
457+ }
458+
459+ count++;
460+ }
461+ }
462+ up(&_lockqueue_lock);
463+
464+ log_all(ls, "resent %d requests", count);
465+ return error;
466+}
467+
468+/*
469+ * Process any LKBs on the lock queue. This just looks at the
470+ * entries to see if they have been on the queue too long and
471+ * fails the requests if so.
472+ */
473+
474+static void process_lockqueue(void)
475+{
476+ gd_lkb_t *lkb, *safe;
477+ gd_ls_t *ls;
478+ int count = 0;
479+
480+ down(&_lockqueue_lock);
481+
482+ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
483+ ls = lkb->lkb_resource->res_ls;
484+
485+ if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
486+ continue;
487+
488+ /* Don't time out locks that are in transition */
489+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
490+ continue;
491+
492+ if (check_timeout(lkb->lkb_lockqueue_time,
493+ dlm_config.lock_timeout)) {
494+ count++;
495+ list_del(&lkb->lkb_lockqueue);
496+ up(&_lockqueue_lock);
497+ cancel_lockop(lkb, -ETIMEDOUT);
498+ down(&_lockqueue_lock);
499+ }
500+ }
501+ up(&_lockqueue_lock);
502+
503+ if (count)
504+ wake_astd();
505+
506+ if (atomic_read(&_astd_running))
507+ mod_timer(&_lockqueue_timer,
508+ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
509+}
510+
511+/* Look for deadlocks */
512+static void process_deadlockqueue(void)
513+{
514+ gd_lkb_t *lkb, *safe;
515+
516+ down(&_deadlockqueue_lock);
517+
518+ list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
519+ gd_lkb_t *kill_lkb;
520+
521+ /* Only look at "due" locks */
522+ if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
523+ break;
524+
525+ /* Don't look at locks that are in transition */
526+ if (!test_bit(LSFL_LS_RUN,
527+ &lkb->lkb_resource->res_ls->ls_flags))
528+ continue;
529+
530+ up(&_deadlockqueue_lock);
531+
532+ /* Lock has hit due time, check for conversion deadlock */
533+ kill_lkb = conversion_deadlock_check(lkb);
534+ if (kill_lkb)
535+ cancel_conversion(kill_lkb, -EDEADLOCK);
536+
537+ down(&_deadlockqueue_lock);
538+ }
539+ up(&_deadlockqueue_lock);
540+}
541+
542+static __inline__ int no_asts(void)
543+{
544+ int ret;
545+
546+ down(&_ast_queue_lock);
547+ ret = list_empty(&_ast_queue);
548+ up(&_ast_queue_lock);
549+ return ret;
550+}
551+
552+static void lockqueue_timer_fn(unsigned long arg)
553+{
554+ set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
555+ wake_up(&_astd_waitchan);
556+}
557+
558+/*
559+ * DLM daemon which delivers asts.
560+ */
561+
562+static int dlm_astd(void *data)
563+{
564+ daemonize("dlm_astd");
565+
566+ INIT_LIST_HEAD(&_lockqueue);
567+ init_MUTEX(&_lockqueue_lock);
568+ INIT_LIST_HEAD(&_deadlockqueue);
569+ init_MUTEX(&_deadlockqueue_lock);
570+ INIT_LIST_HEAD(&_ast_queue);
571+ init_MUTEX(&_ast_queue_lock);
572+ init_waitqueue_head(&_astd_waitchan);
573+ complete(&_astd_done);
574+
575+ /*
576+ * Set a timer to check the lockqueue for dead locks (and deadlocks).
577+ */
578+
579+ init_timer(&_lockqueue_timer);
580+ _lockqueue_timer.function = lockqueue_timer_fn;
581+ _lockqueue_timer.data = 0;
582+ mod_timer(&_lockqueue_timer,
583+ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
584+
585+ while (atomic_read(&_astd_running)) {
586+ wchan_cond_sleep_intr(_astd_waitchan, no_asts());
587+
588+ if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
589+ process_asts();
590+
591+ if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
592+ process_lockqueue();
593+ if (dlm_config.deadlocktime)
594+ process_deadlockqueue();
595+ }
596+ }
597+
598+ if (timer_pending(&_lockqueue_timer))
599+ del_timer(&_lockqueue_timer);
600+
601+ complete(&_astd_done);
602+
603+ return 0;
604+}
605+
606+void wake_astd(void)
607+{
608+ set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
609+ wake_up(&_astd_waitchan);
610+}
611+
612+int astd_start()
613+{
614+ init_completion(&_astd_done);
615+ atomic_set(&_astd_running, 1);
616+ _astd_pid = kernel_thread(dlm_astd, NULL, 0);
617+ wait_for_completion(&_astd_done);
618+ return 0;
619+}
620+
621+void astd_stop()
622+{
623+ atomic_set(&_astd_running, 0);
624+ wake_astd();
625+ wait_for_completion(&_astd_done);
626+}
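
astd_start() and astd_stop() bracket the life of the dlm_astd thread: start forks it with kernel_thread() and blocks on _astd_done until the thread has initialised, while stop clears _astd_running, wakes the thread and waits for it to exit. The real calls live in main.c later in this patch; the sketch below only illustrates that start/stop pairing and is not part of the patch itself.

static int __init example_dlm_init(void)
{
	/* must run before any lockspace can queue ASTs */
	return astd_start();
}

static void __exit example_dlm_exit(void)
{
	/* wakes dlm_astd and waits for it to finish */
	astd_stop();
}
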
627diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
628--- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
629+++ linux-patched/cluster/dlm/ast.h 2004-06-29 20:01:19.000000000 +0800
630@@ -0,0 +1,28 @@
4bf12011 631+/******************************************************************************
632+*******************************************************************************
633+**
634+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
635+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
636+**
637+** This copyrighted material is made available to anyone wishing to use,
638+** modify, copy, or redistribute it subject to the terms and conditions
639+** of the GNU General Public License v.2.
640+**
641+*******************************************************************************
642+******************************************************************************/
643+
644+#ifndef __AST_DOT_H__
645+#define __AST_DOT_H__
646+
647+void lockqueue_lkb_mark(gd_ls_t *ls);
648+int resend_cluster_requests(gd_ls_t *ls);
649+void add_to_lockqueue(gd_lkb_t *lkb);
650+void remove_from_lockqueue(gd_lkb_t *lkb);
651+void add_to_deadlockqueue(gd_lkb_t *lkb);
652+void remove_from_deadlockqueue(gd_lkb_t *lkb);
653+void queue_ast(gd_lkb_t *lkb, uint16_t astflags, uint8_t rqmode);
4bf12011 654+void wake_astd(void);
655+int astd_start(void);
656+void astd_stop(void);
657+
658+#endif /* __AST_DOT_H__ */
659diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
660--- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 661+++ linux-patched/cluster/dlm/config.c 2004-06-29 20:01:19.000000000 +0800
4bf12011 662@@ -0,0 +1,125 @@
663+/******************************************************************************
664+*******************************************************************************
665+**
666+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
667+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
668+**
669+** This copyrighted material is made available to anyone wishing to use,
670+** modify, copy, or redistribute it subject to the terms and conditions
671+** of the GNU General Public License v.2.
672+**
673+*******************************************************************************
674+******************************************************************************/
675+
676+#include <linux/module.h>
677+#include <linux/proc_fs.h>
678+
679+#include "dlm_internal.h"
680+#include "lowcomms.h"
681+#include "config.h"
682+
683+/* Config file defaults */
684+#define DEFAULT_TCP_PORT 21064
685+#define DEFAULT_LOCK_TIMEOUT 30
686+#define DEFAULT_BUFFER_SIZE 4096
687+#define DEFAULT_RESHASHTBL 256
688+#define DEFAULT_LOCKIDTBL 1024
689+#define DEFAULT_MAX_CONNECTIONS 128
690+#define DEFAULT_DEADLOCKTIME 10
691+
692+struct config_info dlm_config = {
693+ .tcp_port = DEFAULT_TCP_PORT,
694+ .lock_timeout = DEFAULT_LOCK_TIMEOUT,
695+ .buffer_size = DEFAULT_BUFFER_SIZE,
696+ .reshashtbl = DEFAULT_RESHASHTBL,
697+ .lockidtbl = DEFAULT_LOCKIDTBL,
698+ .max_connections = DEFAULT_MAX_CONNECTIONS,
699+ .deadlocktime = DEFAULT_DEADLOCKTIME,
700+};
701+
702+
703+static struct config_proc_info {
704+ char *name;
705+ int *value;
706+} config_proc[] = {
707+ {
708+ .name = "tcp_port",
709+ .value = &dlm_config.tcp_port,
710+ },
711+ {
712+ .name = "lock_timeout",
713+ .value = &dlm_config.lock_timeout,
714+ },
715+ {
716+ .name = "buffer_size",
717+ .value = &dlm_config.buffer_size,
718+ },
719+ {
720+ .name = "reshashtbl",
721+ .value = &dlm_config.reshashtbl,
722+ },
723+ {
724+ .name = "lockidtbl",
725+ .value = &dlm_config.lockidtbl,
726+ },
727+ {
728+ .name = "max_connections",
729+ .value = &dlm_config.max_connections,
730+ },
731+ {
732+ .name = "deadlocktime",
733+ .value = &dlm_config.deadlocktime,
734+ },
735+};
736+static struct proc_dir_entry *dlm_dir;
737+
738+static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
739+ int *eof, void *data)
740+{
741+ struct config_proc_info *cinfo = data;
742+ return snprintf(page, count, "%d\n", *cinfo->value);
743+}
744+
745+static int dlm_config_write_proc(struct file *file, const char *buffer,
746+ unsigned long count, void *data)
747+{
748+ struct config_proc_info *cinfo = data;
749+ int value;
750+ char *end;
751+
752+ value = simple_strtoul(buffer, &end, 10);
753+ if (*end)
754+ *cinfo->value = value;
755+ return count;
756+}
757+
758+int dlm_config_init(void)
759+{
760+ int i;
761+ struct proc_dir_entry *pde;
762+
763+ dlm_dir = proc_mkdir("cluster/config/dlm", 0);
764+ if (!dlm_dir)
765+ return -1;
766+
767+ dlm_dir->owner = THIS_MODULE;
768+
769+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
770+ pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
771+ if (pde) {
772+ pde->data = &config_proc[i];
773+ pde->write_proc = dlm_config_write_proc;
774+ pde->read_proc = dlm_config_read_proc;
775+ }
776+ }
777+ return 0;
778+}
779+
780+void dlm_config_exit(void)
781+{
782+ int i;
783+
784+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
785+ remove_proc_entry(config_proc[i].name, dlm_dir);
786+ remove_proc_entry("cluster/config/dlm", NULL);
787+}
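
dlm_config_init() above exposes each entry of config_proc[] as a writable file under /proc/cluster/config/dlm/. A rough userspace sketch of reading and adjusting one tunable follows; the value 60 is only an example, and note that as the write handler stands, the value is only stored when something (such as a newline) follows the digits.

#include <stdio.h>

int main(void)
{
	int timeout = 0;
	FILE *f;

	f = fopen("/proc/cluster/config/dlm/lock_timeout", "r");
	if (f) {
		if (fscanf(f, "%d", &timeout) == 1)
			printf("current lock_timeout: %d\n", timeout);
		fclose(f);
	}

	f = fopen("/proc/cluster/config/dlm/lock_timeout", "w");
	if (f) {
		fprintf(f, "60\n");	/* example value; the newline matters */
		fclose(f);
	}
	return 0;
}
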
788diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
789--- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 790+++ linux-patched/cluster/dlm/config.h 2004-06-29 20:01:19.000000000 +0800
4bf12011 791@@ -0,0 +1,31 @@
792+/******************************************************************************
793+*******************************************************************************
794+**
795+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
796+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
797+**
798+** This copyrighted material is made available to anyone wishing to use,
799+** modify, copy, or redistribute it subject to the terms and conditions
800+** of the GNU General Public License v.2.
801+**
802+*******************************************************************************
803+******************************************************************************/
804+
805+#ifndef __CONFIG_DOT_H__
806+#define __CONFIG_DOT_H__
807+
808+struct config_info {
809+ int tcp_port;
810+ int lock_timeout;
811+ int buffer_size;
812+ int reshashtbl;
813+ int lockidtbl;
814+ int max_connections;
815+ int deadlocktime;
816+};
817+
818+extern struct config_info dlm_config;
819+extern int dlm_config_init(void);
820+extern void dlm_config_exit(void);
821+
822+#endif /* __CONFIG_DOT_H__ */
823diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
824--- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 825+++ linux-patched/cluster/dlm/device.c 2004-06-29 20:01:19.000000000 +0800
4bf12011 826@@ -0,0 +1,1020 @@
827+/******************************************************************************
828+*******************************************************************************
829+**
830+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
831+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
832+**
833+** This copyrighted material is made available to anyone wishing to use,
834+** modify, copy, or redistribute it subject to the terms and conditions
835+** of the GNU General Public License v.2.
836+**
837+*******************************************************************************
838+******************************************************************************/
839+
840+/*
841+ * device.c
842+ *
843+ * This is the userland interface to the DLM.
844+ *
845+ * The locking is done via a misc char device (find the
846+ * registered minor number in /proc/misc).
847+ *
848+ * User code should not use this interface directly but
849+ * call the library routines in libdlm.a instead.
850+ *
851+ */
852+
853+#include <linux/miscdevice.h>
854+#include <linux/init.h>
855+#include <linux/wait.h>
856+#include <linux/module.h>
857+#include <linux/file.h>
858+#include <linux/fs.h>
859+#include <linux/poll.h>
860+#include <linux/signal.h>
861+#include <linux/spinlock.h>
862+#include <asm/ioctls.h>
863+
864+#include "dlm_internal.h"
865+#include "device.h"
866+
867+extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int);
868+static struct file_operations _dlm_fops;
869+static const char *name_prefix="dlm";
870+static struct list_head user_ls_list;
871+
872+/* Flags in li_flags */
873+#define LI_FLAG_COMPLETE 1
874+#define LI_FLAG_FIRSTLOCK 2
875+
876+struct lock_info {
877+ uint8_t li_cmd;
878+ struct dlm_lksb li_lksb;
879+ wait_queue_head_t li_waitq;
880+ unsigned long li_flags;
881+ void __user *li_astparam;
882+ void __user *li_astaddr;
883+ void __user *li_bastaddr;
884+ struct file_info *li_file;
885+ struct dlm_lksb __user *li_user_lksb;
886+ struct semaphore li_firstlock;
887+ struct dlm_queryinfo *li_queryinfo;
888+ struct dlm_queryinfo __user *li_user_queryinfo;
889+};
890+
891+/* A queued AST no less */
892+struct ast_info {
893+ struct dlm_lock_result result;
894+ struct dlm_queryinfo *queryinfo;
895+ struct dlm_queryinfo __user *user_queryinfo;
896+ struct list_head list;
897+};
898+
899+/* One of these per userland lockspace */
900+struct user_ls {
901+ void *ls_lockspace;
902+ atomic_t ls_refcnt;
903+ long ls_flags; /* bit 1 means LS has been deleted */
904+
905+ /* Passed into misc_register() */
906+ struct miscdevice ls_miscinfo;
907+ struct list_head ls_list;
908+};
909+
910+/* misc_device info for the control device */
911+static struct miscdevice ctl_device;
912+
913+/*
914+ * Stuff we hang off the file struct.
915+ * The first two are to cope with unlocking all the
916+ * locks held by a process when it dies.
917+ */
918+struct file_info {
919+ struct list_head fi_lkb_list; /* List of active lkbs */
920+ spinlock_t fi_lkb_lock;
921+ struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
922+ spinlock_t fi_ast_lock;
923+ wait_queue_head_t fi_wait;
924+ struct user_ls *fi_ls;
925+ atomic_t fi_refcnt; /* Number of users */
926+ unsigned long fi_flags; /* Bit 1 means the device is open */
927+};
928+
929+
930+/* get and put ops for file_info.
931+ Actually I don't really like "get" and "put", but everyone
932+ else seems to use them and I can't think of anything
933+ nicer at the moment */
934+static void get_file_info(struct file_info *f)
935+{
936+ atomic_inc(&f->fi_refcnt);
937+}
938+
939+static void put_file_info(struct file_info *f)
940+{
941+ if (atomic_dec_and_test(&f->fi_refcnt))
942+ kfree(f);
943+}
944+
945+/* Find a lockspace struct given the device minor number */
946+static struct user_ls *find_lockspace(int minor)
947+{
948+ struct user_ls *lsinfo;
949+
950+ list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
951+
952+ if (lsinfo->ls_miscinfo.minor == minor)
953+ return lsinfo;
954+ }
955+ return NULL;
956+}
957+
958+static void add_lockspace_to_list(struct user_ls *lsinfo)
959+{
960+ list_add(&lsinfo->ls_list, &user_ls_list);
961+}
962+
963+/* Register a lockspace with the DLM and create a misc
964+ device for userland to access it */
965+static int register_lockspace(char *name, struct user_ls **ls)
966+{
967+ struct user_ls *newls;
968+ int status;
969+ int namelen;
970+
971+ namelen = strlen(name)+strlen(name_prefix)+2;
972+
973+ newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
974+ if (!newls)
975+ return -ENOMEM;
976+ memset(newls, 0, sizeof(struct user_ls));
977+
978+ newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
979+ if (!newls->ls_miscinfo.name) {
980+ kfree(newls);
981+ return -ENOMEM;
982+ }
983+ snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
984+
985+ status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
986+ strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
987+ &newls->ls_lockspace, 0);
988+
989+ if (status != 0) {
990+ kfree(newls->ls_miscinfo.name);
991+ kfree(newls);
992+ return status;
993+ }
994+
995+ newls->ls_miscinfo.fops = &_dlm_fops;
996+ newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
997+
998+ status = misc_register(&newls->ls_miscinfo);
999+ if (status) {
1000+ log_print("failed to register misc device for %s", name);
1001+ dlm_release_lockspace(newls->ls_lockspace, 0);
1002+ kfree(newls->ls_miscinfo.name);
1003+ kfree(newls);
1004+ return status;
1005+ }
1006+
1007+
1008+ add_lockspace_to_list(newls);
1009+ *ls = newls;
1010+ return 0;
1011+}
1012+
1013+static int unregister_lockspace(struct user_ls *lsinfo, int force)
1014+{
1015+ int status;
1016+
1017+ status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1018+ if (status)
1019+ return status;
1020+
1021+ status = misc_deregister(&lsinfo->ls_miscinfo);
1022+ if (status)
1023+ return status;
1024+
1025+ list_del(&lsinfo->ls_list);
1026+ kfree(lsinfo->ls_miscinfo.name);
1027+ kfree(lsinfo);
1028+
1029+ return 0;
1030+}
1031+
1032+/* Add it to userland's AST queue */
1033+static void add_to_astqueue(struct lock_info *li, void *astaddr)
1034+{
1035+ struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1036+ if (!ast)
1037+ return;
1038+
1039+ ast->result.astparam = li->li_astparam;
1040+ ast->result.astaddr = astaddr;
1041+ ast->result.user_lksb = li->li_user_lksb;
1042+ ast->result.cmd = li->li_cmd;
1043+ memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1044+
1045+ /* These two will both be NULL for anything other than queries */
1046+ ast->queryinfo = li->li_queryinfo;
1047+ ast->user_queryinfo = li->li_user_queryinfo;
1048+
1049+ spin_lock(&li->li_file->fi_ast_lock);
1050+ list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1051+ spin_unlock(&li->li_file->fi_ast_lock);
1052+ wake_up_interruptible(&li->li_file->fi_wait);
1053+}
1054+
1055+static void bast_routine(void *param, int mode)
1056+{
1057+ struct lock_info *li = param;
1058+
1059+ if (param) {
1060+ add_to_astqueue(li, li->li_bastaddr);
1061+ }
1062+}
1063+
1064+/*
1065+ * This is the kernel's AST routine.
1066+ * All lock, unlock & query operations complete here.
1067+ * The only synchronous ops are those done during device close.
1068+ */
1069+static void ast_routine(void *param)
1070+{
1071+ struct lock_info *li = param;
1072+
1073+ /* Param may be NULL if a persistent lock is unlocked by someone else */
1074+ if (!param)
1075+ return;
1076+
1077+ /* If it's an async request then post data to the user's AST queue. */
1078+ if (li->li_astaddr) {
1079+
1080+ /* Only queue AST if the device is still open */
1081+ if (test_bit(1, &li->li_file->fi_flags))
1082+ add_to_astqueue(li, li->li_astaddr);
1083+
1084+ /* If it's a new lock operation that failed, then
1085+ * remove it from the owner queue and free the
1086+ * lock_info. The DLM will not free the LKB until this
1087+ * AST has completed.
1088+ */
1089+ if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1090+ li->li_lksb.sb_status != 0) {
1091+ gd_lkb_t *lkb;
1092+
1093+ /* Wait till dlm_lock() has finished */
1094+ down(&li->li_firstlock);
1095+ lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1096+ if (lkb) {
1097+ spin_lock(&li->li_file->fi_lkb_lock);
1098+ list_del(&lkb->lkb_ownerqueue);
1099+ spin_unlock(&li->li_file->fi_lkb_lock);
1100+ }
1101+ up(&li->li_firstlock);
1102+ put_file_info(li->li_file);
1103+ kfree(li);
1104+ return;
1105+ }
1106+ /* Free unlocks & queries */
1107+ if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1108+ li->li_cmd == DLM_USER_QUERY) {
1109+ put_file_info(li->li_file);
1110+ kfree(li);
1111+ }
1112+ }
1113+ else {
1114+		/* Synchronous request, just wake up the caller */
1115+ set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1116+ wake_up_interruptible(&li->li_waitq);
1117+ }
1118+}
1119+
1120+/*
1121+ * Wait for the lock op to complete and return the status.
1122+ */
1123+static int wait_for_ast(struct lock_info *li)
1124+{
1125+ /* Wait for the AST routine to complete */
1126+ set_task_state(current, TASK_INTERRUPTIBLE);
1127+ while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1128+ schedule();
1129+
1130+ set_task_state(current, TASK_RUNNING);
1131+
1132+ return li->li_lksb.sb_status;
1133+}
1134+
1135+
1136+/* Open on control device */
1137+static int dlm_ctl_open(struct inode *inode, struct file *file)
1138+{
1139+ return 0;
1140+}
1141+
1142+/* Close on control device */
1143+static int dlm_ctl_close(struct inode *inode, struct file *file)
1144+{
1145+ return 0;
1146+}
1147+
1148+/* Open on lockspace device */
1149+static int dlm_open(struct inode *inode, struct file *file)
1150+{
1151+ struct file_info *f;
1152+ struct user_ls *lsinfo;
1153+
1154+ lsinfo = find_lockspace(iminor(inode));
1155+ if (!lsinfo)
1156+ return -ENOENT;
1157+
1158+ f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1159+ if (!f)
1160+ return -ENOMEM;
1161+
1162+ atomic_inc(&lsinfo->ls_refcnt);
1163+ INIT_LIST_HEAD(&f->fi_lkb_list);
1164+ INIT_LIST_HEAD(&f->fi_ast_list);
1165+ spin_lock_init(&f->fi_ast_lock);
1166+ spin_lock_init(&f->fi_lkb_lock);
1167+ init_waitqueue_head(&f->fi_wait);
1168+ f->fi_ls = lsinfo;
1169+ atomic_set(&f->fi_refcnt, 1);
1170+ set_bit(1, &f->fi_flags);
1171+
1172+ file->private_data = f;
1173+
1174+ return 0;
1175+}
1176+
1177+/* Check the user's version matches ours */
1178+static int check_version(struct dlm_lock_params *params)
1179+{
1180+ if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1181+ (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1182+ params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1183+
1184+ log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1185+ params->version[0],
1186+ params->version[1],
1187+ params->version[2],
1188+ DLM_DEVICE_VERSION_MAJOR,
1189+ DLM_DEVICE_VERSION_MINOR,
1190+ DLM_DEVICE_VERSION_PATCH);
1191+ return -EINVAL;
1192+ }
1193+ return 0;
1194+}
1195+
1196+/* Close on lockspace device */
1197+static int dlm_close(struct inode *inode, struct file *file)
1198+{
1199+ struct file_info *f = file->private_data;
1200+ struct lock_info li;
1201+ sigset_t tmpsig;
1202+ sigset_t allsigs;
1203+ gd_lkb_t *lkb, *safe;
1204+ struct user_ls *lsinfo;
1205+ DECLARE_WAITQUEUE(wq, current);
1206+
1207+ lsinfo = find_lockspace(iminor(inode));
1208+ if (!lsinfo)
1209+ return -ENOENT;
1210+
1211+ /* Mark this closed so that ASTs will not be delivered any more */
1212+ clear_bit(1, &f->fi_flags);
1213+
1214+ /* Block signals while we are doing this */
1215+ sigfillset(&allsigs);
1216+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1217+
1218+ /* We use our own lock_info struct here, so that any
1219+ * outstanding "real" ASTs will be delivered with the
1220+ * corresponding "real" params, thus freeing the lock_info
1221+	 * that belongs to the lock. This catches the corner case where
1222+ * a lock is BUSY when we try to unlock it here
1223+ */
1224+ memset(&li, 0, sizeof(li));
1225+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1226+ init_waitqueue_head(&li.li_waitq);
1227+ add_wait_queue(&li.li_waitq, &wq);
1228+
1229+ /*
1230+	 * Free any outstanding locks; they are on the
1231+	 * list in LIFO order, so there should be no problem
1232+	 * unlocking parents before children.
1233+ * Although we don't remove the lkbs from the list here
1234+ * (what would be the point?), foreach_safe is needed
1235+ * because the lkbs are freed during dlm_unlock operations
1236+ */
1237+ list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
1238+ int status;
1239+ int lock_status;
1240+ int flags = 0;
1241+ struct lock_info *old_li;
1242+
1243+ /* Make a copy of this pointer. If all goes well we will
1244+ * free it later. if not it will be left to the AST routine
1245+ * to tidy up
1246+ */
1247+ old_li = (struct lock_info *)lkb->lkb_astparam;
1248+
1249+ /* Don't unlock persistent locks */
1250+ if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1251+ list_del(&lkb->lkb_ownerqueue);
1252+
1253+ /* But tidy our references in it */
1254+ kfree(old_li);
1255+ lkb->lkb_astparam = (long)NULL;
1256+ put_file_info(f);
1257+ continue;
1258+ }
1259+
1260+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1261+
1262+ /* If it's not granted then cancel the request.
1263+ * If the lock was WAITING then it will be dropped,
1264+ * if it was converting then it will be reverted to GRANTED,
1265+ * then we will unlock it.
1266+ */
1267+ lock_status = lkb->lkb_status;
1268+
1269+ if (lock_status != GDLM_LKSTS_GRANTED)
1270+ flags = DLM_LKF_CANCEL;
1271+
1272+ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1273+
1274+ /* Must wait for it to complete as the next lock could be its
1275+ * parent */
1276+ if (status == 0)
1277+ wait_for_ast(&li);
1278+
1279+ /* If it was waiting for a conversion, it will
1280+ now be granted so we can unlock it properly */
1281+ if (lock_status == GDLM_LKSTS_CONVERT) {
1282+
1283+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1284+ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
1285+
1286+ if (status == 0)
1287+ wait_for_ast(&li);
1288+ }
1289+		/* Unlock succeeded, free the lock_info struct. */
1290+ if (status == 0) {
1291+ kfree(old_li);
1292+ put_file_info(f);
1293+ }
1294+ }
1295+
1296+ remove_wait_queue(&li.li_waitq, &wq);
1297+
1298+ /* If this is the last reference, and the lockspace has been deleted
1299+	   then free the struct */
1300+ if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1301+ kfree(lsinfo);
1302+ }
1303+
1304+ /* Restore signals */
1305+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1306+ recalc_sigpending();
1307+
1308+ return 0;
1309+}
1310+
1311+/*
1312+ * ioctls to create/remove lockspaces, and check how many
1313+ * outstanding ASTs there are against a particular LS.
1314+ */
1315+static int dlm_ioctl(struct inode *inode, struct file *file,
1316+ uint command, ulong u)
1317+{
1318+ struct file_info *fi = file->private_data;
1319+ int status = -EINVAL;
1320+ int count;
1321+ struct list_head *tmp_list;
1322+
1323+ switch (command) {
1324+
1325+ /* Are there any ASTs for us to read?
1326+ * Warning, this returns the number of messages (ASTs)
1327+ * in the queue, NOT the number of bytes to read
1328+ */
1329+ case FIONREAD:
1330+ count = 0;
1331+ spin_lock(&fi->fi_ast_lock);
1332+ list_for_each(tmp_list, &fi->fi_ast_list)
1333+ count++;
1334+ spin_unlock(&fi->fi_ast_lock);
1335+ status = put_user(count, (int *)u);
1336+ break;
1337+
1338+ default:
1339+ return -ENOTTY;
1340+ }
1341+
1342+ return status;
1343+}
1344+
1345+/*
1346+ * ioctls to create/remove lockspaces.
1347+ */
1348+static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1349+ uint command, ulong u)
1350+{
1351+ int status = -EINVAL;
1352+ char ls_name[MAX_LS_NAME_LEN];
1353+ struct user_ls *lsinfo;
1354+ int force = 0;
1355+
1356+ switch (command) {
1357+ case DLM_CREATE_LOCKSPACE:
1358+ if (!capable(CAP_SYS_ADMIN))
1359+ return -EPERM;
1360+
1361+ if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1362+ return -EFAULT;
1363+ status = register_lockspace(ls_name, &lsinfo);
1364+
1365+ /* If it succeeded then return the minor number */
1366+ if (status == 0)
1367+ status = lsinfo->ls_miscinfo.minor;
1368+ break;
1369+
1370+ case DLM_FORCE_RELEASE_LOCKSPACE:
1371+ force = 2;
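1371+		/* fall through */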
1372+
1373+ case DLM_RELEASE_LOCKSPACE:
1374+ if (!capable(CAP_SYS_ADMIN))
1375+ return -EPERM;
1376+
1377+ lsinfo = find_lockspace(u);
1378+ if (!lsinfo)
1379+ return -EINVAL;
1380+ status = unregister_lockspace(lsinfo, force);
1381+ break;
1382+
1383+ default:
1384+ return -ENOTTY;
1385+ }
1386+
1387+ return status;
1388+}
1389+
1390+/* Deal with the messy stuff of copying a web of structs
1391+ from kernel space to userspace */
1392+static int copy_query_result(struct ast_info *ast)
1393+{
1394+ int status = -EFAULT;
1395+ struct dlm_queryinfo qi;
1396+
1397+ /* Get the pointers to userspace structs */
1398+ if (copy_from_user(&qi, ast->user_queryinfo,
1399+ sizeof(struct dlm_queryinfo)))
1400+ goto copy_out;
1401+
1402+ /* TODO: does this deref a user pointer? */
1403+ if (put_user(ast->queryinfo->gqi_lockcount,
1404+ &ast->user_queryinfo->gqi_lockcount))
1405+ goto copy_out;
1406+
1407+ if (qi.gqi_resinfo) {
1408+ if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1409+ sizeof(struct dlm_resinfo)))
1410+ goto copy_out;
1411+ }
1412+
1413+ if (qi.gqi_lockinfo) {
1414+ if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1415+ sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1416+ goto copy_out;
1417+ }
1418+
1419+ status = 0;
1420+
1421+ if (ast->queryinfo->gqi_lockinfo)
1422+ kfree(ast->queryinfo->gqi_lockinfo);
1423+
1424+ if (ast->queryinfo->gqi_resinfo)
1425+ kfree(ast->queryinfo->gqi_resinfo);
1426+
1427+ kfree(ast->queryinfo);
1428+
1429+ copy_out:
1430+ return status;
1431+}
1432+
1433+/* Read call, might block if no ASTs are waiting.
1434+ * It will only ever return one message at a time, regardless
1435+ * of how many are pending.
1436+ */
1437+static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1438+{
1439+ struct file_info *fi = file->private_data;
1440+ struct ast_info *ast;
1441+ int ret;
1442+ DECLARE_WAITQUEUE(wait, current);
1443+
1444+ if (count < sizeof(struct dlm_lock_result))
1445+ return -EINVAL;
1446+
1447+ spin_lock(&fi->fi_ast_lock);
1448+ if (list_empty(&fi->fi_ast_list)) {
1449+
1450+ /* No waiting ASTs.
1451+		 * Return EOF if the lockspace has been deleted.
1452+ */
1453+		if (test_bit(1, &fi->fi_ls->ls_flags)) {
1454+			/* don't return with fi_ast_lock held */
1455+			spin_unlock(&fi->fi_ast_lock);
1456+			return 0;
1456+		}
1455+
1456+ if (file->f_flags & O_NONBLOCK) {
1457+ spin_unlock(&fi->fi_ast_lock);
1458+ return -EAGAIN;
1459+ }
1460+
1461+ add_wait_queue(&fi->fi_wait, &wait);
1462+
1463+ repeat:
1464+ set_current_state(TASK_INTERRUPTIBLE);
1465+ if (list_empty(&fi->fi_ast_list) &&
1466+ !signal_pending(current)) {
1467+
1468+ spin_unlock(&fi->fi_ast_lock);
1469+ schedule();
1470+ spin_lock(&fi->fi_ast_lock);
1471+ goto repeat;
1472+ }
1473+
1474+ current->state = TASK_RUNNING;
1475+ remove_wait_queue(&fi->fi_wait, &wait);
1476+
1477+ if (signal_pending(current)) {
1478+ spin_unlock(&fi->fi_ast_lock);
1479+ return -ERESTARTSYS;
1480+ }
1481+ }
1482+
1483+ ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1484+ list_del(&ast->list);
1485+ spin_unlock(&fi->fi_ast_lock);
1486+
1487+ ret = sizeof(struct dlm_lock_result);
1488+ if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1489+ ret = -EFAULT;
1490+
1491+ /* If it was a query then copy the result block back here */
1492+ if (ast->queryinfo) {
1493+ int status = copy_query_result(ast);
1494+ if (status)
1495+ ret = status;
1496+ }
1497+
1498+ kfree(ast);
1499+ return ret;
1500+}
1501+
1502+static unsigned int dlm_poll(struct file *file, poll_table *wait)
1503+{
1504+ struct file_info *fi = file->private_data;
1505+
1506+ poll_wait(file, &fi->fi_wait, wait);
1507+
1508+ spin_lock(&fi->fi_ast_lock);
1509+ if (!list_empty(&fi->fi_ast_list)) {
1510+ spin_unlock(&fi->fi_ast_lock);
1511+ return POLLIN | POLLRDNORM;
1512+ }
1513+
1514+ spin_unlock(&fi->fi_ast_lock);
1515+ return 0;
1516+}
1517+
1518+static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1519+{
1520+ struct lock_info *li;
1521+ int status;
1522+
1523+ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1524+ if (!li)
1525+ return -ENOMEM;
1526+
1527+ get_file_info(fi);
1528+ li->li_user_lksb = kparams->lksb;
1529+ li->li_astparam = kparams->astparam;
1530+ li->li_bastaddr = kparams->bastaddr;
1531+ li->li_astaddr = kparams->astaddr;
1532+ li->li_file = fi;
1533+ li->li_flags = 0;
1534+ li->li_cmd = kparams->cmd;
1535+ clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1536+
1537+ if (copy_from_user(&li->li_lksb, kparams->lksb,
1538+ sizeof(struct dlm_lksb))) {
1539+ kfree(li);
1540+ return -EFAULT;
1541+ }
1542+ li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1543+
1544+ /* Allocate query structs */
1545+ status = -ENOMEM;
1546+ li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1547+ if (!li->li_queryinfo)
1548+ goto out1;
1549+
1550+ /* Mainly to get gqi_lock buffer size */
1551+ if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1552+ sizeof(struct dlm_queryinfo))) {
1553+ status = -EFAULT;
1554+ goto out1;
1555+ }
1556+
1557+ /* Overwrite userspace pointers we just copied with kernel space ones */
1558+ if (li->li_queryinfo->gqi_resinfo) {
1559+ li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1560+ if (!li->li_queryinfo->gqi_resinfo)
1561+ goto out1;
1562+ }
1563+ if (li->li_queryinfo->gqi_lockinfo) {
1564+ li->li_queryinfo->gqi_lockinfo =
1565+ kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1566+ GFP_KERNEL);
1567+ if (!li->li_queryinfo->gqi_lockinfo)
1568+ goto out2;
1569+ }
1570+
1571+ li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1572+
1573+ return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1574+ kparams->flags, /* query */
1575+ li->li_queryinfo,
1576+ ast_routine, li);
1577+
1578+ out2:
1579+ kfree(li->li_queryinfo);
1580+
1581+ out1:
1582+ kfree(li);
1583+ return status;
1584+}
1585+
1586+static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1587+ const char *buffer)
1588+{
1589+ struct lock_info *li;
1590+ int status;
1591+ char name[DLM_RESNAME_MAXLEN];
1592+
1593+ /*
1594+ * Validate things that we need to have correct.
1595+ */
1596+ if (kparams->namelen > DLM_RESNAME_MAXLEN)
1597+ return -EINVAL;
1598+
1599+ if (!kparams->astaddr)
1600+ return -EINVAL;
1601+
1602+ if (!kparams->lksb)
1603+ return -EINVAL;
1604+
1605+ /* Get the lock name */
1606+ if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1607+ kparams->namelen)) {
1608+ return -EFAULT;
1609+ }
1610+
1611+ /* For conversions, the lock will already have a lock_info
1612+	   block squirrelled away in astparam */
1613+ if (kparams->flags & DLM_LKF_CONVERT) {
1614+ gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1615+ if (!lkb) {
1616+ return -EINVAL;
1617+ }
1618+ li = (struct lock_info *)lkb->lkb_astparam;
1619+
1620+ /* Only override these if they are provided */
1621+ if (li->li_user_lksb)
1622+ li->li_user_lksb = kparams->lksb;
1623+ if (li->li_astparam)
1624+ li->li_astparam = kparams->astparam;
1625+ if (li->li_bastaddr)
1626+ li->li_bastaddr = kparams->bastaddr;
1627+		if (li->li_astaddr)
1628+ li->li_astaddr = kparams->astaddr;
1629+ li->li_flags = 0;
1630+ }
1631+ else {
1632+ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1633+ if (!li)
1634+ return -ENOMEM;
1635+
1636+ li->li_user_lksb = kparams->lksb;
1637+ li->li_astparam = kparams->astparam;
1638+ li->li_bastaddr = kparams->bastaddr;
1639+ li->li_astaddr = kparams->astaddr;
1640+ li->li_file = fi;
1641+ li->li_flags = 0;
1642+ li->li_cmd = kparams->cmd;
1643+ li->li_queryinfo = NULL;
1644+
1645+ /* semaphore to allow us to complete our work before
1646+ the AST routine runs. In fact we only need (and use) this
1647+ when the initial lock fails */
1648+ init_MUTEX_LOCKED(&li->li_firstlock);
1649+ set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1650+
1651+ get_file_info(fi);
1652+ }
1653+
1654+ /* Copy the user's LKSB into kernel space,
1655+ needed for conversions & value block operations */
1656+ if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
1657+ sizeof(struct dlm_lksb)))
1658+ return -EFAULT;
1659+
1660+ /* Lock it ... */
1661+ status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1662+ kparams->flags, name, kparams->namelen,
1663+ kparams->parent,
1664+ ast_routine,
1665+ li,
1666+ li->li_bastaddr ? bast_routine : NULL,
1667+ kparams->range.ra_end ? &kparams->range : NULL);
1668+
1669+ /* If it succeeded (this far) with a new lock then keep track of
1670+ it on the file's lkb list */
1671+ if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1672+ gd_lkb_t *lkb;
1673+ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1674+
1675+ if (lkb) {
1676+ spin_lock(&fi->fi_lkb_lock);
1677+ list_add(&lkb->lkb_ownerqueue,
1678+ &fi->fi_lkb_list);
1679+ spin_unlock(&fi->fi_lkb_lock);
1680+ }
1681+ else {
1682+ log_print("failed to get lkb for new lock");
1683+ }
1684+ up(&li->li_firstlock);
1685+ }
1686+
1687+ return status;
1688+}
1689+
1690+static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1691+{
1692+ struct lock_info *li;
1693+ gd_lkb_t *lkb;
1694+ int status;
1695+
1696+ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1697+ if (!lkb) {
1698+ return -EINVAL;
1699+ }
1700+
1701+ li = (struct lock_info *)lkb->lkb_astparam;
1702+
1703+ li->li_user_lksb = kparams->lksb;
1704+ li->li_astparam = kparams->astparam;
1705+ li->li_cmd = kparams->cmd;
1706+
1707+	/* Have to do it here because the lkb may not exist after
1708+ * dlm_unlock() */
1709+ spin_lock(&fi->fi_lkb_lock);
1710+ list_del(&lkb->lkb_ownerqueue);
1711+ spin_unlock(&fi->fi_lkb_lock);
1712+
1713+ /* Use existing lksb & astparams */
1714+ status = dlm_unlock(fi->fi_ls->ls_lockspace,
1715+ kparams->lkid,
1716+ kparams->flags, NULL, NULL);
1717+
1718+ return status;
1719+}
1720+
1721+/* Write call, submit a locking request */
1722+static ssize_t dlm_write(struct file *file, const char __user *buffer,
1723+ size_t count, loff_t *ppos)
1724+{
1725+ struct file_info *fi = file->private_data;
1726+ struct dlm_lock_params kparams;
1727+ sigset_t tmpsig;
1728+ sigset_t allsigs;
1729+ int status;
1730+
1731+ if (count < sizeof(kparams))
1732+ return -EINVAL;
1733+
1734+ /* Has the lockspace been deleted */
1735+ if (test_bit(1, &fi->fi_ls->ls_flags))
1736+ return -ENOENT;
1737+
1738+ /* Get the command info */
1739+ if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1740+ return -EFAULT;
1741+
1742+ if (check_version(&kparams))
1743+ return -EINVAL;
1744+
1745+ /* Block signals while we are doing this */
1746+ sigfillset(&allsigs);
1747+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1748+
1749+ switch (kparams.cmd)
1750+ {
1751+ case DLM_USER_LOCK:
1752+ status = do_user_lock(fi, &kparams, buffer);
1753+ break;
1754+
1755+ case DLM_USER_UNLOCK:
1756+ status = do_user_unlock(fi, &kparams);
1757+ break;
1758+
1759+ case DLM_USER_QUERY:
1760+ status = do_user_query(fi, &kparams);
1761+ break;
1762+
1763+ default:
1764+ status = -EINVAL;
1765+ break;
1766+ }
1767+ /* Restore signals */
1768+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1769+ recalc_sigpending();
1770+
1771+ if (status == 0)
1772+ return count;
1773+ else
1774+ return status;
1775+}
1776+
1777+void dlm_device_free_devices()
1778+{
1779+ struct user_ls *tmp;
1780+ struct user_ls *lsinfo;
1781+
1782+ list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
1783+ misc_deregister(&lsinfo->ls_miscinfo);
1784+
1785+ /* Tidy up, but don't delete the lsinfo struct until
1786+ all the users have closed their devices */
1787+ list_del(&lsinfo->ls_list);
1788+ kfree(lsinfo->ls_miscinfo.name);
1789+ set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1790+ }
1791+}
1792+
1793+static struct file_operations _dlm_fops = {
1794+ .open = dlm_open,
1795+ .release = dlm_close,
1796+ .ioctl = dlm_ioctl,
1797+ .read = dlm_read,
1798+ .write = dlm_write,
1799+ .poll = dlm_poll,
1800+ .owner = THIS_MODULE,
1801+};
1802+
1803+static struct file_operations _dlm_ctl_fops = {
1804+ .open = dlm_ctl_open,
1805+ .release = dlm_ctl_close,
1806+ .ioctl = dlm_ctl_ioctl,
1807+ .owner = THIS_MODULE,
1808+};
1809+
1810+/*
1811+ * Create control device
1812+ */
1813+int dlm_device_init(void)
1814+{
1815+ int r;
1816+
1817+ INIT_LIST_HEAD(&user_ls_list);
1818+
1819+ ctl_device.name = "dlm-control";
1820+ ctl_device.fops = &_dlm_ctl_fops;
1821+ ctl_device.minor = MISC_DYNAMIC_MINOR;
1822+
1823+ r = misc_register(&ctl_device);
1824+ if (r) {
1825+ log_print("misc_register failed for DLM control device");
1826+ return r;
1827+ }
1828+
1829+ return 0;
1830+}
1831+
1832+void dlm_device_exit(void)
1833+{
1834+ misc_deregister(&ctl_device);
1835+}
1836+
1837+/*
1838+ * Overrides for Emacs so that we follow Linus's tabbing style.
1839+ * Emacs will notice this stuff at the end of the file and automatically
1840+ * adjust the settings for this buffer only. This must remain at the end
1841+ * of the file.
1842+ * ---------------------------------------------------------------------------
1843+ * Local variables:
1844+ * c-file-style: "linux"
1845+ * End:
1846+ */
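
For orientation, the device protocol implemented above works roughly as follows from userspace: lock, unlock and query requests are submitted by write()ing a struct dlm_lock_params with the appropriate cmd, completions come back one struct dlm_lock_result per read(), and FIONREAD reports the number of queued ASTs (messages, not bytes). The sketch below only illustrates the read side; real applications are expected to go through libdlm, the device node path is an example (the dynamic misc minor is listed in /proc/misc), and the result is read into a plain buffer so the DLM userspace header is not needed here.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char buf[1024];		/* comfortably larger than one result message */
	ssize_t n;
	int pending = 0;
	int fd = open("/dev/misc/dlm_mylockspace", O_RDWR | O_NONBLOCK);

	if (fd < 0) {
		perror("open lockspace device");
		return 1;
	}

	/* FIONREAD counts queued AST messages, not bytes */
	if (ioctl(fd, FIONREAD, &pending) == 0)
		printf("%d ASTs waiting\n", pending);

	/* each successful read returns exactly one result message;
	   O_NONBLOCK makes the loop stop when the queue is empty */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		printf("got an AST message of %zd bytes\n", n);

	close(fd);
	return 0;
}
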
1847diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
1848--- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 1849+++ linux-patched/cluster/dlm/device.h 2004-06-29 20:01:19.000000000 +0800
4bf12011 1850@@ -0,0 +1,19 @@
1851+/******************************************************************************
1852+*******************************************************************************
1853+**
1854+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1855+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1856+**
1857+** This copyrighted material is made available to anyone wishing to use,
1858+** modify, copy, or redistribute it subject to the terms and conditions
1859+** of the GNU General Public License v.2.
1860+**
1861+*******************************************************************************
1862+******************************************************************************/
1863+
1864+#ifndef __DEVICE_DOT_H__
1865+#define __DEVICE_DOT_H__
1866+
1867+extern void dlm_device_free_devices(void);
1868+
1869+#endif /* __DEVICE_DOT_H__ */
1870diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
1871--- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 1872+++ linux-patched/cluster/dlm/dir.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 1873@@ -0,0 +1,430 @@
1874+/******************************************************************************
1875+*******************************************************************************
1876+**
1877+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1878+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1879+**
1880+** This copyrighted material is made available to anyone wishing to use,
1881+** modify, copy, or redistribute it subject to the terms and conditions
1882+** of the GNU General Public License v.2.
1883+**
1884+*******************************************************************************
1885+******************************************************************************/
1886+
1887+#include "dlm_internal.h"
1888+#include "nodes.h"
1889+#include "lockspace.h"
1890+#include "lowcomms.h"
1891+#include "reccomms.h"
1892+#include "rsb.h"
1893+#include "config.h"
1894+#include "memory.h"
1895+#include "recover.h"
1896+#include "util.h"
1897+
1898+/*
1899+ * We use the upper 16 bits of the hash value to select the directory node.
1900+ * Low bits are used for distribution of rsb's among hash buckets on each node.
1901+ *
1902+ * From the hash value, we are interested in arriving at a final value between
1903+ * zero and the number of nodes minus one (num_nodes - 1).
1904+ *
1905+ * To accomplish this scaling, we take the nearest power of two larger than
1906+ * num_nodes and subtract one to create a bit mask. The mask is applied to the
1907+ * hash, reducing the range to nearer the final range.
1908+ *
1909+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
1910+ * num_nodes to the previously masked hash value.
1911+ *
1912+ * This value in the desired range is used as an offset into the sorted list of
1913+ * nodeid's to give the particular nodeid of the directory node.
1914+ */
1915+
1916+uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length)
1917+{
1918+ struct list_head *tmp;
1919+ gd_csb_t *csb = NULL;
1920+ uint32_t hash, node, n = 0, nodeid;
1921+
1922+ if (ls->ls_num_nodes == 1) {
1923+ nodeid = our_nodeid();
1924+ goto out;
1925+ }
1926+
1927+ hash = gdlm_hash(name, length);
1928+ node = (hash >> 16) & ls->ls_nodes_mask;
1929+ node %= ls->ls_num_nodes;
1930+
1931+ list_for_each(tmp, &ls->ls_nodes) {
1932+ if (n++ != node)
1933+ continue;
1934+ csb = list_entry(tmp, gd_csb_t, csb_list);
1935+ break;
1936+ }
1937+
1938+ GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
1939+ ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
1940+ nodeid = csb->csb_node->gn_nodeid;
1941+
1942+ out:
1943+ return nodeid;
1944+}
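As an aside (not part of the patch), here is a minimal user-space sketch of the scaling step described in the comment above. scale_hash_to_node() is an invented helper name, and nodes_mask is assumed to have been built as (next power of two >= num_nodes) - 1, the way ls_nodes_mask is:

#include <stdint.h>

/* Illustrative only: map a resource-name hash to a directory node index. */
static uint32_t scale_hash_to_node(uint32_t hash, uint32_t nodes_mask,
                                   uint32_t num_nodes)
{
	uint32_t node = (hash >> 16) & nodes_mask;	/* 0 .. nodes_mask */
	return node % num_nodes;			/* 0 .. num_nodes - 1 */
}

/* Example: with 5 nodes, nodes_mask = 7; a hash of 0x9c3a0000 gives
 * (0x9c3a & 7) = 2 and 2 % 5 = 2, so the third nodeid in the sorted
 * node list is the directory node for that name. */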
1945+
1946+uint32_t get_directory_nodeid(gd_res_t *rsb)
1947+{
1948+ return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
1949+ rsb->res_length);
1950+}
1951+
1952+static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len)
1953+{
1954+ uint32_t val;
1955+
1956+ val = gdlm_hash(name, len);
1957+ val &= RESDIRHASH_MASK;
1958+
1959+ return val;
1960+}
1961+
1962+static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd)
1963+{
1964+ gd_resdir_bucket_t *bucket;
1965+ uint32_t hashval;
1966+
1967+ hashval = rd_hash(ls, rd->rd_name, rd->rd_length);
1968+ bucket = &ls->ls_resdir_hash[hashval];
1969+
1970+ list_add_tail(&rd->rd_list, &bucket->rb_reslist);
1971+}
1972+
1973+static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen,
1974+ uint32_t bucket)
1975+{
1976+ struct list_head *head;
1977+ gd_resdata_t *rd;
1978+
1979+ head = &ls->ls_resdir_hash[bucket].rb_reslist;
1980+ list_for_each_entry(rd, head, rd_list) {
1981+ if (rd->rd_length == namelen &&
1982+ !memcmp(name, rd->rd_name, namelen))
1983+ goto out;
1984+ }
1985+ rd = NULL;
1986+ out:
1987+ return rd;
1988+}
1989+
1990+void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
1991+ uint8_t sequence)
1992+{
1993+ gd_resdata_t *rd;
1994+ uint32_t bucket;
1995+
1996+ bucket = rd_hash(ls, name, namelen);
1997+
1998+ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
1999+
2000+ rd = search_rdbucket(ls, name, namelen, bucket);
2001+
2002+ if (!rd) {
2003+ log_debug(ls, "remove_resdata not found nodeid=%u", nodeid);
2004+ goto out;
2005+ }
2006+
2007+ if (rd->rd_master_nodeid != nodeid) {
2008+ log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid);
2009+ goto out;
2010+ }
2011+
2012+ if (rd->rd_sequence == sequence) {
2013+ list_del(&rd->rd_list);
2014+ free_resdata(rd);
2015+ } else {
2016+ /*
2017+ log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u",
2018+ nodeid, rd->rd_sequence, sequence);
2019+ */
2020+ }
2021+
2022+ out:
2023+ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2024+}
2025+
2026+void resdir_clear(gd_ls_t *ls)
2027+{
2028+ struct list_head *head;
2029+ gd_resdata_t *rd;
2030+ int i;
2031+
2032+ for (i = 0; i < RESDIRHASH_SIZE; i++) {
2033+ head = &ls->ls_resdir_hash[i].rb_reslist;
2034+ while (!list_empty(head)) {
2035+ rd = list_entry(head->next, gd_resdata_t, rd_list);
2036+ list_del(&rd->rd_list);
2037+ free_resdata(rd);
2038+ }
2039+ }
2040+}
2041+
2042+static void gdlm_resmov_in(gd_resmov_t *rm, char *buf)
2043+{
2044+ gd_resmov_t tmp;
2045+
2046+ memcpy(&tmp, buf, sizeof(gd_resmov_t));
2047+
2048+ rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2049+ rm->rm_length = be16_to_cpu(tmp.rm_length);
2050+}
2051+
2052+int resdir_rebuild_local(gd_ls_t *ls)
2053+{
2054+ gd_csb_t *csb;
2055+ gd_resdata_t *rd;
2056+ gd_rcom_t *rc;
2057+ gd_resmov_t mov, last_mov;
2058+ char *b, *last_name;
2059+ int error = -ENOMEM, count = 0;
2060+
2061+ log_all(ls, "rebuild resource directory");
2062+
2063+ resdir_clear(ls);
2064+
2065+ rc = allocate_rcom_buffer(ls);
2066+ if (!rc)
2067+ goto out;
2068+
2069+ last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2070+ if (!last_name)
2071+ goto free_rc;
2072+
2073+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
2074+ last_mov.rm_length = 0;
2075+ for (;;) {
2076+ error = gdlm_recovery_stopped(ls);
2077+ if (error)
2078+ goto free_last;
2079+
2080+ memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2081+ rc->rc_datalen = last_mov.rm_length;
2082+
2083+ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
2084+ RECCOMM_RECOVERNAMES, rc, 1);
2085+ if (error)
2086+ goto free_last;
2087+
2088+ schedule();
2089+
2090+ /*
2091+ * pick each res out of buffer
2092+ */
2093+
2094+ b = rc->rc_buf;
2095+
2096+ for (;;) {
2097+ gdlm_resmov_in(&mov, b);
2098+ b += sizeof(gd_resmov_t);
2099+
2100+ /* Length of 0 with a non-zero nodeid marks the
2101+ * end of the list */
2102+ if (!mov.rm_length && mov.rm_nodeid)
2103+ goto done;
2104+
2105+ /* This is just the end of the block */
2106+ if (!mov.rm_length)
2107+ break;
2108+
2109+ error = -ENOMEM;
2110+ rd = allocate_resdata(ls, mov.rm_length);
2111+ if (!rd)
2112+ goto free_last;
2113+
2114+ rd->rd_master_nodeid = mov.rm_nodeid;
2115+ rd->rd_length = mov.rm_length;
2116+ rd->rd_sequence = 1;
2117+
2118+ memcpy(rd->rd_name, b, mov.rm_length);
2119+ b += mov.rm_length;
2120+
2121+ add_resdata_to_hash(ls, rd);
2122+ count++;
2123+
2124+ last_mov = mov;
2125+ memset(last_name, 0, DLM_RESNAME_MAXLEN);
2126+ memcpy(last_name, rd->rd_name, rd->rd_length);
2127+ }
2128+ }
2129+ done:
2130+ ;
2131+ }
2132+
2133+ set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2134+ error = 0;
2135+
2136+ log_all(ls, "rebuilt %d resources", count);
2137+
2138+ free_last:
2139+ kfree(last_name);
2140+
2141+ free_rc:
2142+ free_rcom_buffer(rc);
2143+
2144+ out:
2145+ return error;
2146+}
2147+
2148+/*
2149+ * The reply end of resdir_rebuild_local/RECOVERNAMES. Collect and send as
2150+ * many resource names as can fit in the buffer.
2151+ */
2152+
2153+int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf,
2154+ int outlen, uint32_t nodeid)
2155+{
2156+ struct list_head *list;
2157+ gd_res_t *start_rsb = NULL, *rsb;
2158+ int offset = 0, start_namelen, error;
2159+ char *start_name;
2160+ gd_resmov_t tmp;
2161+ uint32_t dir_nodeid;
2162+
2163+ /*
2164+ * Find the rsb where we left off (or start again)
2165+ */
2166+
2167+ start_namelen = inlen;
2168+ start_name = inbuf;
2169+
2170+ if (start_namelen > 1) {
2171+ error = find_or_create_rsb(ls, NULL, start_name,
2172+ start_namelen, 0, &start_rsb);
2173+ GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2174+ release_rsb(start_rsb);
2175+ }
2176+
2177+ /*
2178+ * Send rsb names for rsb's we're master of and whose directory node
2179+ * matches the requesting node.
2180+ */
2181+
2182+ down_read(&ls->ls_rec_rsblist);
2183+ if (start_rsb)
2184+ list = start_rsb->res_rootlist.next;
2185+ else
2186+ list = ls->ls_rootres.next;
2187+
2188+ for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2189+ rsb = list_entry(list, gd_res_t, res_rootlist);
2190+ if (rsb->res_nodeid)
2191+ continue;
2192+
2193+ dir_nodeid = get_directory_nodeid(rsb);
2194+ if (dir_nodeid != nodeid)
2195+ continue;
2196+
2197+ if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) {
2198+ /* Write end-of-block record */
2199+ memset(&tmp, 0, sizeof(gd_resmov_t));
2200+ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2201+ offset += sizeof(gd_resmov_t);
2202+ goto out;
2203+ }
2204+
2205+ memset(&tmp, 0, sizeof(gd_resmov_t));
2206+ tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2207+ tmp.rm_length = cpu_to_be16(rsb->res_length);
2208+
2209+ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2210+ offset += sizeof(gd_resmov_t);
2211+
2212+ memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2213+ offset += rsb->res_length;
2214+ }
2215+
2216+ /*
2217+ * If we've reached the end of the list (and there's room) write a
2218+ * terminating record.
2219+ */
2220+
2221+ if ((list == &ls->ls_rootres) &&
2222+ (offset + sizeof(gd_resmov_t) <= outlen)) {
2223+
2224+ memset(&tmp, 0, sizeof(gd_resmov_t));
2225+ /* This only needs to be non-zero */
2226+ tmp.rm_nodeid = cpu_to_be32(1);
2227+ /* and this must be zero */
2228+ tmp.rm_length = 0;
2229+ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
2230+ offset += sizeof(gd_resmov_t);
2231+ }
2232+
2233+ out:
2234+ up_read(&ls->ls_rec_rsblist);
2235+ return offset;
2236+}
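As a reader's aid (not part of the patch), a small stand-alone sketch of walking one RECOVERNAMES reply buffer using the terminator conventions above; struct resmov and count_recovernames() are hypothetical stand-ins for gd_resmov_t, and the entries are assumed to be in host byte order already:

#include <stdint.h>
#include <string.h>

struct resmov {
	uint32_t nodeid;
	uint16_t length;
	uint16_t pad;
};

/* Count the names in one reply block.  Per the conventions above:
 * length == 0, nodeid != 0  -> no more names anywhere (*done = 1);
 * length == 0, nodeid == 0  -> end of this block only  (*done = 0). */
static int count_recovernames(const char *buf, int *done)
{
	struct resmov rm;
	int count = 0;

	for (;;) {
		memcpy(&rm, buf, sizeof(rm));
		buf += sizeof(rm);
		if (!rm.length) {
			*done = (rm.nodeid != 0);
			return count;
		}
		buf += rm.length;	/* skip the name bytes */
		count++;
	}
}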
2237+
2238+int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
2239+ gd_resdata_t **rdp, int recovery)
2240+{
2241+ gd_resdata_t *rd;
2242+ gd_resdata_t *tmp;
2243+ uint32_t bucket;
2244+
2245+ bucket = rd_hash(ls, name, namelen);
2246+
2247+ read_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2248+ rd = search_rdbucket(ls, name, namelen, bucket);
2249+ read_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2250+
2251+ if (rd)
2252+ goto out;
2253+
2254+ rd = allocate_resdata(ls, namelen);
2255+ if (!rd)
2256+ return -ENOMEM;
2257+
2258+ rd->rd_master_nodeid = nodeid;
2259+ rd->rd_length = namelen;
2260+ memcpy(rd->rd_name, name, namelen);
2261+
2262+ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
2263+ tmp = search_rdbucket(ls, name, namelen, bucket);
2264+ if (!tmp)
2265+ list_add_tail(&rd->rd_list,
2266+ &ls->ls_resdir_hash[bucket].rb_reslist);
2267+ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
2268+
2269+ if (tmp) {
2270+ free_resdata(rd);
2271+ rd = tmp;
2272+ }
2273+
2274+ out:
2275+ *rdp = rd;
2276+
2277+ if (!recovery) {
2278+ if (++rd->rd_sequence == 0)
2279+ rd->rd_sequence++;
2280+ } else
2281+ rd->rd_sequence = 1;
2282+
2283+ return 0;
2284+}
2285+
2286+/*
 2287+ * The node with the lowest nodeid queries all nodes to determine when all are done.
2288+ * All other nodes query the low nodeid for this.
2289+ */
2290+
2291+int resdir_rebuild_wait(gd_ls_t *ls)
2292+{
2293+ int error;
2294+
2295+ if (ls->ls_low_nodeid == our_nodeid()) {
2296+ error = gdlm_wait_status_all(ls, RESDIR_VALID);
2297+ if (!error)
2298+ set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2299+ } else
2300+ error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID);
2301+
2302+ return error;
2303+}
2304diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2305--- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 2306+++ linux-patched/cluster/dlm/dir.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 2307@@ -0,0 +1,30 @@
2308+/******************************************************************************
2309+*******************************************************************************
2310+**
2311+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2312+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2313+**
2314+** This copyrighted material is made available to anyone wishing to use,
2315+** modify, copy, or redistribute it subject to the terms and conditions
2316+** of the GNU General Public License v.2.
2317+**
2318+*******************************************************************************
2319+******************************************************************************/
2320+
2321+#ifndef __DIR_DOT_H__
2322+#define __DIR_DOT_H__
2323+
2324+uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length);
2325+uint32_t get_directory_nodeid(gd_res_t * rsb);
2326+void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2327+ uint8_t sequence);
2328+int resdir_rebuild_local(gd_ls_t * ls);
2329+int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf,
2330+ int outlen, uint32_t nodeid);
2331+int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
2332+ gd_resdata_t ** rdp, int recovery);
2333+int resdir_rebuild_wait(gd_ls_t * ls);
2334+void resdir_clear(gd_ls_t * ls);
2335+void resdir_dump(gd_ls_t * ls);
2336+
2337+#endif /* __DIR_DOT_H__ */
2338diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2339--- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 2340+++ linux-patched/cluster/dlm/dlm_internal.h 2004-06-29 20:01:20.000000000 +0800
2341@@ -0,0 +1,626 @@
4bf12011 2342+/******************************************************************************
2343+*******************************************************************************
2344+**
2345+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2346+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2347+**
2348+** This copyrighted material is made available to anyone wishing to use,
2349+** modify, copy, or redistribute it subject to the terms and conditions
2350+** of the GNU General Public License v.2.
2351+**
2352+*******************************************************************************
2353+******************************************************************************/
2354+
2355+#ifndef __DLM_INTERNAL_DOT_H__
2356+#define __DLM_INTERNAL_DOT_H__
2357+
2358+/*
2359+ * This is the main header file to be included in each DLM source file.
2360+ */
2361+
2362+#define DLM_RELEASE_NAME "<CVS>"
2363+
2364+#include <linux/slab.h>
2365+#include <linux/sched.h>
2366+#include <asm/semaphore.h>
2367+#include <linux/types.h>
2368+#include <linux/spinlock.h>
2369+#include <linux/vmalloc.h>
2370+#include <asm/uaccess.h>
2371+#include <linux/list.h>
2372+#include <linux/errno.h>
2373+#include <linux/random.h>
2374+
2375+#include <cluster/dlm.h>
2376+#include <cluster/dlm_device.h>
2377+#include <cluster/service.h>
2378+
2379+#ifndef TRUE
2380+#define TRUE (1)
2381+#endif
2382+
2383+#ifndef FALSE
2384+#define FALSE (0)
2385+#endif
2386+
2387+#if (BITS_PER_LONG == 64)
2388+#define PRIu64 "lu"
2389+#define PRId64 "ld"
2390+#define PRIo64 "lo"
2391+#define PRIx64 "lx"
2392+#define PRIX64 "lX"
2393+#define SCNu64 "lu"
2394+#define SCNd64 "ld"
2395+#define SCNo64 "lo"
2396+#define SCNx64 "lx"
2397+#define SCNX64 "lX"
2398+#else
2399+#define PRIu64 "Lu"
2400+#define PRId64 "Ld"
2401+#define PRIo64 "Lo"
2402+#define PRIx64 "Lx"
2403+#define PRIX64 "LX"
2404+#define SCNu64 "Lu"
2405+#define SCNd64 "Ld"
2406+#define SCNo64 "Lo"
2407+#define SCNx64 "Lx"
2408+#define SCNX64 "LX"
2409+#endif
2410+
2411+#define wchan_cond_sleep_intr(chan, sleep_cond) \
2412+do \
2413+{ \
2414+ DECLARE_WAITQUEUE(__wait_chan, current); \
2415+ current->state = TASK_INTERRUPTIBLE; \
2416+ add_wait_queue(&chan, &__wait_chan); \
2417+ if ((sleep_cond)) \
2418+ schedule(); \
2419+ remove_wait_queue(&chan, &__wait_chan); \
2420+ current->state = TASK_RUNNING; \
2421+} \
2422+while (0)
2423+
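A hypothetical usage sketch of the macro above (the field names come from struct gd_ls further down in this header): sleep interruptibly on ls_wait_general while the run bit is still clear:

	/* Hypothetical usage, not from the patch: block until someone
	 * wakes ls_wait_general (or a signal arrives), but only if
	 * LSFL_LS_RUN has not been set yet. */
	wchan_cond_sleep_intr(ls->ls_wait_general,
			      !test_bit(LSFL_LS_RUN, &ls->ls_flags));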
2424+static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2425+{
2426+ return time_after(jiffies, stamp + seconds * HZ);
2427+}
2428+
2429+
2430+#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2431+
2432+#define log_all(ls, fmt, args...) \
2433+ do { \
2434+ printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2435+ dlm_debug_log(ls, fmt, ##args); \
2436+ } while (0)
2437+
2438+#define log_error log_all
2439+
2440+
2441+#define DLM_DEBUG
2442+#if defined(DLM_DEBUG)
2443+#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2444+#else
2445+#define log_debug(ls, fmt, args...)
2446+#endif
2447+
2448+#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2449+#undef log_debug
2450+#define log_debug log_all
2451+#endif
2452+
2453+
2454+#define GDLM_ASSERT(x, do) \
2455+{ \
2456+ if (!(x)) \
2457+ { \
2458+ dlm_debug_dump(); \
2459+ printk("\nDLM: Assertion failed on line %d of file %s\n" \
2460+ "DLM: assertion: \"%s\"\n" \
2461+ "DLM: time = %lu\n", \
2462+ __LINE__, __FILE__, #x, jiffies); \
2463+ {do} \
2464+ printk("\n"); \
2465+ BUG(); \
2466+ panic("DLM: Record message above and reboot.\n"); \
2467+ } \
2468+}
2469+
2470+
2471+struct gd_ls;
2472+struct gd_lkb;
2473+struct gd_res;
2474+struct gd_csb;
2475+struct gd_node;
2476+struct gd_resmov;
2477+struct gd_resdata;
2478+struct gd_recover;
2479+struct gd_recinfo;
2480+struct gd_resdir_bucket;
2481+struct gd_remlockreply;
2482+struct gd_remlockrequest;
2483+struct gd_rcom;
2484+
2485+typedef struct gd_ls gd_ls_t;
2486+typedef struct gd_lkb gd_lkb_t;
2487+typedef struct gd_res gd_res_t;
2488+typedef struct gd_csb gd_csb_t;
2489+typedef struct gd_node gd_node_t;
2490+typedef struct gd_resmov gd_resmov_t;
2491+typedef struct gd_resdata gd_resdata_t;
2492+typedef struct gd_recover gd_recover_t;
2493+typedef struct gd_resdir_bucket gd_resdir_bucket_t;
2494+typedef struct gd_rcom gd_rcom_t;
2495+
2496+/*
2497+ * Resource Data - an entry for a resource in the resdir hash table
2498+ */
2499+
2500+struct gd_resdata {
2501+ struct list_head rd_list;
2502+ uint32_t rd_master_nodeid;
2503+ uint16_t rd_length;
2504+ uint8_t rd_sequence;
2505+ char rd_name[1]; /* <rd_length> bytes */
2506+};
2507+
2508+/*
2509+ * Resource Directory Bucket - a hash bucket of resdata entries in the resdir
2510+ * hash table
2511+ */
2512+
2513+struct gd_resdir_bucket {
2514+ struct list_head rb_reslist;
2515+ rwlock_t rb_lock;
2516+};
2517+
2518+/*
2519+ * A resource description as moved between nodes
2520+ */
2521+
2522+struct gd_resmov {
2523+ uint32_t rm_nodeid;
2524+ uint16_t rm_length;
2525+ uint16_t rm_pad;
2526+};
2527+
2528+/*
 2529+ * An entry in the lock ID table. Locks for this bucket are kept on the list.
2530+ * Counter is used to assign an id to locks as they are added to this bucket.
2531+ */
2532+
2533+struct gd_lockidtbl_entry {
2534+ struct list_head list;
2535+ uint16_t counter;
2536+};
2537+
2538+/* Elements in the range array */
2539+
2540+#define GR_RANGE_START 0
2541+#define GR_RANGE_END 1
2542+#define RQ_RANGE_START 2
2543+#define RQ_RANGE_END 3
2544+
2545+/*
2546+ * Lockspace structure. The context for GDLM locks.
2547+ */
2548+
2549+#define RESHASHTBL_SIZE (256)
2550+
2551+#define RESDIRHASH_SHIFT (9)
2552+#define RESDIRHASH_SIZE (1 << RESDIRHASH_SHIFT)
2553+#define RESDIRHASH_MASK (RESDIRHASH_SIZE - 1)
2554+
2555+#define LSFL_WORK (0)
2556+#define LSFL_LS_RUN (1)
2557+#define LSFL_LS_STOP (2)
2558+#define LSFL_LS_START (3)
2559+#define LSFL_LS_FINISH (4)
2560+#define LSFL_RECCOMM_WAIT (5)
2561+#define LSFL_RECCOMM_READY (6)
2562+#define LSFL_NOTIMERS (7)
2563+#define LSFL_FINISH_RECOVERY (8)
2564+#define LSFL_RESDIR_VALID (9)
2565+#define LSFL_ALL_RESDIR_VALID (10)
2566+#define LSFL_NODES_VALID (11)
2567+#define LSFL_ALL_NODES_VALID (12)
2568+#define LSFL_REQUEST_WARN (13)
2569+
2570+#define LSST_NONE (0)
2571+#define LSST_INIT (1)
2572+#define LSST_INIT_DONE (2)
2573+#define LSST_CLEAR (3)
2574+#define LSST_WAIT_START (4)
2575+#define LSST_RECONFIG_DONE (5)
2576+
2577+struct gd_ls {
2578+ struct list_head ls_list; /* list of lockspaces */
2579+ uint32_t ls_local_id; /* local unique lockspace ID */
2580+ uint32_t ls_global_id; /* global unique lockspace ID */
2581+ int ls_allocation; /* Memory allocation policy */
2582+ unsigned long ls_flags; /* LSFL_ */
2583+
2584+ struct list_head ls_rootres; /* List of root resources */
2585+
2586+ int ls_hashsize;
2587+ int ls_hashmask;
2588+ struct list_head *ls_reshashtbl; /* Hash table for resources */
2589+ rwlock_t ls_reshash_lock; /* Lock for hash table */
2590+
2591+ struct gd_lockidtbl_entry *ls_lockidtbl;
2592+ uint32_t ls_lockidtbl_size; /* Size of lock id table */
2593+ rwlock_t ls_lockidtbl_lock;
2594+
2595+ struct list_head ls_nodes; /* current nodes in RC */
2596+ uint32_t ls_num_nodes; /* number of nodes in RC */
2597+ uint32_t ls_nodes_mask;
2598+ uint32_t ls_low_nodeid;
2599+
2600+ int ls_state; /* state changes for recovery */
2601+ struct list_head ls_recover; /* gr_recover_t structs */
2602+ int ls_last_stop; /* event ids from sm */
2603+ int ls_last_start;
2604+ int ls_last_finish;
2605+ spinlock_t ls_recover_lock;
2606+ struct list_head ls_nodes_gone; /* dead node list for recovery */
2607+
2608+ wait_queue_head_t ls_wait_general;
2609+
2610+ gd_rcom_t *ls_rcom;
2611+ uint32_t ls_rcom_msgid;
2612+ struct semaphore ls_rcom_lock;
2613+
2614+ struct list_head ls_recover_list;
2615+ int ls_recover_list_count;
2616+ spinlock_t ls_recover_list_lock;
2617+
2618+ struct rw_semaphore ls_in_recovery; /* held in write during
2619+ * recovery, read for normal
2620+ * locking ops */
2621+ struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2622+ * parent lock racing with a
2623+ * new child lock */
2624+
2625+ struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
2626+ * operations happening while
2627+ * we are purging */
2628+
2629+ struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
2630+ * in grant_after_purge() which
2631+ * runs outside recovery */
2632+
2633+ struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2634+ * we are deserialising
2635+ */
2636+
2637+ struct list_head ls_deadlockq; /* List of locks in conversion ordered
 2638+ * by duetime, for deadlock detection */
2639+
2640+ struct list_head ls_requestqueue; /* List of incoming requests
2641+ * held while we are in
2642+ * recovery */
2643+
2644+ gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE];
2645+
2646+ int ls_namelen;
2647+ char ls_name[1]; /* <namelen> bytes */
2648+};
2649+
2650+/*
2651+ * Cluster node (per node in cluster)
2652+ */
2653+
2654+struct gd_node {
2655+ struct list_head gn_list; /* global list of cluster nodes */
2656+ uint32_t gn_nodeid; /* cluster unique nodeid (cman) */
2657+ uint32_t gn_ipaddr; /* node's first IP address (cman) */
2658+ int gn_refcount; /* number of csb's referencing */
2659+};
2660+
2661+/*
2662+ * Cluster System Block (per node in a ls)
2663+ */
2664+
2665+struct gd_csb {
2666+ struct list_head csb_list; /* per-lockspace list of nodes */
2667+ gd_node_t *csb_node; /* global node structure */
2668+ int csb_gone_event; /* event id when node was removed */
2669+
2670+ uint32_t csb_names_send_count;
2671+ uint32_t csb_names_send_msgid;
2672+ uint32_t csb_names_recv_count;
2673+ uint32_t csb_names_recv_msgid;
2674+ uint32_t csb_locks_send_count;
2675+ uint32_t csb_locks_send_msgid;
2676+ uint32_t csb_locks_recv_count;
2677+ uint32_t csb_locks_recv_msgid;
2678+};
2679+
2680+/*
2681+ * Resource block
2682+ */
2683+
2684+/* status */
2685+
2686+#define GDLM_RESSTS_DIRENTRY 1 /* This is a directory entry */
2687+#define GDLM_RESSTS_LVBINVALID 2 /* The LVB is invalid */
2688+
2689+#define RESFL_NEW_MASTER (0)
2690+#define RESFL_RECOVER_LIST (1)
2691+
2692+struct gd_res {
2693+ struct list_head res_hashchain; /* Chain of resources in this hash
2694+ * bucket */
2695+
2696+ gd_ls_t *res_ls; /* The owning lockspace */
2697+
2698+ struct list_head res_rootlist; /* List of root resources in lockspace */
2699+
2700+ struct list_head res_subreslist; /* List of all sub-resources
2701+ * for this root res. */
2702+ /* This is a list head on the root res and holds the whole tree below
2703+ * it. */
2704+ uint8_t res_depth; /* Depth in resource tree */
2705+ uint16_t res_status;
2706+ unsigned long res_flags; /* Flags, RESFL_ */
2707+
2708+ struct list_head res_grantqueue;
2709+ struct list_head res_convertqueue;
2710+ struct list_head res_waitqueue;
2711+
2712+ uint32_t res_nodeid; /* nodeid of master node */
2713+
2714+ gd_res_t *res_root; /* If a subresource, this is our root */
2715+ gd_res_t *res_parent; /* Our parent resource (if any) */
2716+
2717+ atomic_t res_ref; /* No of lkb's */
2718+ uint16_t res_remasterid; /* ID used during remaster */
2719+ struct list_head res_recover_list; /* General list for use during
2720+ * recovery */
2721+ int res_recover_msgid;
2722+ int res_newlkid_expect;
2723+
2724+ struct rw_semaphore res_lock;
2725+
2726+ char *res_lvbptr; /* Lock value block */
2727+
2728+ uint8_t res_resdir_seq; /* Last directory sequence number */
2729+
2730+ uint8_t res_length;
2731+ char res_name[1]; /* <res_length> bytes */
2732+};
2733+
2734+/*
2735+ * Lock block. To avoid confusion, where flags mirror the
2736+ * public flags, they should have the same value.
2737+ */
2738+
2739+#define GDLM_LKSTS_NEW (0)
2740+#define GDLM_LKSTS_WAITING (1)
2741+#define GDLM_LKSTS_GRANTED (2)
2742+#define GDLM_LKSTS_CONVERT (3)
2743+
2744+#define GDLM_LKFLG_VALBLK (0x00000008)
2745+#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
5cdbd17b 2746+#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
2747+#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
4bf12011 2748+
2749+/* Internal flags */
5cdbd17b 2750+#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present
2751+ (remote protocol only) */
4bf12011 2752+#define GDLM_LKFLG_MSTCPY (0x00002000)
2753+#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
5cdbd17b 2754+#define GDLM_LKFLG_LQCONVERT (0x00008000)
4bf12011 2755+#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
2756+#define GDLM_LKFLG_DEMOTED (0x00020000)
2757+#define GDLM_LKFLG_RESENT (0x00040000)
2758+#define GDLM_LKFLG_NOREBUILD (0x00080000)
4bf12011 2759+
5cdbd17b 2760+#define AST_COMP (1)
2761+#define AST_BAST (2)
2762+#define AST_DEL (4)
4bf12011 2763+
5cdbd17b 2764+struct gd_lkb {
2765+ uint32_t lkb_flags;
2766+ uint16_t lkb_status; /* grant, wait, convert */
2767+ int8_t lkb_rqmode; /* requested lock mode */
2768+ int8_t lkb_grmode; /* granted lock mode */
2769+ uint32_t lkb_retstatus; /* status to return in lksb */
2770+ uint32_t lkb_id; /* our lock ID */
2771+ struct dlm_lksb * lkb_lksb; /* status block of caller */
2772+ struct list_head lkb_idtbl_list; /* lockidtbl */
2773+ struct list_head lkb_statequeue; /* rsb's g/c/w queue */
2774+ gd_res_t * lkb_resource;
2775+ struct list_head lkb_ownerqueue; /* list of locks owned by a
2776+ process */
2777+ gd_lkb_t * lkb_parent; /* parent lock if any */
2778+ atomic_t lkb_childcnt; /* number of children */
2779+
2780+ struct list_head lkb_lockqueue; /* queue of locks waiting
2781+ for remote reply */
2782+ int lkb_lockqueue_state; /* reason on lockqueue */
2783+ int lkb_lockqueue_flags; /* as passed into
2784+ lock/unlock */
2785+ unsigned long lkb_lockqueue_time; /* time lkb went on the
2786+ lockqueue */
2787+ unsigned long lkb_duetime; /* for deadlock detection */
2788+
2789+ uint32_t lkb_remid; /* id on remote partner */
2790+ uint32_t lkb_nodeid; /* id of remote partner */
2791+
2792+ void * lkb_astaddr;
2793+ void * lkb_bastaddr;
2794+ long lkb_astparam;
2795+ struct list_head lkb_astqueue; /* locks with asts to deliver */
2796+ uint16_t lkb_astflags; /* COMP, BAST, DEL */
2797+ uint8_t lkb_bastmode; /* requested mode */
2798+ uint8_t lkb_highbast; /* highest mode bast sent for */
4bf12011 2799+
2800+ struct gd_remlockrequest *lkb_request;
2801+
5cdbd17b 2802+ struct list_head lkb_deadlockq; /* ls_deadlockq list */
4bf12011 2803+
5cdbd17b 2804+ char * lkb_lvbptr; /* points to lksb lvb on local
2805+ lock, allocated lvb on
2806+ on remote lock */
2807+ uint64_t * lkb_range; /* Points to an array of 64 bit
2808+ numbers that represent the
2809+ requested and granted ranges
2810+ of the lock. NULL implies
2811+ 0-ffffffffffffffff */
4bf12011 2812+};
2813+
2814+/*
2815+ * Used to save and manage recovery state for a lockspace.
2816+ */
2817+
2818+struct gd_recover {
2819+ struct list_head gr_list;
2820+ uint32_t *gr_nodeids;
2821+ int gr_node_count;
2822+ int gr_event_id;
2823+};
2824+
2825+/*
2826+ * Header part of the mid-level comms system. All packets start with
2827+ * this header so we can identify them. The comms packet can
 2828+ * contain many of these structs but they are split into individual
 2829+ * work units before being passed to the lockqueue routines.
 2830+ * Below this are the structs that this is a header for.
2831+ */
2832+
2833+struct gd_req_header {
2834+ uint8_t rh_cmd; /* What we are */
2835+ uint8_t rh_flags; /* maybe just a pad */
2836+ uint16_t rh_length; /* Length of struct (so we can send several in
2837+ * one message) */
2838+ uint32_t rh_lkid; /* Lock ID tag: ie the local (requesting) lock
2839+ * ID */
2840+ uint32_t rh_lockspace; /* Lockspace ID */
2841+};
2842+
2843+/*
2844+ * This is the struct used in a remote lock/unlock/convert request
2845+ * The mid-level comms API should turn this into native byte order.
2846+ * Most "normal" lock operations will use these two structs for
2847+ * communications. Recovery operations use their own structs
2848+ * but still with the gd_req_header on the front.
2849+ */
2850+
2851+struct gd_remlockrequest {
2852+ struct gd_req_header rr_header;
2853+
2854+ uint32_t rr_remlkid; /* Remote lock ID */
2855+ uint32_t rr_remparid; /* Parent's remote lock ID or 0 */
2856+ uint32_t rr_flags; /* Flags from lock/convert request */
2857+ uint64_t rr_range_start;/* Yes, these are in the right place... */
2858+ uint64_t rr_range_end;
2859+ uint32_t rr_status; /* Status to return if this is an AST request */
2860+ uint8_t rr_rqmode; /* Requested lock mode */
2861+ uint8_t rr_asts; /* Whether the LKB has ASTs or not */
2862+ uint8_t rr_resdir_seq; /* Directory sequence number */
2863+ char rr_lvb[DLM_LVB_LEN]; /* Value block */
2864+ char rr_name[1]; /* As long as needs be. Only used for directory
2865+ * lookups. The length of this can be worked
2866+ * out from the packet length */
2867+};
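As an aside (not part of the patch), the name length implied by rh_length can be recovered as below, which is exactly what remote_stage2() in locking.c does; remreq_namelen() is an invented helper name:

/* Illustrative only: rr_name[1] contributes one byte to sizeof(*req),
 * hence the +1 when recovering the real name length. */
static inline int remreq_namelen(struct gd_remlockrequest *req)
{
	return req->rr_header.rh_length - sizeof(*req) + 1;
}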
2868+
2869+/*
2870+ * This is the struct returned by a remote lock/unlock/convert request
2871+ * The mid-level comms API should turn this into native byte order.
2872+ */
2873+
2874+struct gd_remlockreply {
2875+ struct gd_req_header rl_header;
2876+
2877+ uint32_t rl_lockstate; /* Whether request was queued/granted/waiting */
2878+ uint32_t rl_nodeid; /* nodeid of lock master */
2879+ uint32_t rl_status; /* Status to return to caller */
2880+ uint32_t rl_lkid; /* Remote lkid */
2881+ uint8_t rl_resdir_seq; /* Returned directory sequence number */
2882+ char rl_lvb[DLM_LVB_LEN]; /* LVB itself */
2883+};
2884+
2885+/*
2886+ * Recovery comms message
2887+ */
2888+
2889+struct gd_rcom {
2890+ struct gd_req_header rc_header; /* 32 byte aligned */
2891+ uint32_t rc_msgid;
2892+ uint16_t rc_datalen;
2893+ uint8_t rc_expanded;
2894+ uint8_t rc_subcmd; /* secondary command */
2895+ char rc_buf[1]; /* first byte of data goes here and extends
2896+ * beyond here for another datalen - 1 bytes.
2897+ * rh_length is set to sizeof(gd_rcom_t) +
2898+ * datalen - 1 */
2899+};
2900+
2901+
2902+/* A remote query: GDLM_REMCMD_QUERY */
2903+struct gd_remquery {
2904+ struct gd_req_header rq_header;
2905+
2906+ uint32_t rq_mstlkid; /* LockID on master node */
2907+ uint32_t rq_query; /* query from the user */
2908+ uint32_t rq_maxlocks; /* max number of locks we can cope with */
2909+};
2910+
2911+/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
2912+/* There may be subsequent blocks of
2913+ lock info in GDLM_REMCMD_QUERYCONT messages which just have
2914+ a normal header. The last of these will have rh_flags set to
2915+ GDLM_REMFLAG_ENDQUERY
2916+ */
2917+struct gd_remqueryreply {
2918+ struct gd_req_header rq_header;
2919+
2920+ uint32_t rq_numlocks; /* Number of locks in reply */
2921+ uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */
2922+ uint32_t rq_status;
2923+
2924+ /* Resource information */
2925+ uint32_t rq_grantcount; /* No. of nodes on grant queue */
2926+ uint32_t rq_convcount; /* No. of nodes on convert queue */
2927+ uint32_t rq_waitcount; /* No. of nodes on wait queue */
2928+ char rq_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
2929+};
2930+
2931+/*
2932+ * Lockqueue wait lock states
2933+ */
2934+
2935+#define GDLM_LQSTATE_WAIT_RSB 1
2936+#define GDLM_LQSTATE_WAIT_CONVERT 2
2937+#define GDLM_LQSTATE_WAIT_CONDGRANT 3
2938+#define GDLM_LQSTATE_WAIT_UNLOCK 4
2939+
2940+/* Commands sent across the comms link */
2941+#define GDLM_REMCMD_LOOKUP 1
2942+#define GDLM_REMCMD_LOCKREQUEST 2
2943+#define GDLM_REMCMD_UNLOCKREQUEST 3
2944+#define GDLM_REMCMD_CONVREQUEST 4
2945+#define GDLM_REMCMD_LOCKREPLY 5
2946+#define GDLM_REMCMD_LOCKGRANT 6
2947+#define GDLM_REMCMD_SENDBAST 7
2948+#define GDLM_REMCMD_SENDCAST 8
2949+#define GDLM_REMCMD_REM_RESDATA 9
2950+#define GDLM_REMCMD_RECOVERMESSAGE 20
2951+#define GDLM_REMCMD_RECOVERREPLY 21
2952+#define GDLM_REMCMD_QUERY 30
2953+#define GDLM_REMCMD_QUERYREPLY 31
2954+
2955+/* Set in rh_flags when this is the last block of
2956+ query information. Note this could also be the first
2957+ block */
2958+#define GDLM_REMFLAG_ENDQUERY 1
2959+
4bf12011 2960+#ifndef BUG_ON
2961+#define BUG_ON(x)
2962+#endif
2963+
2964+void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...);
2965+void dlm_debug_dump(void);
2966+
2967+#endif /* __DLM_INTERNAL_DOT_H__ */
2968diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
2969--- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 2970+++ linux-patched/cluster/dlm/lkb.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 2971@@ -0,0 +1,225 @@
2972+/******************************************************************************
2973+*******************************************************************************
2974+**
2975+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2976+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2977+**
2978+** This copyrighted material is made available to anyone wishing to use,
2979+** modify, copy, or redistribute it subject to the terms and conditions
2980+** of the GNU General Public License v.2.
2981+**
2982+*******************************************************************************
2983+******************************************************************************/
2984+
2985+/*
2986+ * lkb.c
2987+ *
2988+ * Allocate and free locks on the lock ID table.
2989+ *
2990+ * This is slightly naff but I don't really like the
2991+ * VMS lockidtbl stuff as it uses a realloced array
2992+ * to hold the locks in. I think this is slightly better
2993+ * in some ways.
2994+ *
2995+ * Any better suggestions gratefully received. Patrick
2996+ *
2997+ */
2998+
2999+#include "dlm_internal.h"
3000+#include "lockqueue.h"
3001+#include "lkb.h"
3002+#include "config.h"
3003+#include "rsb.h"
3004+#include "memory.h"
3005+#include "lockspace.h"
3006+#include "util.h"
3007+
3008+/*
3009+ * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3010+ */
3011+
3012+static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3013+{
3014+ uint16_t entry = lkid & 0xFFFF;
3015+ gd_lkb_t *lkb;
3016+
3017+ if (entry >= ls->ls_lockidtbl_size)
3018+ goto out;
3019+
3020+ list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){
3021+ if (lkb->lkb_id == lkid)
3022+ return lkb;
3023+ }
3024+
3025+ out:
3026+ return NULL;
3027+}
3028+
3029+/*
3030+ * Should be called at lockspace initialisation time.
3031+ */
3032+
3033+int init_lockidtbl(gd_ls_t *ls, int entries)
3034+{
3035+ int i;
3036+
3037+ /* Make sure it's a power of two */
3038+ GDLM_ASSERT(!(entries & (entries - 1)),);
3039+
3040+ ls->ls_lockidtbl_size = entries;
3041+ rwlock_init(&ls->ls_lockidtbl_lock);
3042+
3043+ ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry),
3044+ GFP_KERNEL);
3045+ if (!ls->ls_lockidtbl)
3046+ return -ENOMEM;
3047+
3048+ for (i = 0; i < entries; i++) {
3049+ INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list);
3050+ ls->ls_lockidtbl[i].counter = 1;
3051+ }
3052+
3053+ return 0;
3054+}
3055+
3056+/*
3057+ * Free up the space - returns an error if there are still locks hanging around
3058+ */
3059+
3060+int free_lockidtbl(gd_ls_t *ls)
3061+{
3062+ int i;
3063+
3064+ write_lock(&ls->ls_lockidtbl_lock);
3065+
3066+ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
3067+ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
3068+ write_unlock(&ls->ls_lockidtbl_lock);
3069+ return -1;
3070+ }
3071+ }
3072+ kfree(ls->ls_lockidtbl);
3073+
3074+ write_unlock(&ls->ls_lockidtbl_lock);
3075+
3076+ return 0;
3077+}
3078+
3079+/*
3080+ * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3081+ * random number between 0 and lockidtbl_size-1. This random number specifies
3082+ * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3083+ * assigned per-bucket id.
3084+ *
3085+ * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3086+ * against the lkid of all lkb's in the bucket to avoid duplication.
3087+ *
3088+ */
3089+
3090+gd_lkb_t *create_lkb(gd_ls_t *ls)
3091+{
3092+ gd_lkb_t *lkb;
3093+ uint32_t lkid;
3094+ uint16_t bucket;
3095+
3096+ lkb = allocate_lkb(ls);
3097+ if (!lkb)
3098+ goto out;
3099+
3100+ write_lock(&ls->ls_lockidtbl_lock);
3101+ do {
3102+ get_random_bytes(&bucket, sizeof(bucket));
3103+ bucket &= (ls->ls_lockidtbl_size - 1);
3104+ lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16);
3105+ }
3106+ while (__find_lock_by_id(ls, lkid));
3107+
3108+ lkb->lkb_id = (uint32_t) lkid;
3109+ list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list);
3110+ write_unlock(&ls->ls_lockidtbl_lock);
3111+
3112+ out:
3113+ return lkb;
3114+}
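A stand-alone sketch (not part of the patch) of how a lock id produced by create_lkb() above decomposes; the example value is arbitrary:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t lkid = 0x002a01f3;		/* arbitrary example id */
	uint16_t bucket = lkid & 0xFFFF;	/* 0x01f3: ls_lockidtbl bucket */
	uint16_t seq = lkid >> 16;		/* 0x002a: bucket counter at creation */

	printf("bucket=%u seq=%u\n", bucket, seq);
	return 0;
}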
3115+
3116+/*
3117+ * Free LKB and remove it from the lockidtbl.
3118+ * NB - this always frees the lkb whereas release_rsb doesn't free an
3119+ * rsb unless its reference count is zero.
3120+ */
3121+
3122+void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb)
3123+{
3124+ if (lkb->lkb_status) {
3125+ log_error(ls, "release lkb with status %u", lkb->lkb_status);
3126+ print_lkb(lkb);
3127+ return;
3128+ }
3129+
3130+ if (lkb->lkb_parent)
3131+ atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3132+
3133+ write_lock(&ls->ls_lockidtbl_lock);
3134+ list_del(&lkb->lkb_idtbl_list);
3135+ write_unlock(&ls->ls_lockidtbl_lock);
3136+
3137+ /* if this is not a master copy then lvbptr points into the user's
3138+ * lksb, so don't free it */
3139+ if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3140+ free_lvb(lkb->lkb_lvbptr);
3141+
3142+ if (lkb->lkb_range)
3143+ free_range(lkb->lkb_range);
3144+
3145+ free_lkb(lkb);
3146+}
3147+
3148+gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
3149+{
3150+ gd_lkb_t *lkb;
3151+
3152+ read_lock(&ls->ls_lockidtbl_lock);
3153+ lkb = __find_lock_by_id(ls, lkid);
3154+ read_unlock(&ls->ls_lockidtbl_lock);
3155+
3156+ return lkb;
3157+}
3158+
3159+gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid)
3160+{
3161+ gd_ls_t *lspace = find_lockspace_by_local_id(ls);
3162+ return find_lock_by_id(lspace, lkid);
3163+}
3164+
3165+/*
3166+ * Initialise the range parts of an LKB.
3167+ */
3168+
3169+int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end)
3170+{
3171+ int ret = -ENOMEM;
3172+
3173+ /*
3174+ * if this wasn't already a range lock, make it one
3175+ */
3176+ if (!lkb->lkb_range) {
3177+ lkb->lkb_range = allocate_range(lspace);
3178+ if (!lkb->lkb_range)
3179+ goto out;
3180+
3181+ /*
3182+ * This is needed for conversions that contain ranges where the
3183+ * original lock didn't but it's harmless for new locks too.
3184+ */
3185+ lkb->lkb_range[GR_RANGE_START] = 0LL;
3186+ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3187+ }
3188+
3189+ lkb->lkb_range[RQ_RANGE_START] = start;
3190+ lkb->lkb_range[RQ_RANGE_END] = end;
3191+
3192+ ret = 0;
3193+
3194+ out:
3195+ return ret;
3196+}
3197diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3198--- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 3199+++ linux-patched/cluster/dlm/lkb.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 3200@@ -0,0 +1,27 @@
3201+/******************************************************************************
3202+*******************************************************************************
3203+**
3204+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3205+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3206+**
3207+** This copyrighted material is made available to anyone wishing to use,
3208+** modify, copy, or redistribute it subject to the terms and conditions
3209+** of the GNU General Public License v.2.
3210+**
3211+*******************************************************************************
3212+******************************************************************************/
3213+
3214+#ifndef __LKB_DOT_H__
3215+#define __LKB_DOT_H__
3216+
3217+int free_lockidtbl(gd_ls_t * lspace);
3218+int init_lockidtbl(gd_ls_t * lspace, int entries);
3219+
3220+gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid);
3221+gd_lkb_t *create_lkb(gd_ls_t *ls);
3222+void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb);
3223+gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid);
3224+int verify_lkb_nodeids(gd_ls_t *ls);
3225+int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end);
3226+
3227+#endif /* __LKB_DOT_H__ */
3228diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3229--- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 3230+++ linux-patched/cluster/dlm/locking.c 2004-06-29 20:01:20.000000000 +0800
3231@@ -0,0 +1,1223 @@
4bf12011 3232+/******************************************************************************
3233+*******************************************************************************
3234+**
3235+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3236+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3237+**
3238+** This copyrighted material is made available to anyone wishing to use,
3239+** modify, copy, or redistribute it subject to the terms and conditions
3240+** of the GNU General Public License v.2.
3241+**
3242+*******************************************************************************
3243+******************************************************************************/
3244+
3245+/*
3246+ * locking.c
3247+ *
3248+ * This is where the main work of the DLM goes on
3249+ *
3250+ */
3251+
3252+#include "dlm_internal.h"
3253+#include "lockqueue.h"
3254+#include "locking.h"
3255+#include "lockspace.h"
3256+#include "lkb.h"
3257+#include "nodes.h"
3258+#include "dir.h"
3259+#include "ast.h"
3260+#include "memory.h"
3261+#include "rsb.h"
3262+
3263+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3264+
3265+/*
 3266+ * Lock compatibility matrix - thanks Steve
3267+ * UN = Unlocked state. Not really a state, used as a flag
3268+ * PD = Padding. Used to make the matrix a nice power of two in size
3269+ * Other states are the same as the VMS DLM.
3270+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3271+ */
3272+
3273+#define modes_compat(gr, rq) \
3274+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3275+
3276+const int __dlm_compat_matrix[8][8] = {
3277+ /* UN NL CR CW PR PW EX PD */
3278+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3279+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3280+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3281+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3282+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3283+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3284+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3285+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3286+};
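For illustration (not part of the patch), two direct lookups in the matrix above, assuming the usual DLM mode numbering (IV = -1, NL = 0, CR = 1, CW = 2, PR = 3, PW = 4, EX = 5) that the +1 indexing implies:

/* Illustrative lookups, not part of the patch:
 * a granted PR lock blocks a requested CW lock,
 * a granted CR lock is compatible with a requested PW lock. */
int pr_blocks_cw = !__dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_CW + 1];	/* 1 */
int cr_allows_pw =  __dlm_compat_matrix[DLM_LOCK_CR + 1][DLM_LOCK_PW + 1];	/* 1 */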
3287+
3288+/*
3289+ * Compatibility matrix for conversions with QUECVT set.
3290+ * Granted mode is the row; requested mode is the column.
3291+ * Usage: matrix[grmode+1][rqmode+1]
3292+ */
3293+
3294+const int __quecvt_compat_matrix[8][8] = {
3295+ /* UN NL CR CW PR PW EX PD */
3296+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3297+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3298+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3299+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3300+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3301+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3302+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3303+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3304+};
3305+
3306+/*
3307+ * This defines the direction of transfer of LVB data.
3308+ * Granted mode is the row; requested mode is the column.
3309+ * Usage: matrix[grmode+1][rqmode+1]
3310+ * 1 = LVB is returned to the caller
3311+ * 0 = LVB is written to the resource
3312+ * -1 = nothing happens to the LVB
3313+ */
3314+
3315+const int __lvb_operations[8][8] = {
3316+ /* UN NL CR CW PR PW EX PD*/
3317+ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3318+ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3319+ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3320+ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3321+ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3322+ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3323+ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3324+ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3325+};
3326+
3327+static void grant_lock(gd_lkb_t * lkb, int send_remote);
3328+static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb);
3329+static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb);
3330+static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb,
3331+ int flags, void *ast, void *astarg, void *bast,
3332+ struct dlm_range *range);
3333+static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags,
3334+ char *name, int namelen);
3335+
3336+
3337+static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head)
3338+{
3339+ gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue);
3340+
3341+ if (lkb->lkb_id == first->lkb_id)
3342+ return 1;
3343+
3344+ return 0;
3345+}
3346+
3347+/*
3348+ * Return 1 if the locks' ranges overlap
3349+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3350+ */
3351+
3352+static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
3353+{
3354+ if (!lkb1->lkb_range || !lkb2->lkb_range)
3355+ return 1;
3356+
3357+ if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3358+ lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3359+ return 0;
3360+
3361+ return 1;
3362+}
3363+
3364+/*
3365+ * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
3366+ * locks on the convert queue. One of the deadlocked locks is allowed to
3367+ * retain its original granted state (we choose the lkb provided although it
3368+ * shouldn't matter which.) We do not change the granted mode on locks without
3369+ * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
3370+ * the flag consistently) the false return value is used.
3371+ */
3372+
3373+static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
3374+{
3375+ gd_lkb_t *this;
3376+ int rv = TRUE;
3377+
3378+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3379+ if (this == lkb)
3380+ continue;
3381+
3382+ if (!ranges_overlap(lkb, this))
3383+ continue;
3384+
3385+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
3386+
3387+ if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
3388+ rv = FALSE;
3389+ continue;
3390+ }
3391+ this->lkb_grmode = DLM_LOCK_NL;
3392+ this->lkb_flags |= GDLM_LKFLG_DEMOTED;
3393+ }
3394+ }
3395+ return rv;
3396+}
3397+
3398+/*
3399+ * "A conversion deadlock arises with a pair of lock requests in the converting
3400+ * queue for one resource. The granted mode of each lock blocks the requested
3401+ * mode of the other lock."
3402+ */
3403+
3404+static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3405+{
3406+ gd_lkb_t *this;
3407+
3408+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3409+ if (this == lkb)
3410+ continue;
3411+
3412+ if (!ranges_overlap(lkb, this))
3413+ continue;
3414+
3415+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3416+ return TRUE;
3417+ }
3418+ return FALSE;
3419+}
3420+
3421+/*
3422+ * Check if the given lkb conflicts with another lkb on the queue.
3423+ */
3424+
3425+static int queue_conflict(struct list_head *head, gd_lkb_t *lkb)
3426+{
3427+ gd_lkb_t *this;
3428+
3429+ list_for_each_entry(this, head, lkb_statequeue) {
3430+ if (this == lkb)
3431+ continue;
3432+ if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3433+ return TRUE;
3434+ }
3435+ return FALSE;
3436+}
3437+
3438+/*
3439+ * Deadlock can arise when using the QUECVT flag if the requested mode of the
3440+ * first converting lock is incompatible with the granted mode of another
3441+ * converting lock further down the queue. To prevent this deadlock, a
3442+ * requested QUEUECVT lock is granted immediately if adding it to the end of
3443+ * the queue would prevent a lock ahead of it from being granted.
3444+ */
3445+
3446+static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
3447+{
3448+ gd_lkb_t *this;
3449+
3450+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3451+ if (this == lkb)
3452+ break;
3453+
3454+ if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
3455+ return TRUE;
3456+ }
3457+ return FALSE;
3458+}
3459+
3460+/*
3461+ * Return 1 if the lock can be granted, 0 otherwise.
3462+ * Also detect and resolve conversion deadlocks.
3463+ */
3464+
3465+static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
3466+{
3467+ if (lkb->lkb_rqmode == DLM_LOCK_NL)
3468+ return TRUE;
3469+
3470+ if (lkb->lkb_rqmode == lkb->lkb_grmode)
3471+ return TRUE;
3472+
3473+ if (queue_conflict(&rsb->res_grantqueue, lkb))
3474+ return FALSE;
3475+
3476+ if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
3477+ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3478+ return TRUE;
3479+
3480+ if (list_empty(&rsb->res_convertqueue) ||
3481+ first_in_list(lkb, &rsb->res_convertqueue) ||
3482+ queuecvt_deadlock_detect(rsb, lkb))
3483+ return TRUE;
3484+ else
3485+ return FALSE;
3486+ }
3487+
3488+ /* there *is* a conflict between this lkb and a converting lock so
3489+ we return false unless conversion deadlock resolution is permitted
3490+ (only conversion requests will have the CONVDEADLK flag set) */
3491+
3492+ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
3493+ return FALSE;
3494+
3495+ if (!conversion_deadlock_detect(rsb, lkb))
3496+ return FALSE;
3497+
3498+ if (conversion_deadlock_resolve(rsb, lkb))
3499+ return TRUE;
3500+
3501+ return FALSE;
3502+}
3503+
3504+int dlm_lock(void *lockspace,
3505+ uint32_t mode,
3506+ struct dlm_lksb *lksb,
3507+ uint32_t flags,
3508+ void *name,
3509+ unsigned int namelen,
3510+ uint32_t parent,
3511+ void (*ast) (void *astarg),
3512+ void *astarg,
3513+ void (*bast) (void *astarg, int mode),
3514+ struct dlm_range *range)
3515+{
3516+ gd_ls_t *lspace;
3517+ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3518+ int ret = -EINVAL;
3519+
3520+ lspace = find_lockspace_by_local_id(lockspace);
3521+ if (!lspace)
3522+ goto out;
3523+
3524+ if (mode < 0 || mode > DLM_LOCK_EX)
3525+ goto out;
3526+
3527+ if (namelen > DLM_RESNAME_MAXLEN)
3528+ goto out;
3529+
3530+ if (flags & DLM_LKF_CANCEL)
3531+ goto out;
3532+
3533+ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3534+ goto out;
3535+
3536+ if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
3537+ goto out;
3538+
3539+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3540+ goto out;
3541+
3542+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3543+ goto out;
3544+
3545+ if (!ast || !lksb)
3546+ goto out;
3547+
3548+ if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK))
3549+ goto out;
3550+
3551+ if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3552+ goto out;
3553+
3554+ /*
3555+ * Take conversion path.
3556+ */
3557+
3558+ if (flags & DLM_LKF_CONVERT) {
3559+ ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3560+ bast, range);
3561+ goto out;
3562+ }
3563+
3564+ /*
3565+ * Take new lock path.
3566+ */
3567+
3568+ if (parent) {
3569+ down_read(&lspace->ls_unlock_sem);
3570+
3571+ parent_lkb = find_lock_by_id(lspace, parent);
3572+
3573+ if (!parent_lkb ||
3574+ parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3575+ parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3576+ parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3577+ up_read(&lspace->ls_unlock_sem);
3578+ goto out;
3579+ }
3580+
3581+ atomic_inc(&parent_lkb->lkb_childcnt);
3582+ up_read(&lspace->ls_unlock_sem);
3583+ }
3584+
3585+ down_read(&lspace->ls_in_recovery);
3586+
3587+ ret = -ENOMEM;
3588+
3589+ lkb = create_lkb(lspace);
3590+ if (!lkb)
3591+ goto fail_dec;
3592+ lkb->lkb_astaddr = ast;
3593+ lkb->lkb_astparam = (long) astarg;
3594+ lkb->lkb_bastaddr = bast;
3595+ lkb->lkb_rqmode = mode;
3596+ lkb->lkb_grmode = DLM_LOCK_IV;
3597+ lkb->lkb_lksb = lksb;
3598+ lkb->lkb_parent = parent_lkb;
3599+ lkb->lkb_lockqueue_flags = flags;
3600+ lkb->lkb_lvbptr = lksb->sb_lvbptr;
3601+
3602+ /* Copy the range if appropriate */
3603+ if (range) {
3604+ if (range->ra_start > range->ra_end) {
3605+ ret = -EINVAL;
3606+ goto fail_free;
3607+ }
3608+
3609+ if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3610+ goto fail_free;
3611+ }
3612+
3613+ /* Convert relevant flags to internal numbers */
3614+ if (flags & DLM_LKF_VALBLK)
3615+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3616+ if (flags & DLM_LKF_PERSISTENT)
3617+ lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3618+ if (flags & DLM_LKF_NODLCKWT)
3619+ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3620+
3621+ lksb->sb_lkid = lkb->lkb_id;
3622+
3623+ ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3624+ if (ret)
3625+ goto fail_free;
3626+
3627+ up_read(&lspace->ls_in_recovery);
3628+
3629+ wake_astd();
3630+
3631+ return 0;
3632+
3633+ fail_free:
3634+ release_lkb(lspace, lkb);
3635+ goto fail_unlock;
3636+
3637+ fail_dec:
3638+ if (parent_lkb)
3639+ atomic_dec(&parent_lkb->lkb_childcnt);
3640+
3641+ fail_unlock:
3642+ up_read(&lspace->ls_in_recovery);
3643+
3644+ out:
3645+ return ret;
3646+}
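A hypothetical in-kernel caller sketch of the dlm_lock() entry point above (my_ast/take_ex_lock and the resource name are invented, error handling omitted): request an EX lock on the five-byte name "myres" with no special flags and pick up the result in the lksb from the completion ast:

/* Hypothetical caller, not from the patch. */
static struct dlm_lksb my_lksb;

static void my_ast(void *astarg)
{
	/* my_lksb.sb_status holds the outcome, my_lksb.sb_lkid the lock id */
}

static int take_ex_lock(void *lockspace)
{
	return dlm_lock(lockspace, DLM_LOCK_EX, &my_lksb, 0,
			"myres", 5, 0, my_ast, NULL, NULL, NULL);
}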
3647+
3648+int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name,
3649+ int namelen)
3650+{
3651+ gd_res_t *rsb, *parent_rsb = NULL;
3652+ gd_lkb_t *parent_lkb = lkb->lkb_parent;
3653+ gd_resdata_t *rd;
3654+ uint32_t nodeid;
3655+ int error;
3656+
3657+ if (parent_lkb)
3658+ parent_rsb = parent_lkb->lkb_resource;
3659+
3660+ error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
3661+ if (error)
3662+ goto out;
3663+
3664+ lkb->lkb_resource = rsb;
3665+ lkb->lkb_nodeid = rsb->res_nodeid;
3666+
3667+ /*
3668+ * Next stage, do we need to find the master or can
3669+ * we get on with the real locking work ?
3670+ */
3671+
3672+ if (rsb->res_nodeid == -1) {
3673+ if (get_directory_nodeid(rsb) != our_nodeid()) {
3674+ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3675+ goto out;
3676+ }
3677+
3678+ error = get_resdata(ls, our_nodeid(), rsb->res_name,
3679+ rsb->res_length, &rd, 0);
3680+ if (error)
3681+ goto out;
3682+
3683+ nodeid = rd->rd_master_nodeid;
3684+ if (nodeid == our_nodeid())
3685+ nodeid = 0;
3686+ rsb->res_nodeid = nodeid;
3687+ lkb->lkb_nodeid = nodeid;
3688+ rsb->res_resdir_seq = rd->rd_sequence;
3689+ }
3690+
3691+ error = dlm_lock_stage2(ls, lkb, rsb, flags);
3692+
3693+ out:
3694+ if (error)
3695+ release_rsb(rsb);
3696+
3697+ return error;
3698+}
3699+
3700+/*
3701+ * Locking routine called after we have an RSB, either a copy of a remote one
3702+ * or a local one, or perhaps a shiny new one all of our very own
3703+ */
3704+
3705+int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags)
3706+{
3707+ int error = 0;
3708+
3709+ if (rsb->res_nodeid) {
3710+ res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3711+ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
3712+ } else {
3713+ dlm_lock_stage3(lkb);
3714+ }
3715+
3716+ return error;
3717+}
3718+
3719+/*
3720+ * Called on an RSB's master node to do stage2 locking for a remote lock
3721+ * request. Returns a proper lkb with rsb ready for lock processing.
 3722+ * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
3723+ */
3724+
3725+gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
3726+ struct gd_remlockrequest *freq)
3727+{
3728+ gd_res_t *rsb = NULL, *parent_rsb = NULL;
3729+ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
3730+ int error, namelen;
3731+
3732+ if (freq->rr_remparid) {
3733+ parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
3734+ if (!parent_lkb)
3735+ goto fail;
3736+
3737+ atomic_inc(&parent_lkb->lkb_childcnt);
3738+ parent_rsb = parent_lkb->lkb_resource;
3739+ }
3740+
3741+ /*
3742+ * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
3743+ * node actually holding the (non-MSTCPY) lkb. AST address are just
3744+ * flags in the master copy.
3745+ */
3746+
3747+ lkb = create_lkb(ls);
3748+ if (!lkb)
3749+ goto fail_dec;
3750+ lkb->lkb_grmode = DLM_LOCK_IV;
3751+ lkb->lkb_rqmode = freq->rr_rqmode;
3752+ lkb->lkb_parent = parent_lkb;
3753+ lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
3754+ lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
4bf12011 3755+ lkb->lkb_nodeid = remote_nodeid;
3756+ lkb->lkb_remid = freq->rr_header.rh_lkid;
3757+ lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
3758+ lkb->lkb_lockqueue_flags = freq->rr_flags;
3759+
3760+ if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
3761+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3762+ allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
3763+ if (!lkb->lkb_lvbptr)
3764+ goto fail_free;
3765+ }
3766+
3767+ if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
3768+ error = lkb_set_range(ls, lkb, freq->rr_range_start,
3769+ freq->rr_range_end);
3770+ if (error)
3771+ goto fail_free;
3772+ }
3773+
3774+ /*
3775+ * Get the RSB which this lock is for. Create a new RSB if this is a
3776+ * new lock on a new resource. We must be the master of any new rsb.
3777+ */
3778+
3779+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
3780+
3781+ error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
3782+ &rsb);
3783+ if (error)
3784+ goto fail_free;
3785+
3786+ lkb->lkb_resource = rsb;
3787+ if (rsb->res_nodeid == -1)
3788+ rsb->res_nodeid = 0;
3789+ if (freq->rr_resdir_seq)
3790+ rsb->res_resdir_seq = freq->rr_resdir_seq;
3791+
3792+ return lkb;
3793+
3794+
3795+ fail_free:
3796+ /* release_lkb handles parent */
3797+ release_lkb(ls, lkb);
3798+ parent_lkb = NULL;
3799+
3800+ fail_dec:
3801+ if (parent_lkb)
3802+ atomic_dec(&parent_lkb->lkb_childcnt);
3803+ fail:
3804+ return NULL;
3805+}
3806+
3807+/*
3808+ * The final bit of lock request processing on the master node. Here the lock
3809+ * is granted and the completion ast is queued, or the lock is put on the
3810+ * waitqueue and blocking asts are sent.
3811+ */
3812+
3813+void dlm_lock_stage3(gd_lkb_t *lkb)
3814+{
3815+ gd_res_t *rsb = lkb->lkb_resource;
3816+
3817+ /*
3818+ * This is a locally mastered lock on a resource that already exists,
3819+ * see if it can be granted or if it must wait. When this function is
3820+ * called for a remote lock request (process_cluster_request,
3821+ * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
3822+ * requesting node at the end of process_cluster_request, not at the
3823+ * end of grant_lock.
3824+ */
3825+
3826+ down_write(&rsb->res_lock);
3827+
3828+ if (can_be_granted(rsb, lkb)) {
3829+ grant_lock(lkb, 0);
3830+ goto out;
3831+ }
3832+
3833+ /*
3834+ * This request is not a conversion, so the lkb didn't exist other than
3835+ * for this request and should be freed after EAGAIN is returned in the
3836+ * ast.
3837+ */
3838+
3839+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4bf12011 3840+ lkb->lkb_retstatus = -EAGAIN;
4bf12011 3841+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
3842+ send_blocking_asts_all(rsb, lkb);
5cdbd17b 3843+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4bf12011 3844+ goto out;
3845+ }
3846+
3847+ /*
3848+ * The requested lkb must wait. Because the rsb of the requested lkb
3849+ * is mastered here, send blocking asts for the lkb's blocking the
3850+ * request.
3851+ */
3852+
3853+ lkb->lkb_retstatus = 0;
3854+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3855+
3856+ send_blocking_asts(rsb, lkb);
3857+
3858+ out:
3859+ up_write(&rsb->res_lock);
3860+}
3861+
3862+int dlm_unlock(void *lockspace,
3863+ uint32_t lkid,
3864+ uint32_t flags,
3865+ struct dlm_lksb *lksb,
3866+ void *astarg)
3867+{
3868+ gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
3869+ gd_lkb_t *lkb;
3870+ gd_res_t *rsb;
3871+ int ret = -EINVAL;
3872+
3873+ if (!ls)
3874+ goto out;
3875+
3876+ lkb = find_lock_by_id(ls, lkid);
3877+ if (!lkb)
3878+ goto out;
3879+
3880+ /* Can't dequeue a master copy (a remote node's mastered lock) */
3881+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3882+ goto out;
3883+
3884+ /* Already waiting for a remote lock operation */
3885+ if (lkb->lkb_lockqueue_state) {
3886+ ret = -EBUSY;
3887+ goto out;
3888+ }
3889+
3890+ /* Can only cancel WAITING or CONVERTing locks.
3891+ * This is just a quick check - it is also checked in unlock_stage2()
3892+ * (which may be on the master) under the semaphore.
3893+ */
3894+ if ((flags & DLM_LKF_CANCEL) &&
3895+ (lkb->lkb_status == GDLM_LKSTS_GRANTED))
3896+ goto out;
3897+
3898+ /* "Normal" unlocks must operate on a granted lock */
3899+ if (!(flags & DLM_LKF_CANCEL) &&
3900+ (lkb->lkb_status != GDLM_LKSTS_GRANTED))
3901+ goto out;
3902+
3903+ down_write(&ls->ls_unlock_sem);
3904+
3905+ /* Can't dequeue a lock with sublocks */
3906+ if (atomic_read(&lkb->lkb_childcnt)) {
3907+ up_write(&ls->ls_unlock_sem);
3908+ ret = -ENOTEMPTY;
3909+ goto out;
3910+ }
3911+
3912+ /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
3913+ if (!(flags & DLM_LKF_CANCEL))
3914+ lkb->lkb_flags |= GDLM_LKFLG_DELETED;
3915+ up_write(&ls->ls_unlock_sem);
3916+
3917+ /* Save any new params */
3918+ if (lksb)
3919+ lkb->lkb_lksb = lksb;
3920+ if (astarg)
3921+ lkb->lkb_astparam = (long) astarg;
3922+
3923+ lkb->lkb_lockqueue_flags = flags;
3924+
3925+ rsb = lkb->lkb_resource;
3926+
3927+ down_read(&ls->ls_in_recovery);
3928+
3929+ if (rsb->res_nodeid)
3930+ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
3931+ else
3932+ ret = dlm_unlock_stage2(lkb, flags);
3933+
3934+ up_read(&ls->ls_in_recovery);
3935+
3936+ wake_astd();
3937+
3938+ out:
3939+ return ret;
3940+}
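As a usage illustration only (not part of the patch): a kernel caller drives the two dlm_unlock() paths above roughly as below. The lockspace handle and the dlm_lksb filled in by an earlier dlm_lock() call are assumed, the helper names are invented, and the completion-callback details are not taken from this hunk.

static int example_drop_lock(void *ls, struct dlm_lksb *lksb, void *astarg)
{
	/*
	 * Normal unlock of a granted lock; sb_lkid was filled in by the
	 * earlier dlm_lock().  The completion AST later reports -DLM_EUNLOCK,
	 * as set in dlm_unlock_stage2() below.
	 */
	return dlm_unlock(ls, lksb->sb_lkid, 0, lksb, astarg);
}

static int example_cancel_request(void *ls, struct dlm_lksb *lksb, void *astarg)
{
	/*
	 * Cancel a request still on the wait or convert queue; the completion
	 * AST reports -DLM_ECANCEL and a cancelled conversion is put back on
	 * the granted queue (see dlm_unlock_stage2() below).
	 */
	return dlm_unlock(ls, lksb->sb_lkid, DLM_LKF_CANCEL, lksb, astarg);
}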
3941+
3942+int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
3943+{
3944+ gd_res_t *rsb = lkb->lkb_resource;
3945+ int old_status;
3946+ int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
3947+
3948+ down_write(&rsb->res_lock);
3949+
3950+ /* Can only cancel WAITING or CONVERTing locks */
3951+ if ((flags & DLM_LKF_CANCEL) &&
3952+ (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
3953+ lkb->lkb_retstatus = -EINVAL;
5cdbd17b 3954+ queue_ast(lkb, AST_COMP, 0);
4bf12011 3955+ goto out;
3956+ }
3957+
3958+ old_status = lkb_dequeue(lkb);
3959+
3960+ /*
3961+	 * If it was granted, grant any converting or waiting locks.
3962+ */
3963+
3964+ if (old_status == GDLM_LKSTS_GRANTED)
3965+ grant_pending_locks(rsb);
3966+
3967+ /*
3968+ * Cancelling a conversion
3969+ */
3970+
3971+ if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
3972+ /* VMS semantics say we should send blocking ASTs again here */
3973+ send_blocking_asts(rsb, lkb);
3974+
3975+ /* Remove from deadlock detection */
3976+ if (lkb->lkb_duetime)
3977+ remove_from_deadlockqueue(lkb);
3978+
3979+ /* Stick it back on the granted queue */
3980+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
3981+ lkb->lkb_rqmode = lkb->lkb_grmode;
3982+
3983+ /* Was it blocking any other locks? */
3984+ if (first_in_list(lkb, &rsb->res_convertqueue))
3985+ grant_pending_locks(rsb);
3986+
3987+ lkb->lkb_retstatus = -DLM_ECANCEL;
5cdbd17b 3988+ queue_ast(lkb, AST_COMP, 0);
4bf12011 3989+ goto out;
3990+ }
3991+
3992+ /*
3993+ * The lvb can be saved or cleared on unlock.
3994+ */
3995+
3996+ if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
3997+ if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
3998+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
3999+ if (flags & DLM_LKF_IVVALBLK)
4000+ memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4001+ }
4002+
4003+ lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
4004+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4bf12011 4005+
4006+ /*
4007+ * Only free the LKB if we are the master copy. Otherwise the AST
4008+ * delivery routine will free it after delivery. queue_ast for MSTCPY
4009+ * lkb just sends a message.
4010+ */
4011+
4012+ if (remote) {
4013+ up_write(&rsb->res_lock);
4014+ release_lkb(rsb->res_ls, lkb);
4015+ release_rsb(rsb);
4016+ goto out2;
4017+ }
4018+
4019+ out:
4020+ up_write(&rsb->res_lock);
4021+ out2:
4022+ wake_astd();
4023+ return 0;
4024+}
4025+
4026+/*
4027+ * Lock conversion
4028+ */
4029+
4030+static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb,
4031+ int flags, void *ast, void *astarg, void *bast,
4032+ struct dlm_range *range)
4033+{
4034+ gd_lkb_t *lkb;
4035+ gd_res_t *rsb;
4036+ int ret = -EINVAL;
4037+
4038+ lkb = find_lock_by_id(ls, lksb->sb_lkid);
4039+ if (!lkb) {
4040+ goto out;
4041+ }
4042+
4043+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4044+ ret = -EBUSY;
4045+ goto out;
4046+ }
4047+
4048+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4049+ goto out;
4050+ }
4051+
4052+ if ((flags & DLM_LKF_QUECVT) &&
4053+ !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4054+ goto out;
4055+ }
4056+
4057+ if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4058+ goto out;
4059+ }
4060+
4061+ if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
4062+ goto out;
4063+ }
4064+
4065+ /* Set up the ranges as appropriate */
4066+ if (range) {
4067+ if (range->ra_start > range->ra_end)
4068+ goto out;
4069+
4070+ if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4071+ ret = -ENOMEM;
4072+ goto out;
4073+ }
4074+ }
4075+
4076+ rsb = lkb->lkb_resource;
4077+ down_read(&rsb->res_ls->ls_in_recovery);
4078+
4079+ lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4080+ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4081+
4082+ if (flags & DLM_LKF_NODLCKWT)
4083+ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4084+ if (ast)
4085+ lkb->lkb_astaddr = ast;
4086+ if (astarg)
4087+ lkb->lkb_astparam = (long) astarg;
4088+ if (bast)
4089+ lkb->lkb_bastaddr = bast;
4090+ lkb->lkb_rqmode = mode;
4091+ lkb->lkb_lockqueue_flags = flags;
4092+ lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4093+ lkb->lkb_lvbptr = lksb->sb_lvbptr;
4094+
4095+ if (rsb->res_nodeid) {
4096+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4097+ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4098+ } else {
4099+ ret = dlm_convert_stage2(lkb, FALSE);
4100+ }
4101+
4102+ up_read(&rsb->res_ls->ls_in_recovery);
4103+
4104+ wake_astd();
4105+
4106+ out:
4107+ return ret;
4108+}
4109+
4110+/*
4111+ * For local conversion requests on locally mastered locks this is called
4112+ * directly from dlm_lock/convert_lock. This function is also called for
4113+ * remote conversion requests of MSTCPY locks (from process_cluster_request).
4114+ */
4115+
4116+int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
4117+{
4118+ gd_res_t *rsb = lkb->lkb_resource;
4119+ int ret = 0;
4120+
4121+ down_write(&rsb->res_lock);
4122+
4123+ if (can_be_granted(rsb, lkb)) {
4124+ grant_lock(lkb, 0);
4125+ grant_pending_locks(rsb);
4126+ goto out;
4127+ }
4128+
4129+ /*
4130+ * Remove lkb from granted queue.
4131+ */
4132+
4133+ lkb_dequeue(lkb);
4134+
4135+ /*
4136+	 * The user won't wait, so stick it back on the granted queue.
4137+ */
4138+
4139+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4140+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4141+ ret = lkb->lkb_retstatus = -EAGAIN;
4142+ if (do_ast)
5cdbd17b 4143+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4144+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4145+ send_blocking_asts_all(rsb, lkb);
4146+ goto out;
4147+ }
4148+
4149+ /*
4150+ * The lkb's status tells which queue it's on. Put back on convert
4151+ * queue. (QUECVT requests added at end of the queue, all others in
4152+ * order.)
4153+ */
4154+
4155+ lkb->lkb_retstatus = 0;
4156+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4157+
4158+ /*
4159+	 * The request can't be granted now, so send blocking ASTs to the granted locks blocking it.
4160+ */
4161+
4162+ send_blocking_asts(rsb, lkb);
4163+
4164+ if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4165+ add_to_deadlockqueue(lkb);
4166+
4167+ out:
4168+ up_write(&rsb->res_lock);
4169+ return ret;
4170+}
4171+
4172+/*
4173+ * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4174+ * completion ast. rsb res_lock must be held in write when this is called.
4175+ */
4176+
4177+static void grant_lock(gd_lkb_t *lkb, int send_remote)
4178+{
4179+ gd_res_t *rsb = lkb->lkb_resource;
4180+
4181+ if (lkb->lkb_duetime)
4182+ remove_from_deadlockqueue(lkb);
4183+
4184+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4185+ int b;
4186+ GDLM_ASSERT(lkb->lkb_lvbptr,);
4187+
4188+ if (!rsb->res_lvbptr)
4189+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4190+
4191+ b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4192+ if (b)
4193+ memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4194+ else
4195+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4196+ }
4197+
4198+ if (lkb->lkb_range) {
4199+ lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4200+ lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4201+ }
4202+
4203+ lkb->lkb_grmode = lkb->lkb_rqmode;
4204+ lkb->lkb_rqmode = DLM_LOCK_IV;
4205+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4206+
4207+ lkb->lkb_highbast = 0;
4208+ lkb->lkb_retstatus = 0;
5cdbd17b 4209+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4210+
4211+ /*
4212+ * A remote conversion request has been granted, either immediately
4213+ * upon being requested or after waiting a bit. In the former case,
4214+ * reply_and_grant() is called. In the later case send_remote is 1 and
4215+	 * reply_and_grant() is called. In the latter case send_remote is 1 and
4216+ *
4217+ * The "send_remote" flag is set only for locks which are granted "out
4218+	 * of band" - i.e. by another lock being converted or unlocked.
4219+ *
4220+ * The second case occurs when this lkb is granted right away as part
4221+ * of processing the initial request. In that case, we send a single
4222+ * message in reply_and_grant which combines the request reply with the
4223+ * grant message.
4224+ */
4225+
4226+ if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4227+ if (send_remote)
4228+ remote_grant(lkb);
4229+ else if (lkb->lkb_request)
4230+ reply_and_grant(lkb);
4231+ }
4232+
4233+}
4234+
4235+static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb)
4236+{
4237+ gd_lkb_t *gr;
4238+
4239+ list_for_each_entry(gr, head, lkb_statequeue) {
4240+ if (gr->lkb_bastaddr &&
4241+ gr->lkb_highbast < lkb->lkb_rqmode &&
4242+ ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
5cdbd17b 4243+ queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4bf12011 4244+ gr->lkb_highbast = lkb->lkb_rqmode;
4245+ }
4246+ }
4247+}
4248+
4249+/*
4250+ * Notify granted locks if they are blocking a newly forced-to-wait lock.
4251+ */
4252+
4253+static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
4254+{
4255+ send_bast_queue(&rsb->res_grantqueue, lkb);
4256+ /* check if the following improves performance */
4257+ /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4258+}
4259+
4260+static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
4261+{
4262+ send_bast_queue(&rsb->res_grantqueue, lkb);
4263+ send_bast_queue(&rsb->res_convertqueue, lkb);
4264+}
4265+
4266+/*
4267+ * Called when a lock has been dequeued. Look for any locks to grant that are
4268+ * waiting for conversion or waiting to be granted.
4269+ * The rsb res_lock must be held in write when this function is called.
4270+ */
4271+
4272+int grant_pending_locks(gd_res_t *rsb)
4273+{
4274+ gd_lkb_t *lkb;
4275+ struct list_head *list;
4276+ struct list_head *temp;
4277+ int8_t high = DLM_LOCK_IV;
4278+
4279+ list_for_each_safe(list, temp, &rsb->res_convertqueue) {
4280+ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4281+
4282+ if (can_be_granted(rsb, lkb))
4283+ grant_lock(lkb, 1);
4284+ else
4285+ high = MAX(lkb->lkb_rqmode, high);
4286+ }
4287+
4288+ list_for_each_safe(list, temp, &rsb->res_waitqueue) {
4289+ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4290+
4291+ if (can_be_granted(rsb, lkb))
4292+ grant_lock(lkb, 1);
4293+ else
4294+ high = MAX(lkb->lkb_rqmode, high);
4295+ }
4296+
4297+ /*
4298+ * If there are locks left on the wait/convert queue then send blocking
4299+ * ASTs to granted locks that are blocking
4300+ *
4301+ * FIXME: This might generate some spurious blocking ASTs for range
4302+ * locks.
4303+ */
4304+
4305+ if (high > DLM_LOCK_IV) {
4306+ list_for_each_safe(list, temp, &rsb->res_grantqueue) {
4307+ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
4308+
4309+ if (lkb->lkb_bastaddr &&
4310+ (lkb->lkb_highbast < high) &&
4311+ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4312+
5cdbd17b 4313+ queue_ast(lkb, AST_BAST, high);
4bf12011 4314+ lkb->lkb_highbast = high;
4315+ }
4316+ }
4317+ }
4318+
4319+ return 0;
4320+}
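grant_pending_locks() and send_bast_queue() decide what blocks what through the mode compatibility matrix (__dlm_compat_matrix / modes_compat()), which is defined elsewhere in the patch. As a worked illustration only, the conventional VMS-style compatibility table this code relies on looks like the sketch below, indexed 0..5 for NL..EX rather than the grmode+1 offsets used above.

static const int example_mode_compat[6][6] = {
	/*          NL CR CW PR PW EX */
	/* NL */  { 1, 1, 1, 1, 1, 1 },
	/* CR */  { 1, 1, 1, 1, 1, 0 },
	/* CW */  { 1, 1, 1, 0, 0, 0 },
	/* PR */  { 1, 1, 0, 1, 0, 0 },
	/* PW */  { 1, 1, 0, 0, 0, 0 },
	/* EX */  { 1, 0, 0, 0, 0, 0 },
};

/*
 * Example: a granted DLM_LOCK_PR does not block another PR request, but it
 * does block PW and EX, so grant_pending_locks() leaves those lkbs queued
 * and queue_ast(..., AST_BAST, ...) notifies the PR holder.
 */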
4321+
4322+/*
4323+ * Called to cancel a locking operation that failed due to some internal
4324+ * reason.
4325+ *
4326+ * Waiting locks will be removed, converting locks will be reverted to their
4327+ * granted status, unlocks will be left where they are.
4328+ *
4329+ * A completion AST will be delivered to the caller.
4330+ */
4331+
4332+int cancel_lockop(gd_lkb_t *lkb, int status)
4333+{
4334+ int state = lkb->lkb_lockqueue_state;
5cdbd17b 4335+ uint16_t astflags = AST_COMP;
4bf12011 4336+
4337+ lkb->lkb_lockqueue_state = 0;
4338+
4339+ switch (state) {
4340+ case GDLM_LQSTATE_WAIT_RSB:
5cdbd17b 4341+ astflags |= AST_DEL;
4bf12011 4342+ break;
4343+
4344+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4345+ res_lkb_dequeue(lkb);
5cdbd17b 4346+ astflags |= AST_DEL;
4bf12011 4347+ break;
4348+
4349+ case GDLM_LQSTATE_WAIT_CONVERT:
4350+ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4351+
4352+ /* Remove from deadlock detection */
4353+ if (lkb->lkb_duetime) {
4354+ remove_from_deadlockqueue(lkb);
4355+ }
4356+ break;
4357+
4358+ case GDLM_LQSTATE_WAIT_UNLOCK:
4359+ /* We can leave this. I think.... */
4360+ break;
4361+ }
4362+
4363+ lkb->lkb_retstatus = status;
5cdbd17b 4364+ queue_ast(lkb, astflags, 0);
4bf12011 4365+
4366+ return 0;
4367+}
4368+
4369+/*
4370+ * Check for conversion deadlock. If a deadlock was found
4371+ * return lkb to kill, else return NULL
4372+ */
4373+
4374+gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb)
4375+{
4376+ gd_res_t *rsb = lkb->lkb_resource;
4377+ struct list_head *entry;
4378+
4379+ GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4380+
4381+ /* Work our way up to the head of the queue looking for locks that
4382+ * conflict with us */
4383+
4384+ down_read(&rsb->res_lock);
4385+
4386+ entry = lkb->lkb_statequeue.prev;
4387+ while (entry != &rsb->res_convertqueue) {
4388+ gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue);
4389+
4390+ if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4391+ up_read(&rsb->res_lock);
4392+ return lkb;
4393+ }
4394+ entry = entry->prev;
4395+ }
4396+ up_read(&rsb->res_lock);
4397+
4398+ return 0;
4399+}
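A worked example of the situation this walk detects (illustration only, lock names invented):

/*
 *	lkb B: lkb_grmode = DLM_LOCK_PR, lkb_rqmode = DLM_LOCK_EX   (nearer the queue head)
 *	lkb A: lkb_grmode = DLM_LOCK_PR, lkb_rqmode = DLM_LOCK_EX   (behind B on res_convertqueue)
 */

Neither conversion can be granted while the other still holds PR. Running conversion_deadlock_check() for A walks towards the queue head, finds B, sees that B's granted mode conflicts with A's requested mode (ranges permitting), and returns A as the lock to kill; the deadlock scan can then cancel A's conversion through cancel_conversion() below, with the caller-supplied status passed back to the user.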
4400+
4401+/*
4402+ * Conversion operation was cancelled by us (not the user).
4403+ * ret contains the return code to pass on to the user.
4404+ */
4405+
4406+void cancel_conversion(gd_lkb_t *lkb, int ret)
4407+{
4408+ gd_res_t *rsb = lkb->lkb_resource;
4409+
4410+ /* Stick it back on the granted queue */
4411+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4412+ lkb->lkb_rqmode = lkb->lkb_grmode;
4413+
4414+ remove_from_deadlockqueue(lkb);
4415+
4416+ lkb->lkb_retstatus = ret;
5cdbd17b 4417+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4418+ wake_astd();
4419+}
4420+
4421+/*
4422+ * As new master of the rsb for this lkb, we need to handle these requests
4423+ * removed from the lockqueue and originating from local processes:
4424+ * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4425+ * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4426+ */
4427+
4428+void process_remastered_lkb(gd_lkb_t *lkb, int state)
4429+{
4430+ switch (state) {
4431+ case GDLM_LQSTATE_WAIT_RSB:
4432+ dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4433+ lkb->lkb_lockqueue_flags,
4434+ lkb->lkb_resource->res_name,
4435+ lkb->lkb_resource->res_length);
4436+ break;
4437+
4438+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4439+ res_lkb_dequeue(lkb);
4440+ dlm_lock_stage3(lkb);
4441+ break;
4442+
4443+ case GDLM_LQSTATE_WAIT_UNLOCK:
4444+ dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags);
4445+ break;
4446+
4447+ case GDLM_LQSTATE_WAIT_CONVERT:
4448+ dlm_convert_stage2(lkb, TRUE);
4449+ break;
4450+
4451+ default:
4452+ GDLM_ASSERT(0,);
4453+ }
4454+}
4455diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4456--- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4457+++ linux-patched/cluster/dlm/locking.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 4458@@ -0,0 +1,33 @@
4459+/******************************************************************************
4460+*******************************************************************************
4461+**
4462+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4463+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4464+**
4465+** This copyrighted material is made available to anyone wishing to use,
4466+** modify, copy, or redistribute it subject to the terms and conditions
4467+** of the GNU General Public License v.2.
4468+**
4469+*******************************************************************************
4470+******************************************************************************/
4471+
4472+#ifndef __LOCKING_DOT_H__
4473+#define __LOCKING_DOT_H__
4474+
4475+void process_remastered_lkb(gd_lkb_t * lkb, int state);
4476+void dlm_lock_stage3(gd_lkb_t * lkb);
4477+int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast);
4478+int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags);
4479+int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb,
4480+ int flags);
4481+gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen);
4482+int free_rsb_if_unused(gd_res_t * rsb);
4483+gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace,
4484+ struct gd_remlockrequest *freq);
4485+int cancel_lockop(gd_lkb_t * lkb, int status);
4486+int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags);
4487+int grant_pending_locks(gd_res_t * rsb);
4488+void cancel_conversion(gd_lkb_t * lkb, int ret);
4489+gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb);
4490+
4491+#endif /* __LOCKING_DOT_H__ */
4492diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4493--- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4494+++ linux-patched/cluster/dlm/lockqueue.c 2004-06-29 20:01:20.000000000 +0800
4495@@ -0,0 +1,957 @@
4bf12011 4496+/******************************************************************************
4497+*******************************************************************************
4498+**
4499+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4500+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4501+**
4502+** This copyrighted material is made available to anyone wishing to use,
4503+** modify, copy, or redistribute it subject to the terms and conditions
4504+** of the GNU General Public License v.2.
4505+**
4506+*******************************************************************************
4507+******************************************************************************/
4508+
4509+/*
4510+ * lockqueue.c
4511+ *
4512+ * This controls the lock queue, which is where locks
4513+ * come when they need to wait for a remote operation
4514+ * to complete.
4515+ *
4516+ * This could also be thought of as the "high-level" comms
4517+ * layer.
4518+ *
4519+ */
4520+
4521+#include "dlm_internal.h"
4522+#include "lockqueue.h"
4523+#include "dir.h"
4524+#include "locking.h"
4525+#include "lkb.h"
4526+#include "lowcomms.h"
4527+#include "midcomms.h"
4528+#include "reccomms.h"
4529+#include "nodes.h"
4530+#include "lockspace.h"
4531+#include "ast.h"
4532+#include "memory.h"
4533+#include "rsb.h"
4534+#include "queries.h"
4535+
4536+static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply);
4537+static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req);
4538+
4539+/*
4540+ * format of an entry on the request queue
4541+ */
4542+struct rq_entry {
4543+ struct list_head rqe_list;
4544+ uint32_t rqe_nodeid;
4545+ char rqe_request[1];
4546+};
4547+
4548+/*
4549+ * Add a new request (if appropriate) to the request queue and send the remote
4550+ * request out.  Runs in the context of the locking caller.
4551+ *
4552+ * Recovery of a remote_stage request if the remote end fails while the lkb
4553+ * is still on the lockqueue:
4554+ *
4555+ * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4556+ * lockqueue_lkb_mark() at the start of recovery.
4557+ *
4558+ * o Some lkb's will be rebuilt on new master rsb's during recovery.
4559+ * (depends on the type of request, see below).
4560+ *
4561+ * o At the end of recovery, resend_cluster_requests() looks at these
4562+ * LQRESEND lkb's and either:
4563+ *
4564+ * i) resends the request to the new master for the rsb where the
4565+ * request is processed as usual. The lkb remains on the lockqueue until
4566+ * the new master replies and we run process_lockqueue_reply().
4567+ *
4568+ *   ii) if we've become the rsb master, removes the lkb from the lockqueue
4569+ * and processes the request locally via process_remastered_lkb().
4570+ *
4571+ * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4572+ * and the request should be resent if dest node is failed.
4573+ *
4574+ * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4575+ * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4576+ * makes send_lkb_queue() skip it). Resend this request to the new master.
4577+ *
4578+ * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4579+ * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4580+ * Resend this request to the new master.
4581+ *
4582+ * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4583+ * It will be rebuilt on the new master rsb's granted queue. Resend this
4584+ * request to the new master.
4585+ */
4586+
4587+int remote_stage(gd_lkb_t *lkb, int state)
4588+{
4589+ int error;
4590+
4591+ lkb->lkb_lockqueue_state = state;
4592+ add_to_lockqueue(lkb);
4593+
4594+ error = send_cluster_request(lkb, state);
4595+ if (error < 0) {
4596+ log_print("remote_stage error sending request %d", error);
4597+
4598+ /* Leave on lockqueue, it will be resent to correct node during
4599+ * recovery. */
4600+
4601+ /*
4602+ lkb->lkb_lockqueue_state = 0;
4603+ remove_from_lockqueue(lkb);
4604+ return -ENOTCONN;
4605+ */
4606+ }
4607+ return 0;
4608+}
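For reference while reading the recovery comment above, the lockqueue states map onto the wire commands built in send_cluster_request() further down; the numeric values are those given in the comment, and the authoritative definitions live in a header outside this hunk:

/*
 *	GDLM_LQSTATE_WAIT_RSB        (1)  resource directory lookup        -> GDLM_REMCMD_LOOKUP
 *	GDLM_LQSTATE_WAIT_CONVERT    (2)  conversion sent to the master    -> GDLM_REMCMD_CONVREQUEST
 *	GDLM_LQSTATE_WAIT_CONDGRANT  (3)  lock request sent to the master  -> GDLM_REMCMD_LOCKREQUEST
 *	GDLM_LQSTATE_WAIT_UNLOCK     (4)  unlock/cancel sent to the master -> GDLM_REMCMD_UNLOCKREQUEST
 */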
4609+
4610+/*
4611+ * Requests received while the lockspace is in recovery get added to the
4612+ * request queue and processed when recovery is complete.
4613+ */
4614+
4615+void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length)
4616+{
4617+ struct rq_entry *entry;
4618+
4619+ if (in_nodes_gone(ls, nodeid))
4620+ return;
4621+
4622+ entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
4623+ if (!entry) {
4624+ // TODO something better
4625+ printk("dlm: add_to_requestqueue: out of memory\n");
4626+ return;
4627+ }
4628+
4629+ log_debug(ls, "add_to_requestqueue %d", nodeid);
4630+ entry->rqe_nodeid = nodeid;
4631+ memcpy(entry->rqe_request, request, length);
4632+ list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
4633+}
4634+
4635+int process_requestqueue(gd_ls_t *ls)
4636+{
4637+ int error = 0, count = 0;
4638+ struct rq_entry *entry, *safe;
4639+ struct gd_req_header *req;
4640+
4641+ log_all(ls, "process held requests");
4642+
4643+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4644+ req = (struct gd_req_header *) entry->rqe_request;
4645+ log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
4646+
4647+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4648+ log_debug(ls, "process_requestqueue aborted");
4649+ error = -EINTR;
4650+ break;
4651+ }
4652+
4653+ error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
4654+ if (error == -EINTR) {
4655+ log_debug(ls, "process_requestqueue interrupted");
4656+ break;
4657+ }
4658+
4659+ list_del(&entry->rqe_list);
4660+ kfree(entry);
4661+ count++;
4662+ error = 0;
4663+ }
4664+
4665+ log_all(ls, "processed %d requests", count);
4666+ return error;
4667+}
4668+
4669+void wait_requestqueue(gd_ls_t *ls)
4670+{
4671+ while (!list_empty(&ls->ls_requestqueue) &&
4672+ test_bit(LSFL_LS_RUN, &ls->ls_flags))
4673+ schedule();
4674+}
4675+
4676+/*
4677+ * Resdir requests (lookup or remove) and replies from before recovery are
4678+ * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
4679+ * gone are also invalid.
4680+ */
4681+
4682+void purge_requestqueue(gd_ls_t *ls)
4683+{
4684+ int count = 0;
4685+ struct rq_entry *entry, *safe;
4686+ struct gd_req_header *req;
4687+ struct gd_remlockrequest *freq;
4688+ gd_lkb_t *lkb;
4689+
4690+ log_all(ls, "purge requests");
4691+
4692+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4693+ req = (struct gd_req_header *) entry->rqe_request;
4694+ freq = (struct gd_remlockrequest *) req;
4695+
4696+ if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
4697+ req->rh_cmd == GDLM_REMCMD_LOOKUP ||
4698+ in_nodes_gone(ls, entry->rqe_nodeid)) {
4699+
4700+ list_del(&entry->rqe_list);
4701+ kfree(entry);
4702+ count++;
4703+
4704+ } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
4705+
4706+ /*
4707+ * Replies to resdir lookups are invalid and must be
4708+ * purged. The lookup requests are marked in
4709+ * lockqueue_lkb_mark and will be resent in
4710+ * resend_cluster_requests. The only way to check if
4711+ * this is a lookup reply is to look at the
4712+ * lockqueue_state of the lkb.
4713+ */
4714+
4715+ lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
4716+ GDLM_ASSERT(lkb,);
4717+ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
4718+ list_del(&entry->rqe_list);
4719+ kfree(entry);
4720+ count++;
4721+ }
4722+ }
4723+ }
4724+
4725+ log_all(ls, "purged %d requests", count);
4726+}
4727+
4728+/*
4729+ * Check if there's a reply for the given lkid in the requestqueue.
4730+ */
4731+
4732+int reply_in_requestqueue(gd_ls_t *ls, int lkid)
4733+{
4734+ int rv = FALSE;
4735+ struct rq_entry *entry, *safe;
4736+ struct gd_req_header *req;
4737+ struct gd_remlockrequest *freq;
4738+
4739+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4740+ req = (struct gd_req_header *) entry->rqe_request;
4741+ freq = (struct gd_remlockrequest *) req;
4742+
4743+ if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
4744+ freq->rr_header.rh_lkid == lkid) {
4745+ rv = TRUE;
4746+ break;
4747+ }
4748+ }
4749+
4750+ return rv;
4751+}
4752+
4753+void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src)
4754+{
4755+ if (!*lvbptr)
4756+ *lvbptr = allocate_lvb(ls);
4757+ if (*lvbptr)
4758+ memcpy(*lvbptr, src, DLM_LVB_LEN);
4759+}
4760+
4761+/*
4762+ * Process a lockqueue LKB after its remote processing has completed and it has
4763+ * been pulled from the lockqueue. Runs in the context of the DLM recvd thread on
4764+ * the machine that requested the lock.
4765+ */
4766+
4767+static void process_lockqueue_reply(gd_lkb_t *lkb,
4768+ struct gd_remlockreply *reply)
4769+{
4770+ int state = lkb->lkb_lockqueue_state;
4771+ int oldstate;
4772+ gd_res_t *rsb = lkb->lkb_resource;
4773+ gd_ls_t *ls = rsb->res_ls;
4774+
4775+ lkb->lkb_lockqueue_state = 0;
4776+ if (state)
4777+ remove_from_lockqueue(lkb);
4778+
4779+ switch (state) {
4780+ case GDLM_LQSTATE_WAIT_RSB:
4781+
4782+ GDLM_ASSERT(reply->rl_status == 0,);
4783+
4784+ if (reply->rl_nodeid == our_nodeid())
4785+ rsb->res_nodeid = 0;
4786+ else
4787+ rsb->res_nodeid = reply->rl_nodeid;
4788+
4789+ rsb->res_resdir_seq = reply->rl_resdir_seq;
4790+ lkb->lkb_nodeid = rsb->res_nodeid;
4791+
4792+ dlm_lock_stage2(rsb->res_ls, lkb, rsb,
4793+ lkb->lkb_lockqueue_flags);
4794+ break;
4795+
4796+ case GDLM_LQSTATE_WAIT_CONVERT:
4797+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4798+
4799+ /*
4800+ * After a remote lock/conversion/grant request we put the lock
4801+ * on the right queue and send an AST if appropriate. Any lock
4802+			 * shuffling (e.g. newly granted locks because this one was
4803+			 * converted downwards) will be dealt with in separate messages
4804+			 * (which may arrive in the same network message).
4805+ */
4806+
4807+ if (!lkb->lkb_remid)
4808+ lkb->lkb_remid = reply->rl_lkid;
4809+
4810+ /*
4811+ * The remote request failed (we assume because of NOQUEUE).
4812+ * If this is a new request (non-conv) the lkb was created just
4813+ * for it so the lkb should be freed. If this was a
4814+ * conversion, the lkb already existed so we should put it back
4815+ * on the grant queue.
4816+ */
4817+
4818+ if (reply->rl_status != 0) {
4819+ GDLM_ASSERT(reply->rl_status == -EAGAIN,);
4820+
4821+ if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
4822+ res_lkb_dequeue(lkb);
4823+ lkb->lkb_retstatus = reply->rl_status;
4824+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4825+ } else {
4bf12011 4826+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4827+ lkb->lkb_retstatus = reply->rl_status;
4828+ queue_ast(lkb, AST_COMP, 0);
4829+ }
4bf12011 4830+ break;
4831+ }
4832+
4833+ /*
4834+ * The remote request was successful in granting the request or
4835+ * queuing it to be granted later. Add the lkb to the
4836+ * appropriate rsb queue.
4837+ */
4838+
4839+ switch (reply->rl_lockstate) {
4840+ case GDLM_LKSTS_GRANTED:
4841+
4842+ /* Compact version of grant_lock(). */
4843+
4844+ down_write(&rsb->res_lock);
4845+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
4846+ memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
4847+ DLM_LVB_LEN);
4848+
4849+ lkb->lkb_grmode = lkb->lkb_rqmode;
4850+ lkb->lkb_rqmode = DLM_LOCK_IV;
4851+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4852+
4853+ if (lkb->lkb_range) {
4854+ lkb->lkb_range[GR_RANGE_START] =
4855+ lkb->lkb_range[RQ_RANGE_START];
4856+ lkb->lkb_range[GR_RANGE_END] =
4857+ lkb->lkb_range[RQ_RANGE_END];
4858+ }
4859+ up_write(&rsb->res_lock);
4860+
4861+ lkb->lkb_retstatus = 0;
5cdbd17b 4862+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4863+ break;
4864+
4865+ case GDLM_LKSTS_WAITING:
4866+
4867+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4868+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4869+ else
4870+ log_error(ls, "wait reply for granted %x %u",
4871+ lkb->lkb_id, lkb->lkb_nodeid);
4872+ break;
4873+
4874+ case GDLM_LKSTS_CONVERT:
4875+
4876+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4877+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4878+ else
4879+ log_error(ls, "convert reply for granted %x %u",
4880+ lkb->lkb_id, lkb->lkb_nodeid);
4881+ break;
4882+
4883+ default:
4884+ log_error(ls, "process_lockqueue_reply state %d",
4885+ reply->rl_lockstate);
4886+ }
4887+
4888+ break;
4889+
4890+ case GDLM_LQSTATE_WAIT_UNLOCK:
4891+
4892+ /*
4893+ * Unlocks should never fail. Update local lock info. This
4894+		 * always sends a completion AST with the status in the lksb.
4895+ */
4896+
4897+ GDLM_ASSERT(reply->rl_status == 0,);
4898+ oldstate = res_lkb_dequeue(lkb);
4899+
4900+ /* Differentiate between unlocks and conversion cancellations */
4901+ if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
4902+ oldstate == GDLM_LKSTS_CONVERT) {
4903+ res_lkb_enqueue(lkb->lkb_resource, lkb,
4904+ GDLM_LKSTS_GRANTED);
4905+ lkb->lkb_retstatus = -DLM_ECANCEL;
5cdbd17b 4906+ queue_ast(lkb, AST_COMP, 0);
4bf12011 4907+ } else {
4bf12011 4908+ lkb->lkb_retstatus = -DLM_EUNLOCK;
5cdbd17b 4909+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4bf12011 4910+ }
4bf12011 4911+ break;
4912+
4913+ default:
4914+ log_error(ls, "process_lockqueue_reply id %x state %d",
4915+ lkb->lkb_id, state);
4916+ }
4917+}
4918+
4919+/*
4920+ * Tell a remote node to grant a lock. This happens when we hold the master
4921+ * copy of a lock that is actually held on a remote node. The remote end is
4922+ * also responsible for sending the completion AST.
4923+ */
4924+
4925+void remote_grant(gd_lkb_t *lkb)
4926+{
4927+ struct writequeue_entry *e;
4928+ struct gd_remlockrequest *req;
4929+
4930+ // TODO Error handling
4931+ e = lowcomms_get_buffer(lkb->lkb_nodeid,
4932+ sizeof(struct gd_remlockrequest),
4933+ lkb->lkb_resource->res_ls->ls_allocation,
4934+ (char **) &req);
4935+ if (!e)
4936+ return;
4937+
4938+ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
4939+ req->rr_header.rh_length = sizeof(struct gd_remlockrequest);
4940+ req->rr_header.rh_flags = 0;
4941+ req->rr_header.rh_lkid = lkb->lkb_id;
4942+ req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
4943+ req->rr_remlkid = lkb->lkb_remid;
4944+ req->rr_flags = 0;
4945+
4946+ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
4947+ /* This is a confusing non-standard use of rr_flags which is
4948+ * usually used to pass lockqueue_flags. */
4949+ req->rr_flags |= GDLM_LKFLG_DEMOTED;
4950+ }
4951+
4952+ add_request_lvb(lkb, req);
4953+ midcomms_send_buffer(&req->rr_header, e);
4954+}
4955+
4956+void reply_and_grant(gd_lkb_t *lkb)
4957+{
4958+ struct gd_remlockrequest *req = lkb->lkb_request;
4959+ struct gd_remlockreply *reply;
4960+ struct writequeue_entry *e;
4961+
4962+ // TODO Error handling
4963+ e = lowcomms_get_buffer(lkb->lkb_nodeid,
4964+ sizeof(struct gd_remlockreply),
4965+ lkb->lkb_resource->res_ls->ls_allocation,
4966+ (char **) &reply);
4967+ if (!e)
4968+ return;
4969+
4970+ reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
4971+ reply->rl_header.rh_flags = 0;
4972+ reply->rl_header.rh_length = sizeof(struct gd_remlockreply);
4973+ reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
4974+ reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
4975+
4976+ reply->rl_status = lkb->lkb_retstatus;
4977+ reply->rl_lockstate = lkb->lkb_status;
4978+ reply->rl_lkid = lkb->lkb_id;
4979+
4980+ GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
4981+
4982+ lkb->lkb_request = NULL;
4983+
4984+ add_reply_lvb(lkb, reply);
4985+ midcomms_send_buffer(&reply->rl_header, e);
4986+}
4987+
4988+/*
4989+ * Request removal of a dead entry in the resource directory
4990+ */
4991+
4992+void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen,
4993+ uint8_t sequence)
4994+{
4995+ struct writequeue_entry *e;
4996+ struct gd_remlockrequest *req;
4997+
4998+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4999+ gd_rcom_t *rc = allocate_rcom_buffer(ls);
5000+
5001+ memcpy(rc->rc_buf, name, namelen);
5002+ rc->rc_datalen = namelen;
5003+
5004+ rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5005+
5006+ free_rcom_buffer(rc);
5007+ return;
5008+ }
5009+ // TODO Error handling
5010+ e = lowcomms_get_buffer(nodeid,
5011+ sizeof(struct gd_remlockrequest) + namelen - 1,
5012+ ls->ls_allocation, (char **) &req);
5013+ if (!e)
5014+ return;
5015+
5016+ memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1);
5017+ req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5018+ req->rr_header.rh_length =
5019+ sizeof(struct gd_remlockrequest) + namelen - 1;
5020+ req->rr_header.rh_flags = 0;
5021+ req->rr_header.rh_lkid = 0;
5022+ req->rr_header.rh_lockspace = ls->ls_global_id;
5023+ req->rr_remlkid = 0;
5024+ req->rr_resdir_seq = sequence;
5025+ memcpy(req->rr_name, name, namelen);
5026+
5027+ midcomms_send_buffer(&req->rr_header, e);
5028+}
5029+
5030+/*
5031+ * Send remote cluster request to directory or master node before the request
5032+ * is put on the lock queue. Runs in the context of the locking caller.
5033+ */
5034+
5035+int send_cluster_request(gd_lkb_t *lkb, int state)
5036+{
5037+ uint32_t target_nodeid;
5038+ gd_res_t *rsb = lkb->lkb_resource;
5039+ gd_ls_t *ls = rsb->res_ls;
5040+ struct gd_remlockrequest *req;
5041+ struct writequeue_entry *e;
5042+
5043+ /* Need to know the target nodeid before we allocate a send buffer */
5044+ target_nodeid = lkb->lkb_nodeid;
5045+ GDLM_ASSERT(target_nodeid != 0,);
5046+
5047+ if (state == GDLM_LQSTATE_WAIT_RSB)
5048+ target_nodeid = get_directory_nodeid(rsb);
5049+
5050+ GDLM_ASSERT(target_nodeid,);
5051+
5052+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5053+ /* this may happen when called by resend_cluster_request */
5054+ log_error(ls, "send_cluster_request to %u state %d recovery",
5055+ target_nodeid, state);
5056+ }
5057+
5058+ e = lowcomms_get_buffer(target_nodeid,
5059+ sizeof(struct gd_remlockrequest) +
5060+ rsb->res_length - 1, ls->ls_allocation,
5061+ (char **) &req);
5062+ if (!e)
5063+ return -ENOBUFS;
5064+ memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1);
5065+
5066+ /* Common stuff, some are just defaults */
5067+
5068+ if (lkb->lkb_bastaddr)
5cdbd17b 5069+ req->rr_asts = AST_BAST;
4bf12011 5070+ if (lkb->lkb_astaddr)
5cdbd17b 5071+ req->rr_asts |= AST_COMP;
4bf12011 5072+ if (lkb->lkb_parent)
5073+ req->rr_remparid = lkb->lkb_parent->lkb_remid;
5074+
5075+ req->rr_flags = lkb->lkb_lockqueue_flags;
5076+ req->rr_rqmode = lkb->lkb_rqmode;
5077+ req->rr_remlkid = lkb->lkb_remid;
5078+ req->rr_header.rh_length =
5079+ sizeof(struct gd_remlockrequest) + rsb->res_length - 1;
5080+ req->rr_header.rh_flags = 0;
5081+ req->rr_header.rh_lkid = lkb->lkb_id;
5082+ req->rr_header.rh_lockspace = ls->ls_global_id;
5083+
5084+ switch (state) {
5085+
5086+ case GDLM_LQSTATE_WAIT_RSB:
5087+
5088+ /* The lock must be a root lock */
5089+ GDLM_ASSERT(!lkb->lkb_parent,);
5090+
5091+ req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5092+ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5093+ break;
5094+
5095+ case GDLM_LQSTATE_WAIT_CONVERT:
5096+
5097+ req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5098+ if (lkb->lkb_range) {
5099+ req->rr_flags |= GDLM_LKFLG_RANGE;
5100+ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5101+ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5102+ }
5103+ break;
5104+
5105+ case GDLM_LQSTATE_WAIT_CONDGRANT:
5106+
5107+ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5108+ req->rr_resdir_seq = rsb->res_resdir_seq;
5109+ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5110+ if (lkb->lkb_range) {
5111+ req->rr_flags |= GDLM_LKFLG_RANGE;
5112+ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5113+ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5114+ }
5115+ break;
5116+
5117+ case GDLM_LQSTATE_WAIT_UNLOCK:
5118+
5119+ req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5120+ break;
5121+
5122+ default:
5123+ GDLM_ASSERT(!"Unknown cluster request",);
5124+ }
5125+
5126+ add_request_lvb(lkb, req);
5127+ midcomms_send_buffer(&req->rr_header, e);
5128+
5129+ return 0;
5130+}
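The request messages carry the resource name in place, which is why every send path above sizes its buffer as sizeof(struct gd_remlockrequest) + namelen - 1 and the receive side recovers namelen as rh_length - sizeof(*freq) + 1. A sizing sketch, assuming rr_name is declared as a one-byte array at the end of struct gd_remlockrequest (the struct definition is not part of this hunk):

/*
 *	struct gd_remlockrequest {
 *		struct gd_req_header	rr_header;
 *		...
 *		char			rr_name[1];	(name copied in place)
 *	};
 *
 * One byte of the name is already accounted for by rr_name[1], so the
 * message length is sizeof(struct gd_remlockrequest) + namelen - 1, and
 * remote_stage2()/process_cluster_request() invert this with
 * namelen = rh_length - sizeof(*freq) + 1.
 */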
5131+
5132+/*
5133+ * We got a request from another cluster node; process it and return an info
5134+ * structure with the lock state/LVB etc. as required. Executes in the DLM's
5135+ * recvd thread.
5136+ */
5137+
5138+int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery)
5139+{
5140+ gd_ls_t *lspace;
5141+ gd_lkb_t *lkb = NULL;
5142+ gd_res_t *rsb;
5143+ int send_reply = 0, status = 0, namelen;
5144+ struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req;
5145+ struct gd_remlockreply reply;
5146+
5147+ lspace = find_lockspace_by_global_id(req->rh_lockspace);
5148+
5149+ if (!lspace) {
5150+ log_print("process_cluster_request invalid lockspace %x "
5151+ "from %d req %u", req->rh_lockspace, nodeid,
5152+ req->rh_cmd);
5153+ status = -EINVAL;
5154+ goto out;
5155+ }
5156+
5157+ /* wait for recoverd to drain requestqueue */
5158+ if (!recovery)
5159+ wait_requestqueue(lspace);
5160+
5161+ /*
5162+ * If we're in recovery then queue the request for later. Otherwise,
5163+ * we still need to get the "in_recovery" lock to make sure the
5164+ * recovery itself doesn't start until we are done.
5165+ */
5166+ retry:
5167+ if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5168+ if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
5169+ log_error(lspace, "process_cluster_request warning %u",
5170+ nodeid);
5171+ add_to_requestqueue(lspace, nodeid, (char *) req,
5172+ req->rh_length);
5173+ log_debug(lspace, "process_cluster_request abort");
5174+ status = -EINTR;
5175+ goto out;
5176+ }
5177+ if (!down_read_trylock(&lspace->ls_in_recovery)) {
5178+ schedule();
5179+ goto retry;
5180+ }
5181+
5182+
5183+ /*
5184+ * Process the request.
5185+ */
5186+
5187+ switch (req->rh_cmd) {
5188+
5189+ case GDLM_REMCMD_LOOKUP:
5190+ {
5191+ gd_resdata_t *rd;
5192+ int status;
5193+ uint32_t dir_nodeid;
5194+
5195+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5196+
5197+ dir_nodeid = name_to_directory_nodeid(lspace,
5198+ freq->rr_name,
5199+ namelen);
5200+ if (dir_nodeid != our_nodeid())
5201+ log_debug(lspace, "ignoring directory lookup");
5202+
5203+ status = get_resdata(lspace, nodeid, freq->rr_name,
5204+ namelen, &rd, 0);
5205+ if (status)
5206+ status = -ENOMEM;
5207+
5208+ reply.rl_status = status;
5209+ reply.rl_lockstate = 0;
5210+ reply.rl_nodeid = rd->rd_master_nodeid;
5211+ reply.rl_resdir_seq = rd->rd_sequence;
5212+ }
5213+ send_reply = 1;
5214+ break;
5215+
5216+ case GDLM_REMCMD_REM_RESDATA:
5217+
5218+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5219+ remove_resdata(lspace, nodeid, freq->rr_name, namelen,
5220+ freq->rr_resdir_seq);
5221+ break;
5222+
5223+ case GDLM_REMCMD_LOCKREQUEST:
5224+
5225+ lkb = remote_stage2(nodeid, lspace, freq);
5226+ if (lkb) {
5227+ lkb->lkb_request = freq;
5228+ dlm_lock_stage3(lkb);
5229+
5230+ /*
5231+ * If the request was granted in lock_stage3, then a
5232+ * reply message was already sent in combination with
5233+ * the grant message and lkb_request is NULL.
5234+ */
5235+
5236+ if (lkb->lkb_request) {
5237+ lkb->lkb_request = NULL;
5238+ send_reply = 1;
5239+ reply.rl_status = lkb->lkb_retstatus;
5240+ reply.rl_lockstate = lkb->lkb_status;
5241+ reply.rl_lkid = lkb->lkb_id;
5242+
5243+ /*
5244+ * If the request could not be granted and the
5245+ * user won't wait, then free up the LKB
5246+ */
5247+
5248+ if (lkb->lkb_retstatus == -EAGAIN) {
5249+ GDLM_ASSERT(lkb->lkb_lockqueue_flags &
5250+ DLM_LKF_NOQUEUE,);
4bf12011 5251+ rsb = lkb->lkb_resource;
5252+ release_lkb(lspace, lkb);
5253+ release_rsb(rsb);
5254+ lkb = NULL;
5255+ }
5256+ }
5257+ } else {
5258+ reply.rl_status = -ENOMEM;
5259+ send_reply = 1;
5260+ }
5261+ break;
5262+
5263+ case GDLM_REMCMD_CONVREQUEST:
5264+
5265+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5266+
5267+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5268+ freq->rr_remlkid,
5269+ freq->rr_header.rh_lkid, nodeid););
5270+
5271+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5272+ log_error(lspace, "convrequest: invalid status %d",
5273+ lkb->lkb_status);
5274+
5275+ lkb->lkb_rqmode = freq->rr_rqmode;
5276+ lkb->lkb_lockqueue_flags = freq->rr_flags;
5277+ lkb->lkb_request = freq;
5278+ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5279+
5280+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK
5281+ || freq->rr_flags & DLM_LKF_VALBLK) {
5282+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5283+ allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5284+ freq->rr_lvb);
5285+ }
5286+
5287+ if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5288+ if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5289+ freq->rr_range_end)) {
5290+ reply.rl_status = -ENOMEM;
5291+ send_reply = 1;
5292+ goto out;
5293+ }
5294+ }
5295+
5296+ dlm_convert_stage2(lkb, FALSE);
5297+
5298+ /*
5299+ * If the conv request was granted in stage2, then a reply
5300+ * message was already sent in combination with the grant
5301+ * message.
5302+ */
5303+
5304+ if (lkb->lkb_request) {
5305+ lkb->lkb_request = NULL;
5306+ send_reply = 1;
5307+ reply.rl_status = lkb->lkb_retstatus;
5308+ reply.rl_lockstate = lkb->lkb_status;
5309+ reply.rl_lkid = lkb->lkb_id;
5310+ }
5311+ break;
5312+
5313+ case GDLM_REMCMD_LOCKREPLY:
5314+
5315+ lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid);
5316+
5317+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5318+ freq->rr_remlkid,
5319+ freq->rr_header.rh_lkid, nodeid););
5320+
5321+ process_lockqueue_reply(lkb, (struct gd_remlockreply *) req);
5322+ break;
5323+
5324+ case GDLM_REMCMD_LOCKGRANT:
5325+
5326+ /*
5327+ * Remote lock has been granted asynchronously. Do a compact
5328+ * version of what grant_lock() does.
5329+ */
5330+
5331+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5332+
5333+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5334+ freq->rr_remlkid,
5335+ freq->rr_header.rh_lkid, nodeid););
5336+
5337+ rsb = lkb->lkb_resource;
5338+
5339+ if (lkb->lkb_lockqueue_state)
5340+ log_error(rsb->res_ls, "granting lock on lockqueue "
5341+ "id=%x from=%u lqstate=%d flags=%x",
5342+ lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state,
5343+ lkb->lkb_flags);
5344+
5345+ down_write(&rsb->res_lock);
5346+
5347+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5348+ memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
5349+
5350+ lkb->lkb_grmode = lkb->lkb_rqmode;
5351+ lkb->lkb_rqmode = DLM_LOCK_IV;
5352+
5353+ if (lkb->lkb_range) {
5354+ lkb->lkb_range[GR_RANGE_START] =
5355+ lkb->lkb_range[RQ_RANGE_START];
5356+ lkb->lkb_range[GR_RANGE_END] =
5357+ lkb->lkb_range[RQ_RANGE_END];
5358+ }
5359+
5360+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5361+ up_write(&rsb->res_lock);
5362+
5363+ if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5364+ lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5365+
5366+ lkb->lkb_retstatus = 0;
5cdbd17b 5367+ queue_ast(lkb, AST_COMP, 0);
4bf12011 5368+ break;
5369+
5370+ case GDLM_REMCMD_SENDBAST:
5371+
5372+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5373+
5374+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5375+ freq->rr_remlkid,
5376+ freq->rr_header.rh_lkid, nodeid););
5377+
5378+ if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5cdbd17b 5379+ queue_ast(lkb, AST_BAST, freq->rr_rqmode);
4bf12011 5380+ break;
5381+
5382+ case GDLM_REMCMD_SENDCAST:
5383+
5384+ /* This is only used for some error completion ASTs */
5385+
5386+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5387+
5388+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5389+ freq->rr_remlkid,
5390+ freq->rr_header.rh_lkid, nodeid););
5391+
5392+ /* Return the lock to granted status */
5393+ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5394+
5395+ lkb->lkb_retstatus = freq->rr_status;
5cdbd17b 5396+ queue_ast(lkb, AST_COMP, 0);
4bf12011 5397+ break;
5398+
5399+ case GDLM_REMCMD_UNLOCKREQUEST:
5400+
5401+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5402+
5403+ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
5404+ freq->rr_remlkid,
5405+ freq->rr_header.rh_lkid, nodeid););
5406+
5407+ reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags);
5408+ send_reply = 1;
5409+ break;
5410+
5411+ case GDLM_REMCMD_QUERY:
5412+ remote_query(nodeid, lspace, req);
5413+ break;
5414+
5415+ case GDLM_REMCMD_QUERYREPLY:
5416+ remote_query_reply(nodeid, lspace, req);
5417+ break;
5418+
5419+ default:
5420+ log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
5421+ }
5422+
5423+ up_read(&lspace->ls_in_recovery);
5424+
5425+ out:
5426+ if (send_reply) {
5427+ reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5428+ reply.rl_header.rh_flags = 0;
5429+ reply.rl_header.rh_length = sizeof(reply);
5430+ reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
5431+ reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
5432+
5433+ status = midcomms_send_message(nodeid, &reply.rl_header,
5434+ GFP_KERNEL);
5435+ }
5436+
5437+ wake_astd();
5438+
5439+ return status;
5440+}
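For orientation, the wire commands dispatched above that are not covered by the state table earlier are summarized below (derived from the switch above, descriptions paraphrased):

/*
 *	GDLM_REMCMD_LOCKREPLY	reply to a lookup/lock/convert/unlock request,
 *				handled by process_lockqueue_reply()
 *	GDLM_REMCMD_LOCKGRANT	asynchronous grant from the master, sent by
 *				remote_grant() above
 *	GDLM_REMCMD_SENDBAST	blocking AST forwarded to the node holding the lock
 *	GDLM_REMCMD_SENDCAST	error completion AST forwarded to the holder
 *	GDLM_REMCMD_REM_RESDATA	remove a dead resource directory entry, sent by
 *				remote_remove_resdata() above
 *	GDLM_REMCMD_QUERY / _QUERYREPLY	lock queries (remote_query / remote_query_reply)
 */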
5441+
5442+static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply)
5443+{
5444+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5445+ memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5446+}
5447+
5448+static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req)
5449+{
5450+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5451+ memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5452+}
5453diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
5454--- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 5455+++ linux-patched/cluster/dlm/lockqueue.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 5456@@ -0,0 +1,29 @@
5457+/******************************************************************************
5458+*******************************************************************************
5459+**
5460+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5461+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5462+**
5463+** This copyrighted material is made available to anyone wishing to use,
5464+** modify, copy, or redistribute it subject to the terms and conditions
5465+** of the GNU General Public License v.2.
5466+**
5467+*******************************************************************************
5468+******************************************************************************/
5469+
5470+#ifndef __LOCKQUEUE_DOT_H__
5471+#define __LOCKQUEUE_DOT_H__
5472+
5473+void remote_grant(gd_lkb_t * lkb);
5474+void reply_and_grant(gd_lkb_t * lkb);
5475+int remote_stage(gd_lkb_t * lkb, int state);
5476+int process_cluster_request(int csid, struct gd_req_header *req, int recovery);
5477+int send_cluster_request(gd_lkb_t * lkb, int state);
5478+void purge_requestqueue(gd_ls_t * ls);
5479+int process_requestqueue(gd_ls_t * ls);
5480+int reply_in_requestqueue(gd_ls_t * ls, int lkid);
5481+void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen,
5482+ uint8_t sequence);
5483+void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src);
5484+
5485+#endif /* __LOCKQUEUE_DOT_H__ */
5486diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
5487--- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 5488+++ linux-patched/cluster/dlm/lockspace.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 5489@@ -0,0 +1,706 @@
5490+/******************************************************************************
5491+*******************************************************************************
5492+**
5493+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5494+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5495+**
5496+** This copyrighted material is made available to anyone wishing to use,
5497+** modify, copy, or redistribute it subject to the terms and conditions
5498+** of the GNU General Public License v.2.
5499+**
5500+*******************************************************************************
5501+******************************************************************************/
5502+
5503+#include <linux/module.h>
5504+
5505+#include "dlm_internal.h"
5506+#include "recoverd.h"
5507+#include "ast.h"
5508+#include "lkb.h"
5509+#include "nodes.h"
5510+#include "dir.h"
5511+#include "lowcomms.h"
5512+#include "config.h"
5513+#include "memory.h"
5514+#include "lockspace.h"
5515+#include "device.h"
5516+
5517+#define GDST_NONE (0)
5518+#define GDST_RUNNING (1)
5519+
5520+static int gdlmstate;
5521+static int gdlmcount;
5522+static struct semaphore gdlmstate_lock;
5523+struct list_head lslist;
5524+spinlock_t lslist_lock;
5525+struct kcl_service_ops ls_ops;
5526+
5527+static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
5528+
5529+
5530+void dlm_lockspace_init(void)
5531+{
5532+ gdlmstate = GDST_NONE;
5533+ gdlmcount = 0;
5534+ init_MUTEX(&gdlmstate_lock);
5535+ INIT_LIST_HEAD(&lslist);
5536+ spin_lock_init(&lslist_lock);
5537+}
5538+
5539+gd_ls_t *find_lockspace_by_global_id(uint32_t id)
5540+{
5541+ gd_ls_t *ls;
5542+
5543+ spin_lock(&lslist_lock);
5544+
5545+ list_for_each_entry(ls, &lslist, ls_list) {
5546+ if (ls->ls_global_id == id)
5547+ goto out;
5548+ }
5549+ ls = NULL;
5550+ out:
5551+ spin_unlock(&lslist_lock);
5552+ return ls;
5553+}
5554+
5555+/* TODO: make this more efficient */
5556+gd_ls_t *find_lockspace_by_local_id(void *id)
5557+{
5558+ gd_ls_t *ls;
5559+
5560+ spin_lock(&lslist_lock);
5561+
5562+ list_for_each_entry(ls, &lslist, ls_list) {
5563+ if (ls->ls_local_id == (uint32_t)(long)id)
5564+ goto out;
5565+ }
5566+ ls = NULL;
5567+ out:
5568+ spin_unlock(&lslist_lock);
5569+ return ls;
5570+}
5571+
5572+gd_ls_t *find_lockspace_by_name(char *name, int namelen)
5573+{
5574+ gd_ls_t *ls;
5575+
5576+ spin_lock(&lslist_lock);
5577+
5578+ list_for_each_entry(ls, &lslist, ls_list) {
5579+ if (ls->ls_namelen == namelen &&
5580+ memcmp(ls->ls_name, name, namelen) == 0)
5581+ goto out;
5582+ }
5583+ ls = NULL;
5584+ out:
5585+ spin_unlock(&lslist_lock);
5586+ return ls;
5587+}
5588+
5589+/*
5590+ * Called from dlm_init. These are the general threads which are not
5591+ * lockspace-specific and work for all gdlm lockspaces.
5592+ */
5593+
5594+static int threads_start(void)
5595+{
5596+ int error;
5597+
5598+ /* Thread which interacts with cman for all ls's */
5599+ error = recoverd_start();
5600+ if (error) {
5601+ log_print("cannot start recovery thread %d", error);
5602+ goto fail;
5603+ }
5604+
5605+	/* Thread which processes lock requests for all ls's */
5606+ error = astd_start();
5607+ if (error) {
5608+ log_print("cannot start ast thread %d", error);
5609+ goto recoverd_fail;
5610+ }
5611+
5612+ /* Thread for sending/receiving messages for all ls's */
5613+ error = lowcomms_start();
5614+ if (error) {
5615+ log_print("cannot start lowcomms %d", error);
5616+ goto astd_fail;
5617+ }
5618+
5619+ return 0;
5620+
5621+ astd_fail:
5622+ astd_stop();
5623+
5624+ recoverd_fail:
5625+ recoverd_stop();
5626+
5627+ fail:
5628+ return error;
5629+}
5630+
5631+static void threads_stop(void)
5632+{
5633+ lowcomms_stop();
5634+ astd_stop();
5635+ recoverd_stop();
5636+}
5637+
5638+static int init_internal(void)
5639+{
5640+ int error = 0;
5641+
5642+ if (gdlmstate == GDST_RUNNING)
5643+ gdlmcount++;
5644+ else {
5645+ error = threads_start();
5646+ if (error)
5647+ goto out;
5648+
5649+ gdlmstate = GDST_RUNNING;
5650+ gdlmcount = 1;
5651+ }
5652+
5653+ out:
5654+ return error;
5655+}
5656+
5657+
5658+/*
5659+ * Called after gdlm module is loaded and before any lockspaces are created.
5660+ * Starts and initializes global threads and structures. These global entities
5661+ * are shared by and independent of all lockspaces.
5662+ *
5663+ * There should be a gdlm-specific user command that a person can run to
5664+ * call this function.  If a user hasn't run that command and something
5665+ * creates a new lockspace, this is called first.
5666+ *
5667+ * This also starts the default lockspace.
5668+ */
5669+
5670+int dlm_init(void)
5671+{
5672+ int error;
5673+
5674+ down(&gdlmstate_lock);
5675+ error = init_internal();
5676+ up(&gdlmstate_lock);
5677+
5678+ return error;
5679+}
5680+
5681+int dlm_release(void)
5682+{
5683+ int error = 0;
5684+
5685+ down(&gdlmstate_lock);
5686+
5687+ if (gdlmstate == GDST_NONE)
5688+ goto out;
5689+
5690+ if (gdlmcount)
5691+ gdlmcount--;
5692+
5693+ if (gdlmcount)
5694+ goto out;
5695+
5696+ spin_lock(&lslist_lock);
5697+ if (!list_empty(&lslist)) {
5698+ spin_unlock(&lslist_lock);
5699+ log_print("cannot stop threads, lockspaces still exist");
5700+ goto out;
5701+ }
5702+ spin_unlock(&lslist_lock);
5703+
5704+ threads_stop();
5705+ gdlmstate = GDST_NONE;
5706+
5707+ out:
5708+ up(&gdlmstate_lock);
5709+
5710+ return error;
5711+}
5712+
5713+gd_ls_t *allocate_ls(int namelen)
5714+{
5715+ gd_ls_t *ls;
5716+
5717+ /* FIXME: use appropriate malloc type */
5718+
5719+ ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL);
5720+ if (ls)
5721+ memset(ls, 0, sizeof(gd_ls_t) + namelen);
5722+
5723+ return ls;
5724+}
5725+
5726+void free_ls(gd_ls_t *ls)
5727+{
5728+ kfree(ls);
5729+}
5730+
5731+static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
5732+{
5733+ gd_ls_t *ls;
5734+ int i, error = -ENOMEM;
5735+ uint32_t local_id = 0;
5736+
5737+ if (!try_module_get(THIS_MODULE))
5738+ return -EINVAL;
5739+
5740+ if (namelen > MAX_SERVICE_NAME_LEN)
5741+ return -EINVAL;
5742+
5743+ if ((ls = find_lockspace_by_name(name, namelen))) {
5744+ *lockspace = (void *)ls->ls_local_id;
5745+ return -EEXIST;
5746+ }
5747+
5748+ /*
5749+ * Initialize ls fields
5750+ */
5751+
5752+ ls = allocate_ls(namelen);
5753+ if (!ls)
5754+ goto out;
5755+
5756+ memcpy(ls->ls_name, name, namelen);
5757+ ls->ls_namelen = namelen;
5758+
5759+ ls->ls_allocation = GFP_KERNEL;
5760+ memset(&ls->ls_flags, 0, sizeof(unsigned long));
5761+ INIT_LIST_HEAD(&ls->ls_rootres);
5762+ ls->ls_hashsize = dlm_config.reshashtbl;
5763+ ls->ls_hashmask = ls->ls_hashsize - 1;
5764+
5765+ ls->ls_reshashtbl =
5766+ kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL);
5767+ if (!ls->ls_reshashtbl)
5768+ goto out_lsfree;
5769+
5770+ for (i = 0; i < ls->ls_hashsize; i++)
5771+ INIT_LIST_HEAD(&ls->ls_reshashtbl[i]);
5772+
5773+ rwlock_init(&ls->ls_reshash_lock);
5774+
5775+ if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1)
5776+ goto out_htfree;
5777+
5778+ INIT_LIST_HEAD(&ls->ls_nodes);
5779+ ls->ls_num_nodes = 0;
5780+ INIT_LIST_HEAD(&ls->ls_nodes_gone);
5781+ INIT_LIST_HEAD(&ls->ls_recover);
5782+ spin_lock_init(&ls->ls_recover_lock);
5783+ INIT_LIST_HEAD(&ls->ls_recover_list);
5784+ ls->ls_recover_list_count = 0;
5785+ spin_lock_init(&ls->ls_recover_list_lock);
5786+ init_waitqueue_head(&ls->ls_wait_general);
5787+ INIT_LIST_HEAD(&ls->ls_requestqueue);
5788+ INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
5789+ ls->ls_last_stop = 0;
5790+ ls->ls_last_start = 0;
5791+ ls->ls_last_finish = 0;
5792+ ls->ls_rcom_msgid = 0;
5793+ init_MUTEX(&ls->ls_rcom_lock);
5794+ init_rwsem(&ls->ls_in_recovery);
5795+ init_rwsem(&ls->ls_unlock_sem);
5796+ init_rwsem(&ls->ls_rec_rsblist);
5797+ init_rwsem(&ls->ls_gap_rsblist);
5798+ down_write(&ls->ls_in_recovery);
5799+
5800+ for (i = 0; i < RESDIRHASH_SIZE; i++) {
5801+ INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist);
5802+ rwlock_init(&ls->ls_resdir_hash[i].rb_lock);
5803+ }
5804+
5805+ if (flags & DLM_LSF_NOTIMERS)
5806+ set_bit(LSFL_NOTIMERS, &ls->ls_flags);
5807+
5808+ /*
5809+ * Connect this lockspace with the cluster manager
5810+ */
5811+
5812+ error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
5813+ &ls_ops, TRUE, (void *) ls, &local_id);
5814+ if (error)
5815+ goto out_idtblfree;
5816+
5817+ ls->ls_state = LSST_INIT;
5818+ ls->ls_local_id = local_id;
5819+
5820+ spin_lock(&lslist_lock);
5821+ list_add(&ls->ls_list, &lslist);
5822+ spin_unlock(&lslist_lock);
5823+
5824+ error = kcl_join_service(local_id);
5825+ if (error) {
5826+ log_error(ls, "service manager join error %d", error);
5827+ goto out_reg;
5828+ }
5829+
5830+ /* The ls isn't actually running until it receives a start() from CMAN.
5831+ * Neither does it have a global ls id until started. */
5832+
5833+
5834+ /* Return the local ID as the lockspace handle. I've left this
5835+ cast to a void* as it allows us to replace it with pretty much
5836+ anything at a future date without breaking clients. But returning
5837+ the address of the lockspace is a bad idea as it could get
5838+	   forcibly removed, leaving the client with a dangling pointer */
5839+ *lockspace = (void *)local_id;
5840+
5841+ return 0;
5842+
5843+ out_reg:
5844+ kcl_unregister_service(ls->ls_local_id);
5845+
5846+ out_idtblfree:
5847+ free_lockidtbl(ls);
5848+
5849+ out_htfree:
5850+ kfree(ls->ls_reshashtbl);
5851+
5852+ out_lsfree:
5853+ free_ls(ls);
5854+
5855+ out:
5856+ return error;
5857+}
5858+
5859+/*
5860+ * Called by a system like GFS which wants independent lock spaces.
5861+ */
5862+
5863+int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
5864+{
5865+ int error = -ENOSYS;
5866+
5867+ down(&gdlmstate_lock);
5868+
5869+ error = init_internal();
5870+ if (error)
5871+ goto out;
5872+
5873+ error = new_lockspace(name, namelen, lockspace, flags);
5874+
5875+ out:
5876+ up(&gdlmstate_lock);
5877+
5878+ return error;
5879+}
5880+
5881+/* Return 1 if the lockspace still has active remote locks,
5882+ * 2 if the lockspace still has active local locks.
5883+ */
5884+static int lockspace_busy(gd_ls_t *ls)
5885+{
5886+ int i;
5887+ int lkb_found = 0;
5888+ gd_lkb_t *lkb;
5889+
5890+ /* NOTE: We check the lockidtbl here rather than the resource table.
5891+ * This is because there may be LKBs queued as ASTs that have been unlinked
5892+ * from their RSBs and are pending deletion once the AST has been delivered
5893+ */
5894+ read_lock(&ls->ls_lockidtbl_lock);
5895+ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5896+ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
5897+ lkb_found = 1;
5898+ list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) {
5899+ if (!lkb->lkb_nodeid) {
5900+ read_unlock(&ls->ls_lockidtbl_lock);
5901+ return 2;
5902+ }
5903+ }
5904+ }
5905+ }
5906+ read_unlock(&ls->ls_lockidtbl_lock);
5907+ return lkb_found;
5908+}
5909+
5910+/* Actually release the lockspace */
5911+static int release_lockspace(gd_ls_t *ls, int force)
5912+{
5913+ gd_lkb_t *lkb;
5914+ gd_res_t *rsb;
5915+ gd_recover_t *gr;
5916+ gd_csb_t *csb;
5917+ struct list_head *head;
5918+ int i;
5919+ int busy = lockspace_busy(ls);
5920+
5921+ /* Don't destroy a busy lockspace */
5922+ if (busy > force)
5923+ return -EBUSY;
5924+
5925+ if (force < 3) {
5926+ kcl_leave_service(ls->ls_local_id);
5927+ kcl_unregister_service(ls->ls_local_id);
5928+ }
5929+
5930+ spin_lock(&lslist_lock);
5931+ list_del(&ls->ls_list);
5932+ spin_unlock(&lslist_lock);
5933+
5934+ /*
5935+ * Free resdata structs.
5936+ */
5937+
5938+ resdir_clear(ls);
5939+
5940+ /*
5941+ * Free all lkb's on lockidtbl[] lists.
5942+ */
5943+
5944+ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
5945+ head = &ls->ls_lockidtbl[i].list;
5946+ while (!list_empty(head)) {
5947+ lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list);
5948+ list_del(&lkb->lkb_idtbl_list);
5949+
5950+ if (lkb->lkb_lockqueue_state)
5951+ remove_from_lockqueue(lkb);
5952+
5cdbd17b 5953+ if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
4bf12011 5954+ list_del(&lkb->lkb_astqueue);
5955+
5956+ if (lkb->lkb_lvbptr
5957+ && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
5958+ free_lvb(lkb->lkb_lvbptr);
5959+
5960+ free_lkb(lkb);
5961+ }
5962+ }
5963+
5964+ /*
5965+ * Free lkidtbl[] itself
5966+ */
5967+
5968+ kfree(ls->ls_lockidtbl);
5969+
5970+ /*
5971+ * Free all rsb's on reshashtbl[] lists
5972+ */
5973+
5974+ for (i = 0; i < ls->ls_hashsize; i++) {
5975+ head = &ls->ls_reshashtbl[i];
5976+ while (!list_empty(head)) {
5977+ rsb = list_entry(head->next, gd_res_t, res_hashchain);
5978+ list_del(&rsb->res_hashchain);
5979+
5980+ if (rsb->res_lvbptr)
5981+ free_lvb(rsb->res_lvbptr);
5982+
5983+ free_rsb(rsb);
5984+ }
5985+ }
5986+
5987+ /*
5988+ * Free reshashtbl[] itself
5989+ */
5990+
5991+ kfree(ls->ls_reshashtbl);
5992+
5993+ /*
5994+ * Free structures on any other lists
5995+ */
5996+
5997+ head = &ls->ls_recover;
5998+ while (!list_empty(head)) {
5999+ gr = list_entry(head->next, gd_recover_t, gr_list);
6000+ list_del(&gr->gr_list);
6001+ free_dlm_recover(gr);
6002+ }
6003+
6004+ head = &ls->ls_nodes;
6005+ while (!list_empty(head)) {
6006+ csb = list_entry(head->next, gd_csb_t, csb_list);
6007+ list_del(&csb->csb_list);
6008+ release_csb(csb);
6009+ }
6010+
6011+ head = &ls->ls_nodes_gone;
6012+ while (!list_empty(head)) {
6013+ csb = list_entry(head->next, gd_csb_t, csb_list);
6014+ list_del(&csb->csb_list);
6015+ release_csb(csb);
6016+ }
6017+
6018+ free_ls(ls);
6019+
6020+ dlm_release();
6021+
6022+ module_put(THIS_MODULE);
6023+ return 0;
6024+}
6025+
6026+
6027+/*
6028+ * Called when a system has released all its locks and is not going to use the
6029+ * lockspace any longer. We blindly free everything we're managing for this
6030+ * lockspace. Remaining nodes will go through the recovery process as if we'd
6031+ * died. The lockspace must continue to function as usual, participating in
6032+ * recoveries, until kcl_leave_service returns.
6033+ *
6034+ * Force has 4 possible values:
6035+ * 0 - don't destroy lockspace if it has any LKBs
6036+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6037+ * 2 - destroy lockspace regardless of LKBs
6038+ * 3 - destroy lockspace as part of a forced shutdown
6039+ */
6040+
6041+int dlm_release_lockspace(void *lockspace, int force)
6042+{
6043+ gd_ls_t *ls;
6044+
6045+ ls = find_lockspace_by_local_id(lockspace);
6046+ if (!ls)
6047+ return -EINVAL;
6048+
6049+ return release_lockspace(ls, force);
6050+}
6051+
6052+
6053+/* Called when the cluster is being shut down dirtily */
6054+void dlm_emergency_shutdown()
6055+{
6056+ gd_ls_t *ls;
6057+ gd_ls_t *tmp;
6058+
6059+ /* Shut lowcomms down to prevent any socket activity */
6060+ lowcomms_stop_accept();
6061+
6062+	/* Delete the devices that belong to the userland
6063+ lockspaces to be deleted. */
6064+ dlm_device_free_devices();
6065+
6066+ /* Now try to clean the lockspaces */
6067+ spin_lock(&lslist_lock);
6068+
6069+ list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6070+ spin_unlock(&lslist_lock);
6071+ release_lockspace(ls, 3);
6072+ spin_lock(&lslist_lock);
6073+ }
6074+
6075+ spin_unlock(&lslist_lock);
6076+}
6077+
6078+gd_recover_t *allocate_dlm_recover(void)
6079+{
6080+ gd_recover_t *gr;
6081+
6082+ gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL);
6083+ if (gr)
6084+ memset(gr, 0, sizeof(gd_recover_t));
6085+
6086+ return gr;
6087+}
6088+
6089+void free_dlm_recover(gd_recover_t * gr)
6090+{
6091+ kfree(gr);
6092+}
6093+
6094+/*
6095+ * Called by CMAN on a specific ls.  "stop" means set a flag which, while set,
6096+ * causes all new requests to the ls to be queued and not submitted until the
6097+ * flag is cleared.  A stop on a ls also needs to cancel any prior starts on the ls.
6098+ * The recoverd thread carries out any work called for by this event.
6099+ */
6100+
6101+static int dlm_ls_stop(void *servicedata)
6102+{
6103+ gd_ls_t *ls = (gd_ls_t *) servicedata;
6104+ int new;
6105+
6106+ spin_lock(&ls->ls_recover_lock);
6107+ ls->ls_last_stop = ls->ls_last_start;
6108+ set_bit(LSFL_LS_STOP, &ls->ls_flags);
6109+ new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6110+ spin_unlock(&ls->ls_recover_lock);
6111+
6112+ /*
6113+ * This in_recovery lock does two things:
6114+ *
6115+ * 1) Keeps this function from returning until all threads are out
6116+	 *    of locking routines and locking is truly stopped.
6117+ * 2) Keeps any new requests from being processed until it's unlocked
6118+ * when recovery is complete.
6119+ */
6120+
6121+ if (new)
6122+ down_write(&ls->ls_in_recovery);
6123+
6124+ clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6125+ clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6126+ clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6127+ clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6128+
6129+ recoverd_kick(ls);
6130+
6131+ return 0;
6132+}
6133+
6134+/*
6135+ * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6136+ * request processing which first requires that the recovery procedure be
6137+ * stepped through with all nodes sharing the lockspace (nodeids). The first
6138+ * start on the ls after it's created is a special case and requires some extra
6139+ * work like figuring out our own local nodeid. We can't do all this in the
6140+ * calling CMAN context, so we must pass this work off to the recoverd thread
6141+ * which was created in gdlm_init(). The recoverd thread carries out any work
6142+ * called for by this event.
6143+ */
6144+
6145+static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6146+ int event_id, int type)
6147+{
6148+ gd_ls_t *ls = (gd_ls_t *) servicedata;
6149+ gd_recover_t *gr;
6150+ int error = -ENOMEM;
6151+
6152+ gr = allocate_dlm_recover();
6153+ if (!gr)
6154+ goto out;
6155+
6156+ gr->gr_nodeids = nodeids;
6157+ gr->gr_node_count = count;
6158+ gr->gr_event_id = event_id;
6159+
6160+ spin_lock(&ls->ls_recover_lock);
6161+ ls->ls_last_start = event_id;
6162+ list_add_tail(&gr->gr_list, &ls->ls_recover);
6163+ set_bit(LSFL_LS_START, &ls->ls_flags);
6164+ spin_unlock(&ls->ls_recover_lock);
6165+
6166+ recoverd_kick(ls);
6167+ error = 0;
6168+
6169+ out:
6170+ return error;
6171+}
6172+
6173+/*
6174+ * Called by CMAN on a specific ls. "finish" means that all nodes which
6175+ * received a "start" have completed the start and called kcl_start_done.
6176+ * The recoverd thread carries out any work called for by this event.
6177+ */
6178+
6179+static void dlm_ls_finish(void *servicedata, int event_id)
6180+{
6181+ gd_ls_t *ls = (gd_ls_t *) servicedata;
6182+
6183+ spin_lock(&ls->ls_recover_lock);
6184+ ls->ls_last_finish = event_id;
6185+ set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6186+ spin_unlock(&ls->ls_recover_lock);
6187+
6188+ recoverd_kick(ls);
6189+}
6190+
6191+struct kcl_service_ops ls_ops = {
6192+ .stop = dlm_ls_stop,
6193+ .start = dlm_ls_start,
6194+ .finish = dlm_ls_finish
6195+};
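
The force argument to dlm_release_lockspace() is graded, as described in the
comment above it: 0 refuses to destroy a lockspace that still holds LKBs, 1
tolerates remote LKBs only, 2 ignores LKBs entirely, and 3 is reserved for a
forced shutdown.  A minimal usage sketch for a kernel client of this API
follows; it is not part of the patch, and the caller name "example_fs" is
hypothetical (lockspace.h assumed to be included):

	/* sketch: hypothetical in-kernel user of the lockspace API */
	static void *example_ls;	/* opaque handle, really the local service id */

	static int example_setup(void)
	{
		int error;

		error = dlm_new_lockspace("example_fs", sizeof("example_fs") - 1,
					  &example_ls, 0);
		if (error == -EEXIST)
			error = 0;	/* lockspace already exists; handle is still filled in */
		return error;
	}

	static void example_teardown(void)
	{
		/* force=0: fails with -EBUSY if any local or remote locks remain */
		dlm_release_lockspace(example_ls, 0);
	}
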
6196diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6197--- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 6198+++ linux-patched/cluster/dlm/lockspace.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 6199@@ -0,0 +1,29 @@
6200+/******************************************************************************
6201+*******************************************************************************
6202+**
6203+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6204+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6205+**
6206+** This copyrighted material is made available to anyone wishing to use,
6207+** modify, copy, or redistribute it subject to the terms and conditions
6208+** of the GNU General Public License v.2.
6209+**
6210+*******************************************************************************
6211+******************************************************************************/
6212+
6213+#ifndef __LOCKSPACE_DOT_H__
6214+#define __LOCKSPACE_DOT_H__
6215+
6216+void dlm_lockspace_init(void);
6217+int dlm_init(void);
6218+int dlm_release(void);
6219+int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6220+int dlm_release_lockspace(void *ls, int force);
6221+gd_ls_t *find_lockspace_by_global_id(uint32_t id);
6222+gd_ls_t *find_lockspace_by_local_id(void *id);
6223+gd_ls_t *find_lockspace_by_name(char *name, int namelen);
6224+void free_dlm_recover(gd_recover_t *gr);
6225+int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out);
6226+void dlm_emergency_shutdown(void);
6227+
6228+#endif /* __LOCKSPACE_DOT_H__ */
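
Note that the handle exported by this header is the cluster manager's local id
cast to a pointer, not the address of the gd_ls_t, so a handle can still be
validated after the lockspace has been forcibly removed instead of dangling.
A sketch of the lookup a caller (or dlm_release_lockspace() itself) performs;
not part of the patch, function name hypothetical:

	/* sketch: map an opaque handle back to its lockspace, if it still exists */
	static int example_handle_valid(void *lockspace)
	{
		gd_ls_t *ls = find_lockspace_by_local_id(lockspace);

		return ls != NULL;	/* 0 means the lockspace is gone; no dangling pointer */
	}
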
6229diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6230--- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 6231+++ linux-patched/cluster/dlm/lowcomms.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 6232@@ -0,0 +1,1354 @@
6233+/******************************************************************************
6234+*******************************************************************************
6235+**
6236+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6237+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6238+**
6239+** This copyrighted material is made available to anyone wishing to use,
6240+** modify, copy, or redistribute it subject to the terms and conditions
6241+** of the GNU General Public License v.2.
6242+**
6243+*******************************************************************************
6244+******************************************************************************/
6245+
6246+/*
6247+ * lowcomms.c
6248+ *
6249+ * This is the "low-level" comms layer.
6250+ *
6251+ * It is responsible for sending/receiving messages
6252+ * from other nodes in the cluster.
6253+ *
6254+ * Cluster nodes are referred to by their nodeids. nodeids are
6255+ * simply 32 bit numbers to the locking module - if they need to
6256+ * be expanded for the cluster infrastructure then that is its
6257+ * responsibility. It is this layer's
6258+ * responsibility to resolve these into IP address or
6259+ * whatever it needs for inter-node communication.
6260+ *
6261+ * The comms level is two kernel threads that deal mainly with
6262+ * the receiving of messages from other nodes and passing them
6263+ * up to the mid-level comms layer (which understands the
6264+ * message format) for execution by the locking core, and
6265+ * a send thread which does all the setting up of connections
6266+ * to remote nodes and the sending of data. Threads are not allowed
6267+ * to send their own data because it may cause them to wait in times
6268+ * of high load. Also, this way, the sending thread can collect together
6269+ * messages bound for one node and send them in one block.
6270+ *
6271+ * I don't see any problem with the recv thread executing the locking
6272+ * code on behalf of remote processes as the locking code is
6273+ * short, efficient and never waits.
6274+ *
6275+ */
6276+
6277+
6278+#include <asm/ioctls.h>
6279+#include <net/sock.h>
6280+#include <net/tcp.h>
6281+#include <linux/pagemap.h>
6282+#include <cluster/cnxman.h>
6283+
6284+#include "dlm_internal.h"
6285+#include "lowcomms.h"
6286+#include "midcomms.h"
6287+#include "config.h"
6288+
6289+struct cbuf {
6290+ unsigned base;
6291+ unsigned len;
6292+ unsigned mask;
6293+};
6294+
6295+#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6296+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6297+#define CBUF_EMPTY(cb) ((cb)->len == 0)
6298+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6299+#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6300+ (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6301+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6302+
6303+struct connection {
6304+ struct socket *sock; /* NULL if not connected */
6305+ uint32_t nodeid; /* So we know who we are in the list */
6306+ struct rw_semaphore sock_sem; /* Stop connect races */
6307+ struct list_head read_list; /* On this list when ready for reading */
6308+ struct list_head write_list; /* On this list when ready for writing */
6309+ struct list_head state_list; /* On this list when ready to connect */
6310+ unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6311+#define CF_READ_PENDING 1
6312+#define CF_WRITE_PENDING 2
6313+#define CF_CONNECT_PENDING 3
6314+#define CF_IS_OTHERSOCK 4
6315+ struct list_head writequeue; /* List of outgoing writequeue_entries */
6316+ struct list_head listenlist; /* List of allocated listening sockets */
6317+ spinlock_t writequeue_lock;
6318+ int (*rx_action) (struct connection *); /* What to do when active */
6319+ struct page *rx_page;
6320+ struct cbuf cb;
6321+ int retries;
6322+#define MAX_CONNECT_RETRIES 3
6323+ struct connection *othersock;
6324+};
6325+#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6326+#define nodeid2con(x) (&connections[(x)])
6327+
6328+/* An entry waiting to be sent */
6329+struct writequeue_entry {
6330+ struct list_head list;
6331+ struct page *page;
6332+ int offset;
6333+ int len;
6334+ int end;
6335+ int users;
6336+ struct connection *con;
6337+};
6338+
6339+/* "Template" structure for IPv4 and IPv6 used to fill
6340+ * in the missing bits when converting between cman (which knows
6341+ * nothing about sockaddr structs) and real life where we actually
6342+ * have to connect to these addresses. Also one of these structs
6343+ * will hold the cached "us" address.
6344+ *
6345+ * It's an in6 sockaddr just so there's enough space for anything
6346+ * we're likely to see here.
6347+ */
6348+static struct sockaddr_in6 local_addr;
6349+
6350+/* Manage daemons */
6351+static struct semaphore thread_lock;
6352+static struct completion thread_completion;
6353+static atomic_t send_run;
6354+static atomic_t recv_run;
6355+
6356+/* An array of connections, indexed by NODEID */
6357+static struct connection *connections;
6358+static int conn_array_size;
6359+static atomic_t writequeue_length;
6360+static atomic_t accepting;
6361+
6362+static wait_queue_t lowcomms_send_waitq_head;
6363+static wait_queue_head_t lowcomms_send_waitq;
6364+
6365+static wait_queue_t lowcomms_recv_waitq_head;
6366+static wait_queue_head_t lowcomms_recv_waitq;
6367+
6368+/* List of sockets that have reads pending */
6369+static struct list_head read_sockets;
6370+static spinlock_t read_sockets_lock;
6371+
6372+/* List of sockets which have writes pending */
6373+static struct list_head write_sockets;
6374+static spinlock_t write_sockets_lock;
6375+
6376+/* List of sockets which have connects pending */
6377+static struct list_head state_sockets;
6378+static spinlock_t state_sockets_lock;
6379+
6380+/* List of allocated listen sockets */
6381+static struct list_head listen_sockets;
6382+
6383+static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6384+static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6385+
6386+
6387+/* Data available on socket or listen socket received a connect */
6388+static void lowcomms_data_ready(struct sock *sk, int count_unused)
6389+{
6390+ struct connection *con = sock2con(sk);
6391+
6392+ if (test_and_set_bit(CF_READ_PENDING, &con->flags))
6393+ return;
6394+
6395+ spin_lock_bh(&read_sockets_lock);
6396+ list_add_tail(&con->read_list, &read_sockets);
6397+ spin_unlock_bh(&read_sockets_lock);
6398+
6399+ wake_up_interruptible(&lowcomms_recv_waitq);
6400+}
6401+
6402+static void lowcomms_write_space(struct sock *sk)
6403+{
6404+ struct connection *con = sock2con(sk);
6405+
6406+ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
6407+ return;
6408+
6409+ spin_lock_bh(&write_sockets_lock);
6410+ list_add_tail(&con->write_list, &write_sockets);
6411+ spin_unlock_bh(&write_sockets_lock);
6412+
6413+ wake_up_interruptible(&lowcomms_send_waitq);
6414+}
6415+
6416+static inline void lowcomms_connect_sock(struct connection *con)
6417+{
6418+ if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
6419+ return;
6420+ if (!atomic_read(&accepting))
6421+ return;
6422+
6423+ spin_lock_bh(&state_sockets_lock);
6424+ list_add_tail(&con->state_list, &state_sockets);
6425+ spin_unlock_bh(&state_sockets_lock);
6426+
6427+ wake_up_interruptible(&lowcomms_send_waitq);
6428+}
6429+
6430+static void lowcomms_state_change(struct sock *sk)
6431+{
6432+/* struct connection *con = sock2con(sk); */
6433+
6434+ switch (sk->sk_state) {
6435+ case TCP_ESTABLISHED:
6436+ lowcomms_write_space(sk);
6437+ break;
6438+
6439+ case TCP_FIN_WAIT1:
6440+ case TCP_FIN_WAIT2:
6441+ case TCP_TIME_WAIT:
6442+ case TCP_CLOSE:
6443+ case TCP_CLOSE_WAIT:
6444+ case TCP_LAST_ACK:
6445+ case TCP_CLOSING:
6446+ /* FIXME: I think this causes more trouble than it solves.
6447+		   lowcomms will reconnect anyway when there is something to
6448+ send. This just attempts reconnection if a node goes down!
6449+ */
6450+ /* lowcomms_connect_sock(con); */
6451+ break;
6452+
6453+ default:
6454+ printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
6455+ break;
6456+ }
6457+}
6458+
6459+/* Make a socket active */
6460+static int add_sock(struct socket *sock, struct connection *con)
6461+{
6462+ con->sock = sock;
6463+
6464+ /* Install a data_ready callback */
6465+ con->sock->sk->sk_data_ready = lowcomms_data_ready;
6466+ con->sock->sk->sk_write_space = lowcomms_write_space;
6467+ con->sock->sk->sk_state_change = lowcomms_state_change;
6468+
6469+ return 0;
6470+}
6471+
6472+/* Add the port number to an IP6 or 4 sockaddr and return the address
6473+ length */
6474+static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
6475+ int *addr_len)
6476+{
6477+ saddr->sin6_family = local_addr.sin6_family;
6478+ if (local_addr.sin6_family == AF_INET) {
6479+ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
6480+ in4_addr->sin_port = cpu_to_be16(port);
6481+ *addr_len = sizeof(struct sockaddr_in);
6482+ }
6483+ else {
6484+ saddr->sin6_port = cpu_to_be16(port);
6485+ *addr_len = sizeof(struct sockaddr_in6);
6486+ }
6487+}
6488+
6489+/* Close a remote connection and tidy up */
6490+static void close_connection(struct connection *con)
6491+{
6492+ if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6493+ return;
6494+
6495+ down_write(&con->sock_sem);
6496+
6497+ if (con->sock) {
6498+ sock_release(con->sock);
6499+ con->sock = NULL;
6500+ if (con->othersock) {
6501+ down_write(&con->othersock->sock_sem);
6502+ sock_release(con->othersock->sock);
6503+ con->othersock->sock = NULL;
6504+ up_write(&con->othersock->sock_sem);
6505+ kfree(con->othersock);
6506+ con->othersock = NULL;
6507+ }
6508+ }
6509+ if (con->rx_page) {
6510+ __free_page(con->rx_page);
6511+ con->rx_page = NULL;
6512+ }
6513+ up_write(&con->sock_sem);
6514+}
6515+
6516+/* Data received from remote end */
6517+static int receive_from_sock(struct connection *con)
6518+{
6519+ int ret = 0;
6520+ struct msghdr msg;
6521+ struct iovec iov[2];
6522+ mm_segment_t fs;
6523+ unsigned len;
6524+ int r;
6525+ int call_again_soon = 0;
6526+
6527+ down_read(&con->sock_sem);
6528+
6529+ if (con->sock == NULL)
6530+ goto out;
6531+ if (con->rx_page == NULL) {
6532+ /*
6533+ * This doesn't need to be atomic, but I think it should
6534+ * improve performance if it is.
6535+ */
6536+ con->rx_page = alloc_page(GFP_ATOMIC);
6537+ if (con->rx_page == NULL)
6538+ goto out_resched;
6539+ CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
6540+ }
6541+ /*
6542+	 * To avoid doing too many short reads, we will reschedule for
6543+	 * another time if there are fewer than 32 bytes left in the buffer.
6544+ */
6545+ if (!CBUF_MAY_ADD(&con->cb, 32))
6546+ goto out_resched;
6547+
6548+ msg.msg_control = NULL;
6549+ msg.msg_controllen = 0;
6550+ msg.msg_iovlen = 1;
6551+ msg.msg_iov = iov;
6552+ msg.msg_name = NULL;
6553+ msg.msg_namelen = 0;
6554+ msg.msg_flags = 0;
6555+
6556+ /*
6557+ * iov[0] is the bit of the circular buffer between the current end
6558+ * point (cb.base + cb.len) and the end of the buffer.
6559+ */
6560+ iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
6561+ iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
6562+ iov[1].iov_len = 0;
6563+
6564+ /*
6565+ * iov[1] is the bit of the circular buffer between the start of the
6566+ * buffer and the start of the currently used section (cb.base)
6567+ */
6568+ if (CBUF_DATA(&con->cb) >= con->cb.base) {
6569+ iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
6570+ iov[1].iov_len = con->cb.base;
6571+ iov[1].iov_base = page_address(con->rx_page);
6572+ msg.msg_iovlen = 2;
6573+ }
6574+ len = iov[0].iov_len + iov[1].iov_len;
6575+
6576+ fs = get_fs();
6577+ set_fs(get_ds());
6578+ r = ret = sock_recvmsg(con->sock, &msg, len,
6579+ MSG_DONTWAIT | MSG_NOSIGNAL);
6580+ set_fs(fs);
6581+
6582+ if (ret <= 0)
6583+ goto out_close;
6584+ if (ret == len)
6585+ call_again_soon = 1;
6586+ CBUF_ADD(&con->cb, ret);
6587+ ret = midcomms_process_incoming_buffer(con->nodeid,
6588+ page_address(con->rx_page),
6589+ con->cb.base, con->cb.len,
6590+ PAGE_CACHE_SIZE);
6591+ if (ret == -EBADMSG) {
6592+ printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
6593+ "iov_len=%u, iov_base[0]=%p, read=%d\n",
6594+ page_address(con->rx_page), con->cb.base, con->cb.len,
6595+ len, iov[0].iov_base, r);
6596+ }
6597+ if (ret < 0)
6598+ goto out_close;
6599+ CBUF_EAT(&con->cb, ret);
6600+
6601+ if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
6602+ __free_page(con->rx_page);
6603+ con->rx_page = NULL;
6604+ }
6605+ out:
6606+ if (call_again_soon)
6607+ goto out_resched;
6608+ up_read(&con->sock_sem);
6609+ ret = 0;
6610+ goto out_ret;
6611+
6612+ out_resched:
6613+ lowcomms_data_ready(con->sock->sk, 0);
6614+ up_read(&con->sock_sem);
6615+ ret = 0;
6616+ goto out_ret;
6617+
6618+ out_close:
6619+ up_read(&con->sock_sem);
6620+ if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6621+ close_connection(con);
6622+ lowcomms_connect_sock(con);
6623+ }
6624+
6625+ out_ret:
6626+ return ret;
6627+}
6628+
6629+/* Listening socket is busy, accept a connection */
6630+static int accept_from_sock(struct connection *con)
6631+{
6632+ int result;
6633+ struct sockaddr_in6 peeraddr;
6634+ struct socket *newsock;
6635+ int len;
6636+ int nodeid;
6637+ struct connection *newcon;
6638+
6639+ memset(&peeraddr, 0, sizeof(peeraddr));
6640+ newsock = sock_alloc();
6641+ if (!newsock)
6642+ return -ENOMEM;
6643+
6644+ down_read(&con->sock_sem);
6645+
6646+ result = -ENOTCONN;
6647+ if (con->sock == NULL)
6648+ goto accept_err;
6649+
6650+ newsock->type = con->sock->type;
6651+ newsock->ops = con->sock->ops;
6652+
6653+ result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
6654+ if (result < 0)
6655+ goto accept_err;
6656+
6657+ /* Get the connected socket's peer */
6658+ if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
6659+ &len, 2)) {
6660+ result = -ECONNABORTED;
6661+ goto accept_err;
6662+ }
6663+
6664+ /* Get the new node's NODEID */
6665+ nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
6666+ if (nodeid == 0) {
6667+ printk("dlm: connect from non cluster node\n");
6668+ sock_release(newsock);
6669+ up_read(&con->sock_sem);
6670+ return -1;
6671+ }
6672+
6673+ log_print("got connection from %d", nodeid);
6674+
6675+ /* Check to see if we already have a connection to this node. This
6676+ * could happen if the two nodes initiate a connection at roughly
6677+ * the same time and the connections cross on the wire.
6678+ * TEMPORARY FIX:
6679+ * In this case we store the incoming one in "othersock"
6680+ */
6681+ newcon = nodeid2con(nodeid);
6682+ down_write(&newcon->sock_sem);
6683+ if (newcon->sock) {
6684+ struct connection *othercon;
6685+
6686+ othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
6687+ if (!othercon) {
6688+ printk("dlm: failed to allocate incoming socket\n");
6689+ sock_release(newsock);
6690+ up_write(&newcon->sock_sem);
6691+ up_read(&con->sock_sem);
6692+ goto accept_out;
6693+ }
6694+ memset(othercon, 0, sizeof(*othercon));
6695+ newcon->othersock = othercon;
6696+ othercon->nodeid = nodeid;
6697+ othercon->sock = newsock;
6698+ othercon->rx_action = receive_from_sock;
6699+ add_sock(newsock, othercon);
6700+ init_rwsem(&othercon->sock_sem);
6701+ set_bit(CF_IS_OTHERSOCK, &othercon->flags);
6702+ newsock->sk->sk_user_data = othercon;
6703+
6704+ up_write(&newcon->sock_sem);
6705+ lowcomms_data_ready(newsock->sk, 0);
6706+ up_read(&con->sock_sem);
6707+ goto accept_out;
6708+ }
6709+
6710+ newsock->sk->sk_user_data = newcon;
6711+ newcon->rx_action = receive_from_sock;
6712+ add_sock(newsock, newcon);
6713+ up_write(&newcon->sock_sem);
6714+
6715+ /*
6716+ * Add it to the active queue in case we got data
6717+	 * between processing the accept and adding the socket
6718+ * to the read_sockets list
6719+ */
6720+ lowcomms_data_ready(newsock->sk, 0);
6721+
6722+ up_read(&con->sock_sem);
6723+
6724+ accept_out:
6725+ return 0;
6726+
6727+ accept_err:
6728+ up_read(&con->sock_sem);
6729+ sock_release(newsock);
6730+
6731+ printk("dlm: error accepting connection from node: %d\n", result);
6732+ return result;
6733+}
6734+
6735+/* Connect a new socket to its peer */
6736+static int connect_to_sock(struct connection *con)
6737+{
6738+ int result = -EHOSTUNREACH;
6739+ struct sockaddr_in6 saddr;
6740+ int addr_len;
6741+ struct socket *sock;
6742+
6743+ if (con->nodeid == 0) {
6744+ log_print("attempt to connect sock 0 foiled");
6745+ return 0;
6746+ }
6747+
6748+ down_write(&con->sock_sem);
6749+ if (con->retries++ > MAX_CONNECT_RETRIES)
6750+ goto out;
6751+
6752+ // FIXME not sure this should happen, let alone like this.
6753+ if (con->sock) {
6754+ sock_release(con->sock);
6755+ con->sock = NULL;
6756+ }
6757+
6758+ /* Create a socket to communicate with */
6759+ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6760+ if (result < 0)
6761+ goto out_err;
6762+
6763+ if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
6764+ goto out_err;
6765+
6766+ sock->sk->sk_user_data = con;
6767+ con->rx_action = receive_from_sock;
6768+
6769+ make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
6770+
6771+ add_sock(sock, con);
6772+ result =
6773+ sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
6774+ O_NONBLOCK);
6775+ if (result == -EINPROGRESS)
6776+ result = 0;
6777+ if (result != 0)
6778+ goto out_err;
6779+
6780+ out:
6781+ up_write(&con->sock_sem);
6782+ /*
6783+ * Returning an error here means we've given up trying to connect to
6784+	 * a remote node, otherwise we return 0 and reschedule the connection
6785+ * attempt
6786+ */
6787+ return result;
6788+
6789+ out_err:
6790+ if (con->sock) {
6791+ sock_release(con->sock);
6792+ con->sock = NULL;
6793+ }
6794+ /*
6795+ * Some errors are fatal and this list might need adjusting. For other
6796+ * errors we try again until the max number of retries is reached.
6797+ */
6798+ if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
6799+	    result != -ENETDOWN && result != -EINVAL
6800+ && result != -EPROTONOSUPPORT) {
6801+ lowcomms_connect_sock(con);
6802+ result = 0;
6803+ }
6804+ goto out;
6805+}
6806+
6807+static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
6808+{
6809+ struct socket *sock = NULL;
6810+ mm_segment_t fs;
6811+ int result = 0;
6812+ int one = 1;
6813+ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
6814+
6815+ /* Create a socket to communicate with */
6816+ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6817+ if (result < 0) {
6818+ printk("dlm: Can't create listening comms socket\n");
6819+ goto create_out;
6820+ }
6821+
6822+ fs = get_fs();
6823+ set_fs(get_ds());
6824+ result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
6825+ set_fs(fs);
6826+ if (result < 0) {
6827+ printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
6828+ }
6829+ sock->sk->sk_user_data = con;
6830+ con->rx_action = accept_from_sock;
6831+ con->sock = sock;
6832+
6833+ /* Bind to our port */
6834+ make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
6835+ result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
6836+ if (result < 0) {
6837+ printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
6838+ sock_release(sock);
6839+ sock = NULL;
6840+ goto create_out;
6841+ }
6842+
6843+ fs = get_fs();
6844+ set_fs(get_ds());
6845+
6846+ result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
6847+ set_fs(fs);
6848+ if (result < 0) {
6849+ printk("dlm: Set keepalive failed: %d\n", result);
6850+ }
6851+
6852+ result = sock->ops->listen(sock, 5);
6853+ if (result < 0) {
6854+ printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
6855+ sock_release(sock);
6856+ sock = NULL;
6857+ goto create_out;
6858+ }
6859+
6860+ create_out:
6861+ return sock;
6862+}
6863+
6864+
6865+/* Listen on all interfaces */
6866+static int listen_for_all(void)
6867+{
6868+ int result = 0;
6869+ int nodeid;
6870+ struct socket *sock = NULL;
6871+ struct list_head *addr_list;
6872+ struct connection *con = nodeid2con(0);
6873+ struct cluster_node_addr *node_addr;
6874+ char local_addr[sizeof(struct sockaddr_in6)];
6875+
6876+ /* This will also fill in local_addr */
6877+ nodeid = lowcomms_our_nodeid();
6878+
6879+ addr_list = kcl_get_node_addresses(nodeid);
6880+ if (!addr_list) {
6881+ printk("dlm: cannot initialise comms layer\n");
6882+ result = -ENOTCONN;
6883+ goto create_out;
6884+ }
6885+
6886+ list_for_each_entry(node_addr, addr_list, list) {
6887+
6888+ if (!con) {
6889+ con = kmalloc(sizeof(struct connection), GFP_KERNEL);
6890+ if (!con) {
6891+ printk("dlm: failed to allocate listen socket\n");
6892+ goto create_out;
6893+ }
6894+ memset(con, 0, sizeof(*con));
6895+ init_rwsem(&con->sock_sem);
6896+ spin_lock_init(&con->writequeue_lock);
6897+ INIT_LIST_HEAD(&con->writequeue);
6898+ set_bit(CF_IS_OTHERSOCK, &con->flags);
6899+ }
6900+
6901+ memcpy(local_addr, node_addr->addr, node_addr->addr_len);
6902+ sock = create_listen_sock(con, local_addr,
6903+ node_addr->addr_len);
6904+ if (sock) {
6905+ add_sock(sock, con);
6906+ }
6907+ else {
6908+ kfree(con);
6909+ }
6910+
6911+ /* Keep a list of dynamically allocated listening sockets
6912+ so we can free them at shutdown */
6913+ if (test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6914+ list_add_tail(&con->listenlist, &listen_sockets);
6915+ }
6916+ con = NULL;
6917+ }
6918+
6919+ create_out:
6920+ return result;
6921+}
6922+
6923+
6924+
6925+static struct writequeue_entry *new_writequeue_entry(struct connection *con,
6926+ int allocation)
6927+{
6928+ struct writequeue_entry *entry;
6929+
6930+ entry = kmalloc(sizeof(struct writequeue_entry), allocation);
6931+ if (!entry)
6932+ return NULL;
6933+
6934+ entry->page = alloc_page(allocation);
6935+ if (!entry->page) {
6936+ kfree(entry);
6937+ return NULL;
6938+ }
6939+
6940+ entry->offset = 0;
6941+ entry->len = 0;
6942+ entry->end = 0;
6943+ entry->users = 0;
6944+ entry->con = con;
6945+
6946+ return entry;
6947+}
6948+
6949+struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
6950+ int allocation, char **ppc)
6951+{
6952+ struct connection *con = nodeid2con(nodeid);
6953+ struct writequeue_entry *e;
6954+ int offset = 0;
6955+ int users = 0;
6956+
6957+ if (!atomic_read(&accepting))
6958+ return NULL;
6959+
6960+ spin_lock(&con->writequeue_lock);
6961+ e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
6962+ if (((struct list_head *) e == &con->writequeue) ||
6963+ (PAGE_CACHE_SIZE - e->end < len)) {
6964+ e = NULL;
6965+ } else {
6966+ offset = e->end;
6967+ e->end += len;
6968+ users = e->users++;
6969+ }
6970+ spin_unlock(&con->writequeue_lock);
6971+
6972+ if (e) {
6973+ got_one:
6974+ if (users == 0)
6975+ kmap(e->page);
6976+ *ppc = page_address(e->page) + offset;
6977+ return e;
6978+ }
6979+
6980+ e = new_writequeue_entry(con, allocation);
6981+ if (e) {
6982+ spin_lock(&con->writequeue_lock);
6983+ offset = e->end;
6984+ e->end += len;
6985+ users = e->users++;
6986+ list_add_tail(&e->list, &con->writequeue);
6987+ spin_unlock(&con->writequeue_lock);
6988+ atomic_inc(&writequeue_length);
6989+ goto got_one;
6990+ }
6991+ return NULL;
6992+}
6993+
6994+void lowcomms_commit_buffer(struct writequeue_entry *e)
6995+{
6996+ struct connection *con = e->con;
6997+ int users;
6998+
6999+ if (!atomic_read(&accepting))
7000+ return;
7001+
7002+ spin_lock(&con->writequeue_lock);
7003+ users = --e->users;
7004+ if (users)
7005+ goto out;
7006+ e->len = e->end - e->offset;
7007+ kunmap(e->page);
7008+ spin_unlock(&con->writequeue_lock);
7009+
7010+ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7011+ spin_lock_bh(&write_sockets_lock);
7012+ list_add_tail(&con->write_list, &write_sockets);
7013+ spin_unlock_bh(&write_sockets_lock);
7014+
7015+ wake_up_interruptible(&lowcomms_send_waitq);
7016+ }
7017+ return;
7018+
7019+ out:
7020+ spin_unlock(&con->writequeue_lock);
7021+ return;
7022+}
7023+
7024+static void free_entry(struct writequeue_entry *e)
7025+{
7026+ __free_page(e->page);
7027+ kfree(e);
7028+ atomic_dec(&writequeue_length);
7029+}
7030+
7031+/* Send a message */
7032+static int send_to_sock(struct connection *con)
7033+{
7034+ int ret = 0;
7035+ ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7036+ const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7037+ struct writequeue_entry *e;
7038+ int len, offset;
7039+
7040+ down_read(&con->sock_sem);
7041+ if (con->sock == NULL)
7042+ goto out_connect;
7043+
7044+ sendpage = con->sock->ops->sendpage;
7045+
7046+ spin_lock(&con->writequeue_lock);
7047+ for (;;) {
7048+ e = list_entry(con->writequeue.next, struct writequeue_entry,
7049+ list);
7050+ if ((struct list_head *) e == &con->writequeue)
7051+ break;
7052+
7053+ len = e->len;
7054+ offset = e->offset;
7055+ BUG_ON(len == 0 && e->users == 0);
7056+ spin_unlock(&con->writequeue_lock);
7057+
7058+ ret = 0;
7059+ if (len) {
7060+ ret = sendpage(con->sock, e->page, offset, len,
7061+ msg_flags);
7062+ if (ret == -EAGAIN || ret == 0)
7063+ goto out;
7064+ if (ret <= 0)
7065+ goto send_error;
7066+ }
7067+
7068+ spin_lock(&con->writequeue_lock);
7069+ e->offset += ret;
7070+ e->len -= ret;
7071+
7072+ if (e->len == 0 && e->users == 0) {
7073+ list_del(&e->list);
7074+ free_entry(e);
7075+ continue;
7076+ }
7077+ }
7078+ spin_unlock(&con->writequeue_lock);
7079+ out:
7080+ up_read(&con->sock_sem);
7081+ return ret;
7082+
7083+ send_error:
7084+ up_read(&con->sock_sem);
7085+ close_connection(con);
7086+ lowcomms_connect_sock(con);
7087+ return ret;
7088+
7089+ out_connect:
7090+ up_read(&con->sock_sem);
7091+ lowcomms_connect_sock(con);
7092+ return 0;
7093+}
7094+
7095+/* Called from recoverd when it knows that a node has
7096+ left the cluster */
7097+int lowcomms_close(int nodeid)
7098+{
7099+ struct connection *con;
7100+
7101+ if (!connections)
7102+ goto out;
7103+
7104+ con = nodeid2con(nodeid);
7105+ if (con->sock) {
7106+ close_connection(con);
7107+ return 0;
7108+ }
7109+
7110+ out:
7111+ return -1;
7112+}
7113+
7114+/* API send message call, may queue the request */
7115+/* N.B. This is the old interface - use the new one for new calls */
7116+int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7117+{
7118+ struct writequeue_entry *e;
7119+ char *b;
7120+
7121+ GDLM_ASSERT(nodeid < dlm_config.max_connections,
7122+ printk("nodeid=%u\n", nodeid););
7123+
7124+ e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7125+ if (e) {
7126+ memcpy(b, buf, len);
7127+ lowcomms_commit_buffer(e);
7128+ return 0;
7129+ }
7130+ return -ENOBUFS;
7131+}
7132+
7133+/* Look for activity on active sockets */
7134+static void process_sockets(void)
7135+{
7136+ struct list_head *list;
7137+ struct list_head *temp;
7138+
7139+ spin_lock_bh(&read_sockets_lock);
7140+ list_for_each_safe(list, temp, &read_sockets) {
7141+ struct connection *con =
7142+ list_entry(list, struct connection, read_list);
7143+ list_del(&con->read_list);
7144+ clear_bit(CF_READ_PENDING, &con->flags);
7145+
7146+ spin_unlock_bh(&read_sockets_lock);
7147+
7148+ con->rx_action(con);
7149+
7150+ /* Don't starve out everyone else */
7151+ schedule();
7152+ spin_lock_bh(&read_sockets_lock);
7153+ }
7154+ spin_unlock_bh(&read_sockets_lock);
7155+}
7156+
7157+/* Try to send any messages that are pending
7158+ */
7159+static void process_output_queue(void)
7160+{
7161+ struct list_head *list;
7162+ struct list_head *temp;
7163+ int ret;
7164+
7165+ spin_lock_bh(&write_sockets_lock);
7166+ list_for_each_safe(list, temp, &write_sockets) {
7167+ struct connection *con =
7168+ list_entry(list, struct connection, write_list);
7169+ list_del(&con->write_list);
7170+ clear_bit(CF_WRITE_PENDING, &con->flags);
7171+
7172+ spin_unlock_bh(&write_sockets_lock);
7173+
7174+ ret = send_to_sock(con);
7175+ if (ret < 0) {
7176+ }
7177+ spin_lock_bh(&write_sockets_lock);
7178+ }
7179+ spin_unlock_bh(&write_sockets_lock);
7180+}
7181+
7182+static void process_state_queue(void)
7183+{
7184+ struct list_head *list;
7185+ struct list_head *temp;
7186+ int ret;
7187+
7188+ spin_lock_bh(&state_sockets_lock);
7189+ list_for_each_safe(list, temp, &state_sockets) {
7190+ struct connection *con =
7191+ list_entry(list, struct connection, state_list);
7192+ list_del(&con->state_list);
7193+ clear_bit(CF_CONNECT_PENDING, &con->flags);
7194+ spin_unlock_bh(&state_sockets_lock);
7195+
7196+ ret = connect_to_sock(con);
7197+ if (ret < 0) {
7198+ }
7199+ spin_lock_bh(&state_sockets_lock);
7200+ }
7201+ spin_unlock_bh(&state_sockets_lock);
7202+}
7203+
7204+/* Discard all entries on the write queues */
7205+static void clean_writequeues(void)
7206+{
7207+ struct list_head *list;
7208+ struct list_head *temp;
7209+ int nodeid;
7210+
7211+ for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
7212+ struct connection *con = nodeid2con(nodeid);
7213+
7214+ spin_lock(&con->writequeue_lock);
7215+ list_for_each_safe(list, temp, &con->writequeue) {
7216+ struct writequeue_entry *e =
7217+ list_entry(list, struct writequeue_entry, list);
7218+ list_del(&e->list);
7219+ free_entry(e);
7220+ }
7221+ spin_unlock(&con->writequeue_lock);
7222+ }
7223+}
7224+
7225+static int read_list_empty(void)
7226+{
7227+ int status;
7228+
7229+ spin_lock_bh(&read_sockets_lock);
7230+ status = list_empty(&read_sockets);
7231+ spin_unlock_bh(&read_sockets_lock);
7232+
7233+ return status;
7234+}
7235+
7236+/* DLM Transport comms receive daemon */
7237+static int dlm_recvd(void *data)
7238+{
7239+ daemonize("dlm_recvd");
7240+ atomic_set(&recv_run, 1);
7241+
7242+ init_waitqueue_head(&lowcomms_recv_waitq);
7243+ init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7244+ add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7245+
7246+ complete(&thread_completion);
7247+
7248+ while (atomic_read(&recv_run)) {
7249+
7250+ set_task_state(current, TASK_INTERRUPTIBLE);
7251+
7252+ if (read_list_empty())
7253+ schedule();
7254+
7255+ set_task_state(current, TASK_RUNNING);
7256+
7257+ process_sockets();
7258+ }
7259+
7260+ down(&thread_lock);
7261+ up(&thread_lock);
7262+
7263+ complete(&thread_completion);
7264+
7265+ return 0;
7266+}
7267+
7268+static int write_and_state_lists_empty(void)
7269+{
7270+ int status;
7271+
7272+ spin_lock_bh(&write_sockets_lock);
7273+ status = list_empty(&write_sockets);
7274+ spin_unlock_bh(&write_sockets_lock);
7275+
7276+ spin_lock_bh(&state_sockets_lock);
7277+ if (list_empty(&state_sockets) == 0)
7278+ status = 0;
7279+ spin_unlock_bh(&state_sockets_lock);
7280+
7281+ return status;
7282+}
7283+
7284+/* DLM Transport send daemon */
7285+static int dlm_sendd(void *data)
7286+{
7287+ daemonize("dlm_sendd");
7288+ atomic_set(&send_run, 1);
7289+
7290+ init_waitqueue_head(&lowcomms_send_waitq);
7291+ init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7292+ add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7293+
7294+ complete(&thread_completion);
7295+
7296+ while (atomic_read(&send_run)) {
7297+
7298+ set_task_state(current, TASK_INTERRUPTIBLE);
7299+
7300+ if (write_and_state_lists_empty())
7301+ schedule();
7302+
7303+ set_task_state(current, TASK_RUNNING);
7304+
7305+ process_state_queue();
7306+ process_output_queue();
7307+ }
7308+
7309+ down(&thread_lock);
7310+ up(&thread_lock);
7311+
7312+ complete(&thread_completion);
7313+
7314+ return 0;
7315+}
7316+
7317+static void daemons_stop(void)
7318+{
7319+ if (atomic_read(&recv_run)) {
7320+ down(&thread_lock);
7321+ atomic_set(&recv_run, 0);
7322+ wake_up_interruptible(&lowcomms_recv_waitq);
7323+ up(&thread_lock);
7324+ wait_for_completion(&thread_completion);
7325+ }
7326+
7327+ if (atomic_read(&send_run)) {
7328+ down(&thread_lock);
7329+ atomic_set(&send_run, 0);
7330+ wake_up_interruptible(&lowcomms_send_waitq);
7331+ up(&thread_lock);
7332+ wait_for_completion(&thread_completion);
7333+ }
7334+}
7335+
7336+static int daemons_start(void)
7337+{
7338+ int error;
7339+
7340+ error = kernel_thread(dlm_recvd, NULL, 0);
7341+ if (error < 0) {
7342+ log_print("can't start recvd thread: %d", error);
7343+ goto out;
7344+ }
7345+ wait_for_completion(&thread_completion);
7346+
7347+ error = kernel_thread(dlm_sendd, NULL, 0);
7348+ if (error < 0) {
7349+ log_print("can't start sendd thread: %d", error);
7350+ daemons_stop();
7351+ goto out;
7352+ }
7353+ wait_for_completion(&thread_completion);
7354+
7355+ error = 0;
7356+ out:
7357+ return error;
7358+}
7359+
7360+/*
7361+ * Return the largest buffer size we can cope with.
7362+ */
7363+int lowcomms_max_buffer_size(void)
7364+{
7365+ return PAGE_CACHE_SIZE;
7366+}
7367+
7368+void lowcomms_stop(void)
7369+{
7370+ int i;
7371+ struct connection *temp;
7372+ struct connection *lcon;
7373+
7374+ atomic_set(&accepting, 0);
7375+
7376+ /* Set all the activity flags to prevent any
7377+ socket activity.
7378+ */
7379+ for (i = 0; i < conn_array_size; i++) {
7380+ connections[i].flags = 0x7;
7381+ }
7382+ daemons_stop();
7383+ clean_writequeues();
7384+
7385+ for (i = 0; i < conn_array_size; i++) {
7386+ close_connection(nodeid2con(i));
7387+ }
7388+
7389+ kfree(connections);
7390+ connections = NULL;
7391+
7392+ /* Free up any dynamically allocated listening sockets */
7393+ list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
7394+ sock_release(lcon->sock);
7395+ kfree(lcon);
7396+ }
7397+
7398+ kcl_releaseref_cluster();
7399+}
7400+
7401+/* This is quite likely to sleep... */
7402+int lowcomms_start(void)
7403+{
7404+ int error = 0;
7405+ int i;
7406+
7407+ INIT_LIST_HEAD(&read_sockets);
7408+ INIT_LIST_HEAD(&write_sockets);
7409+ INIT_LIST_HEAD(&state_sockets);
7410+ INIT_LIST_HEAD(&listen_sockets);
7411+
7412+ spin_lock_init(&read_sockets_lock);
7413+ spin_lock_init(&write_sockets_lock);
7414+ spin_lock_init(&state_sockets_lock);
7415+
7416+ init_completion(&thread_completion);
7417+ init_MUTEX(&thread_lock);
7418+ atomic_set(&send_run, 0);
7419+ atomic_set(&recv_run, 0);
7420+
7421+ error = -ENOTCONN;
7422+ if (kcl_addref_cluster())
7423+ goto out;
7424+
7425+ /*
7426+ * Temporarily initialise the waitq head so that lowcomms_send_message
7427+ * doesn't crash if it gets called before the thread is fully
7428+ * initialised
7429+ */
7430+ init_waitqueue_head(&lowcomms_send_waitq);
7431+
7432+ error = -ENOMEM;
7433+
7434+ connections = kmalloc(sizeof(struct connection) *
7435+ dlm_config.max_connections, GFP_KERNEL);
7436+ if (!connections)
7437+ goto out;
7438+
7439+ memset(connections, 0,
7440+ sizeof(struct connection) * dlm_config.max_connections);
7441+ for (i = 0; i < dlm_config.max_connections; i++) {
7442+ connections[i].nodeid = i;
7443+ init_rwsem(&connections[i].sock_sem);
7444+ INIT_LIST_HEAD(&connections[i].writequeue);
7445+ spin_lock_init(&connections[i].writequeue_lock);
7446+ }
7447+ conn_array_size = dlm_config.max_connections;
7448+
7449+ /* Start listening */
7450+ error = listen_for_all();
7451+ if (error)
7452+ goto fail_free_conn;
7453+
7454+ error = daemons_start();
7455+ if (error)
7456+ goto fail_free_conn;
7457+
7458+ atomic_set(&accepting, 1);
7459+
7460+ return 0;
7461+
7462+ fail_free_conn:
7463+ kfree(connections);
7464+
7465+ out:
7466+ return error;
7467+}
7468+
7469+/* Don't accept any more outgoing work */
7470+void lowcomms_stop_accept()
7471+{
7472+ atomic_set(&accepting, 0);
7473+}
7474+
7475+/* Cluster Manager interface functions for mapping between
7476+   nodeids and IP addresses
7477+*/
7478+
7479+/* Return the IP address of a node given its NODEID */
7480+static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
7481+{
7482+ struct list_head *addrs;
7483+ struct cluster_node_addr *node_addr;
7484+ struct cluster_node_addr *current_addr = NULL;
7485+ struct sockaddr_in6 *saddr;
7486+ int interface;
7487+ int i;
7488+
7489+ addrs = kcl_get_node_addresses(nodeid);
7490+ if (!addrs)
7491+ return -1;
7492+
7493+ interface = kcl_get_current_interface();
7494+
7495+ /* Look for address number <interface> */
7496+ i=0; /* i/f numbers start at 1 */
7497+ list_for_each_entry(node_addr, addrs, list) {
7498+ if (interface == ++i) {
7499+ current_addr = node_addr;
7500+ break;
7501+ }
7502+ }
7503+
7504+ /* If that failed then just use the first one */
7505+ if (!current_addr)
7506+ current_addr = (struct cluster_node_addr *)addrs->next;
7507+
7508+ saddr = (struct sockaddr_in6 *)current_addr->addr;
7509+
7510+ /* Extract the IP address */
7511+ if (saddr->sin6_family == AF_INET) {
7512+ struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
7513+ struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
7514+ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
7515+ }
7516+ else {
7517+ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
7518+ memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
7519+ }
7520+
7521+ return 0;
7522+}
7523+
7524+/* Return the NODEID for a node given its sockaddr */
7525+static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
7526+{
7527+ struct kcl_cluster_node node;
7528+ struct sockaddr_in6 ipv6_addr;
7529+ struct sockaddr_in ipv4_addr;
7530+
7531+ if (addr->sa_family == AF_INET) {
7532+ struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
7533+ memcpy(&ipv4_addr, &local_addr, addr_len);
7534+ memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
7535+
7536+ addr = (struct sockaddr *)&ipv4_addr;
7537+ }
7538+ else {
7539+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
7540+ memcpy(&ipv6_addr, &local_addr, addr_len);
7541+ memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
7542+
7543+ addr = (struct sockaddr *)&ipv6_addr;
7544+ }
7545+
7546+ if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
7547+ return node.node_id;
7548+ else
7549+ return 0;
7550+}
7551+
7552+int lowcomms_our_nodeid(void)
7553+{
7554+ struct kcl_cluster_node node;
7555+ struct list_head *addrs;
7556+ struct cluster_node_addr *first_addr;
7557+ static int our_nodeid = 0;
7558+
7559+ if (our_nodeid)
7560+ return our_nodeid;
7561+
7562+ if (kcl_get_node_by_nodeid(0, &node) == -1)
7563+ return 0;
7564+
7565+ our_nodeid = node.node_id;
7566+
7567+ /* Fill in the "template" structure */
7568+ addrs = kcl_get_node_addresses(our_nodeid);
7569+ if (!addrs)
7570+ return 0;
7571+
7572+ first_addr = (struct cluster_node_addr *) addrs->next;
7573+ memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
7574+
7575+ return node.node_id;
7576+}
7577+/*
7578+ * Overrides for Emacs so that we follow Linus's tabbing style.
7579+ * Emacs will notice this stuff at the end of the file and automatically
7580+ * adjust the settings for this buffer only. This must remain at the end
7581+ * of the file.
7582+ * ---------------------------------------------------------------------------
7583+ * Local variables:
7584+ * c-file-style: "linux"
7585+ * End:
7586+ */
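
The CBUF_* macros near the top of lowcomms.c treat the receive page as a
power-of-two ring: cb.base is the first unconsumed byte, cb.len the number of
bytes held, and CBUF_DATA() the offset where the next read lands.
receive_from_sock() calls CBUF_ADD() after recvmsg, lets midcomms consume what
it can, then CBUF_EAT()s the consumed part.  A worked sketch of the
arithmetic; not part of the patch, sizes chosen only for illustration,
assuming a 4096-byte PAGE_CACHE_SIZE (it only compiles where those macros are
visible, i.e. inside lowcomms.c):

	static void cbuf_example(void)
	{
		struct cbuf cb;

		CBUF_INIT(&cb, 4096);	/* base=0, len=0, mask=4095 */
		CBUF_ADD(&cb, 4000);	/* 4000 bytes received; CBUF_DATA(&cb) == 4000 */
		CBUF_EAT(&cb, 3900);	/* midcomms consumed 3900; base=3900, len=100 */
		CBUF_ADD(&cb, 200);	/* len=300; CBUF_DATA(&cb) == (3900+300) & 4095 == 104,
					   so the next read wraps to the start of the page */
	}
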
7587diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
7588--- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7589+++ linux-patched/cluster/dlm/lowcomms.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 7590@@ -0,0 +1,34 @@
7591+/******************************************************************************
7592+*******************************************************************************
7593+**
7594+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7595+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7596+**
7597+** This copyrighted material is made available to anyone wishing to use,
7598+** modify, copy, or redistribute it subject to the terms and conditions
7599+** of the GNU General Public License v.2.
7600+**
7601+*******************************************************************************
7602+******************************************************************************/
7603+
7604+#ifndef __LOWCOMMS_DOT_H__
7605+#define __LOWCOMMS_DOT_H__
7606+
7607+/* The old interface */
7608+int lowcomms_send_message(int csid, char *buf, int len, int allocation);
7609+
7610+/* The new interface */
7611+struct writequeue_entry;
7612+extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7613+ int allocation, char **ppc);
7614+extern void lowcomms_commit_buffer(struct writequeue_entry *e);
7615+
7616+int lowcomms_start(void);
7617+void lowcomms_stop(void);
7618+void lowcomms_stop_accept(void);
7619+int lowcomms_close(int nodeid);
7620+int lowcomms_max_buffer_size(void);
7621+
7622+int lowcomms_our_nodeid(void);
7623+
7624+#endif /* __LOWCOMMS_DOT_H__ */
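The "new interface" declared here is a two-step send path: lowcomms_get_buffer() reserves space directly in the destination node's write queue, the caller builds the message in place, and lowcomms_commit_buffer() (or midcomms_send_buffer(), which byte-swaps first) makes it eligible for transmission; dlm_query() later in this patch uses it exactly this way. The following is a rough user-space analogue of the reserve/commit idea only; the struct and function names and the fixed 4 KiB page are illustrative, and the real implementation chains pages per node and honours allocation flags.

#include <stdio.h>
#include <string.h>

struct writequeue {
	char page[4096];
	size_t reserved;	/* bytes handed out to callers */
	size_t committed;	/* bytes the sender may transmit */
};

static char *wq_get_buffer(struct writequeue *wq, size_t len)
{
	char *p;

	if (wq->reserved + len > sizeof(wq->page))
		return NULL;	/* the real code would start a new page */
	p = wq->page + wq->reserved;
	wq->reserved += len;
	return p;
}

static void wq_commit_buffer(struct writequeue *wq)
{
	wq->committed = wq->reserved;
}

int main(void)
{
	struct writequeue wq = { .reserved = 0, .committed = 0 };
	char *msg = wq_get_buffer(&wq, 32);

	if (msg) {
		memcpy(msg, "hello", 6);	/* build the message in place */
		wq_commit_buffer(&wq);
	}
	printf("committed %zu bytes\n", wq.committed);
	return 0;
}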
7625diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
7626--- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7627+++ linux-patched/cluster/dlm/main.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 7628@@ -0,0 +1,98 @@
7629+/******************************************************************************
7630+*******************************************************************************
7631+**
7632+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7633+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7634+**
7635+** This copyrighted material is made available to anyone wishing to use,
7636+** modify, copy, or redistribute it subject to the terms and conditions
7637+** of the GNU General Public License v.2.
7638+**
7639+*******************************************************************************
7640+******************************************************************************/
7641+
7642+#define EXPORT_SYMTAB
7643+
7644+#include <linux/init.h>
7645+#include <linux/proc_fs.h>
7646+#include <linux/ctype.h>
7647+#include <linux/seq_file.h>
7648+#include <linux/module.h>
7649+#include <net/sock.h>
7650+
7651+#include <cluster/cnxman.h>
7652+
7653+#include "dlm_internal.h"
7654+#include "lockspace.h"
7655+#include "recoverd.h"
7656+#include "ast.h"
7657+#include "lkb.h"
7658+#include "nodes.h"
7659+#include "locking.h"
7660+#include "config.h"
7661+#include "memory.h"
7662+#include "recover.h"
7663+#include "lowcomms.h"
7664+
7665+int dlm_device_init(void);
7666+void dlm_device_exit(void);
7667+void dlm_proc_init(void);
7668+void dlm_proc_exit(void);
7669+
7670+
7671+/* Cluster manager callbacks; we want to know if a node dies.
7672+ N.B. this is independent of lockspace-specific event callbacks from SM */
7673+
7674+static void cman_callback(kcl_callback_reason reason, long arg)
7675+{
7676+ if (reason == DIED) {
7677+ lowcomms_close((int) arg);
7678+ }
7679+
7680+ /* This is unconditional, so do what we can to tidy up */
7681+ if (reason == LEAVING) {
7682+ dlm_emergency_shutdown();
7683+ }
7684+}
7685+
7686+int __init init_dlm(void)
7687+{
7688+ dlm_proc_init();
7689+ dlm_lockspace_init();
7690+ dlm_recoverd_init();
7691+ dlm_nodes_init();
7692+ dlm_device_init();
7693+ dlm_memory_init();
7694+ dlm_config_init();
7695+
7696+ kcl_add_callback(cman_callback);
7697+
7698+ printk("DLM %s (built %s %s) installed\n",
7699+ DLM_RELEASE_NAME, __DATE__, __TIME__);
7700+
7701+ return 0;
7702+}
7703+
7704+void __exit exit_dlm(void)
7705+{
7706+ kcl_remove_callback(cman_callback);
7707+
7708+ dlm_device_exit();
7709+ dlm_memory_exit();
7710+ dlm_config_exit();
7711+ dlm_proc_exit();
7712+}
7713+
7714+MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
7715+MODULE_AUTHOR("Red Hat, Inc.");
7716+MODULE_LICENSE("GPL");
7717+
7718+module_init(init_dlm);
7719+module_exit(exit_dlm);
7720+
7721+EXPORT_SYMBOL(dlm_init);
7722+EXPORT_SYMBOL(dlm_release);
7723+EXPORT_SYMBOL(dlm_new_lockspace);
7724+EXPORT_SYMBOL(dlm_release_lockspace);
7725+EXPORT_SYMBOL(dlm_lock);
7726+EXPORT_SYMBOL(dlm_unlock);
7727diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
7728--- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7729+++ linux-patched/cluster/dlm/memory.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 7730@@ -0,0 +1,238 @@
7731+/******************************************************************************
7732+*******************************************************************************
7733+**
7734+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7735+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7736+**
7737+** This copyrighted material is made available to anyone wishing to use,
7738+** modify, copy, or redistribute it subject to the terms and conditions
7739+** of the GNU General Public License v.2.
7740+**
7741+*******************************************************************************
7742+******************************************************************************/
7743+
7744+/* memory.c
7745+ *
7746+ * memory allocation routines
7747+ *
7748+ */
7749+
7750+#include "dlm_internal.h"
7751+#include "memory.h"
7752+#include "config.h"
7753+
7754+/* as the man says...Shouldn't this be in a header file somewhere? */
7755+#define BYTES_PER_WORD sizeof(void *)
7756+
7757+static kmem_cache_t *rsb_cache_small;
7758+static kmem_cache_t *rsb_cache_large;
7759+static kmem_cache_t *lkb_cache;
7760+static kmem_cache_t *lvb_cache;
7761+static kmem_cache_t *resdir_cache_large;
7762+static kmem_cache_t *resdir_cache_small;
7763+
7764+/* The thresholds above which we allocate large RSBs/resdatas rather than small
7765+ * ones. This must make the resultant structure end on a word boundary */
7766+#define LARGE_RSB_NAME 28
7767+#define LARGE_RES_NAME 28
7768+
7769+int dlm_memory_init()
7770+{
7771+ int ret = -ENOMEM;
7772+
7773+
7774+ rsb_cache_small =
7775+ kmem_cache_create("dlm_rsb(small)",
7776+ (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7777+ __alignof__(gd_res_t), 0, NULL, NULL);
7778+ if (!rsb_cache_small)
7779+ goto out;
7780+
7781+ rsb_cache_large =
7782+ kmem_cache_create("dlm_rsb(large)",
7783+ sizeof(gd_res_t) + DLM_RESNAME_MAXLEN,
7784+ __alignof__(gd_res_t), 0, NULL, NULL);
7785+ if (!rsb_cache_large)
7786+ goto out_free_rsbs;
7787+
7788+ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t),
7789+ __alignof__(gd_lkb_t), 0, NULL, NULL);
7790+ if (!lkb_cache)
7791+ goto out_free_rsbl;
7792+
7793+ resdir_cache_large =
7794+ kmem_cache_create("dlm_resdir(l)",
7795+ sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN,
7796+ __alignof__(gd_resdata_t), 0, NULL, NULL);
7797+ if (!resdir_cache_large)
7798+ goto out_free_lkb;
7799+
7800+ resdir_cache_small =
7801+ kmem_cache_create("dlm_resdir(s)",
7802+ (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7803+ __alignof__(gd_resdata_t), 0, NULL, NULL);
7804+ if (!resdir_cache_small)
7805+ goto out_free_resl;
7806+
7807+ /* LVB cache also holds ranges, so should be 64bit aligned */
7808+ lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
7809+ __alignof__(uint64_t), 0, NULL, NULL);
7810+ if (!lvb_cache)
7811+ goto out_free_ress;
7812+
7813+ ret = 0;
7814+ goto out;
7815+
7816+ out_free_ress:
7817+ kmem_cache_destroy(resdir_cache_small);
7818+
7819+ out_free_resl:
7820+ kmem_cache_destroy(resdir_cache_large);
7821+
7822+ out_free_lkb:
7823+ kmem_cache_destroy(lkb_cache);
7824+
7825+ out_free_rsbl:
7826+ kmem_cache_destroy(rsb_cache_large);
7827+
7828+ out_free_rsbs:
7829+ kmem_cache_destroy(rsb_cache_small);
7830+
7831+ out:
7832+ return ret;
7833+}
7834+
7835+void dlm_memory_exit()
7836+{
7837+ kmem_cache_destroy(rsb_cache_large);
7838+ kmem_cache_destroy(rsb_cache_small);
7839+ kmem_cache_destroy(lkb_cache);
7840+ kmem_cache_destroy(resdir_cache_small);
7841+ kmem_cache_destroy(resdir_cache_large);
7842+ kmem_cache_destroy(lvb_cache);
7843+}
7844+
7845+gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen)
7846+{
7847+ gd_res_t *r;
7848+
7849+ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7850+
7851+ if (namelen >= LARGE_RSB_NAME)
7852+ r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
7853+ else
7854+ r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
7855+
7856+ if (r)
7857+ memset(r, 0, sizeof(gd_res_t) + namelen);
7858+
7859+ return r;
7860+}
7861+
7862+void free_rsb(gd_res_t *r)
7863+{
7864+ int length = r->res_length;
7865+
7866+#ifdef POISON
7867+ memset(r, 0x55, sizeof(gd_res_t) + r->res_length);
7868+#endif
7869+
7870+ if (length >= LARGE_RSB_NAME)
7871+ kmem_cache_free(rsb_cache_large, r);
7872+ else
7873+ kmem_cache_free(rsb_cache_small, r);
7874+}
7875+
7876+gd_lkb_t *allocate_lkb(gd_ls_t *ls)
7877+{
7878+ gd_lkb_t *l;
7879+
7880+ l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
7881+ if (l)
7882+ memset(l, 0, sizeof(gd_lkb_t));
7883+
7884+ return l;
7885+}
7886+
7887+void free_lkb(gd_lkb_t *l)
7888+{
7889+#ifdef POISON
7890+ memset(l, 0xAA, sizeof(gd_lkb_t));
7891+#endif
7892+ kmem_cache_free(lkb_cache, l);
7893+}
7894+
7895+gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen)
7896+{
7897+ gd_resdata_t *rd;
7898+
7899+ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7900+
7901+ if (namelen >= LARGE_RES_NAME)
7902+ rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
7903+ else
7904+ rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
7905+
7906+ if (rd)
7907+ memset(rd, 0, sizeof(gd_resdata_t));
7908+
7909+ return rd;
7910+}
7911+
7912+void free_resdata(gd_resdata_t *rd)
7913+{
7914+ if (rd->rd_length >= LARGE_RES_NAME)
7915+ kmem_cache_free(resdir_cache_large, rd);
7916+ else
7917+ kmem_cache_free(resdir_cache_small, rd);
7918+}
7919+
7920+char *allocate_lvb(gd_ls_t *ls)
7921+{
7922+ char *l;
7923+
7924+ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7925+ if (l)
7926+ memset(l, 0, DLM_LVB_LEN);
7927+
7928+ return l;
7929+}
7930+
7931+void free_lvb(char *l)
7932+{
7933+ kmem_cache_free(lvb_cache, l);
7934+}
7935+
7936+/* Ranges are allocated from the LVB cache as they are the same size (4x64
7937+ * bits) */
7938+uint64_t *allocate_range(gd_ls_t * ls)
7939+{
7940+ uint64_t *l;
7941+
7942+ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
7943+ if (l)
7944+ memset(l, 0, DLM_LVB_LEN);
7945+
7946+ return l;
7947+}
7948+
7949+void free_range(uint64_t *l)
7950+{
7951+ kmem_cache_free(lvb_cache, l);
7952+}
7953+
7954+gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls)
7955+{
7956+ gd_rcom_t *rc;
7957+
7958+ rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
7959+ if (rc)
7960+ memset(rc, 0, dlm_config.buffer_size);
7961+
7962+ return rc;
7963+}
7964+
7965+void free_rcom_buffer(gd_rcom_t *rc)
7966+{
7967+ kfree(rc);
7968+}
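The small caches above round their object size up to a whole number of machine words with (size + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1), so an embedded short resource name still leaves the structure ending on a word boundary. A stand-alone sketch of that rounding follows; gd_res_stub is a placeholder, the real gd_res_t lives in dlm_internal.h.

#include <stdio.h>

#define BYTES_PER_WORD	sizeof(void *)
#define LARGE_RSB_NAME	28

struct gd_res_stub { long placeholder[6]; };	/* stand-in for gd_res_t */

static size_t round_to_word(size_t n)
{
	return (n + BYTES_PER_WORD - 1) & ~(BYTES_PER_WORD - 1);
}

int main(void)
{
	size_t raw = sizeof(struct gd_res_stub) + LARGE_RSB_NAME;

	printf("raw %zu -> rounded %zu (word = %zu bytes)\n",
	       raw, round_to_word(raw), BYTES_PER_WORD);
	return 0;
}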
7969diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
7970--- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 7971+++ linux-patched/cluster/dlm/memory.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 7972@@ -0,0 +1,32 @@
7973+/******************************************************************************
7974+*******************************************************************************
7975+**
7976+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7977+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7978+**
7979+** This copyrighted material is made available to anyone wishing to use,
7980+** modify, copy, or redistribute it subject to the terms and conditions
7981+** of the GNU General Public License v.2.
7982+**
7983+*******************************************************************************
7984+******************************************************************************/
7985+
7986+#ifndef __MEMORY_DOT_H__
7987+#define __MEMORY_DOT_H__
7988+
7989+int dlm_memory_init(void);
7990+void dlm_memory_exit(void);
7991+gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen);
7992+void free_rsb(gd_res_t * r);
7993+gd_lkb_t *allocate_lkb(gd_ls_t * ls);
7994+void free_lkb(gd_lkb_t * l);
7995+gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen);
7996+void free_resdata(gd_resdata_t * rd);
7997+char *allocate_lvb(gd_ls_t * ls);
7998+void free_lvb(char *l);
7999+gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls);
8000+void free_rcom_buffer(gd_rcom_t * rc);
8001+uint64_t *allocate_range(gd_ls_t * ls);
8002+void free_range(uint64_t * l);
8003+
8004+#endif /* __MEMORY_DOT_H__ */
8005diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8006--- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8007+++ linux-patched/cluster/dlm/midcomms.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 8008@@ -0,0 +1,351 @@
8009+/******************************************************************************
8010+*******************************************************************************
8011+**
8012+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8013+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8014+**
8015+** This copyrighted material is made available to anyone wishing to use,
8016+** modify, copy, or redistribute it subject to the terms and conditions
8017+** of the GNU General Public License v.2.
8018+**
8019+*******************************************************************************
8020+******************************************************************************/
8021+
8022+/*
8023+ * midcomms.c
8024+ *
8025+ * This is the appallingly named "mid-level" comms layer.
8026+ *
8027+ * Its purpose is to take buffers from the "real" comms layer,
8028+ * split them up into individual messages and pass them to the
8029+ * interested part of the locking mechanism.
8030+ *
8031+ * It also takes messages from the locking layer, formats them
8032+ * into packets and sends them to the comms layer.
8033+ *
8034+ * It knows the format of the mid-level messages and nodeids used,
8035+ * but it does not know how to resolve a nodeid into an IP address
8036+ * or any of the comms channel details.
8037+ *
8038+ */
8039+
8040+#include "dlm_internal.h"
8041+#include "lowcomms.h"
8042+#include "midcomms.h"
8043+#include "lockqueue.h"
8044+#include "nodes.h"
8045+#include "reccomms.h"
8046+#include "config.h"
8047+
8048+/* Byteorder routines */
8049+
8050+static void host_to_network(void *msg)
8051+{
8052+ struct gd_req_header *head = msg;
8053+ struct gd_remlockrequest *req = msg;
8054+ struct gd_remlockreply *reply = msg;
8055+ struct gd_remquery *query = msg;
8056+ struct gd_remqueryreply *queryrep = msg;
8057+ gd_rcom_t *rc = msg;
8058+
8059+ /* Force into network byte order */
8060+
8061+ /*
8062+ * Do the common header first
8063+ */
8064+
8065+ head->rh_length = cpu_to_le16(head->rh_length);
8066+ head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8067+ /* Leave the lkid alone as it is transparent at the remote end */
8068+
8069+ /*
8070+ * Do the fields in the remlockrequest or remlockreply structs
8071+ */
8072+
8073+ switch (req->rr_header.rh_cmd) {
8074+
8075+ case GDLM_REMCMD_LOCKREQUEST:
8076+ case GDLM_REMCMD_CONVREQUEST:
8077+ req->rr_range_start = cpu_to_le64(req->rr_range_start);
8078+ req->rr_range_end = cpu_to_le64(req->rr_range_end);
8079+ /* Deliberate fall through */
8080+ case GDLM_REMCMD_UNLOCKREQUEST:
8081+ case GDLM_REMCMD_LOOKUP:
8082+ case GDLM_REMCMD_LOCKGRANT:
8083+ case GDLM_REMCMD_SENDBAST:
8084+ case GDLM_REMCMD_SENDCAST:
8085+ case GDLM_REMCMD_REM_RESDATA:
8086+ req->rr_flags = cpu_to_le32(req->rr_flags);
8087+ req->rr_status = cpu_to_le32(req->rr_status);
8088+ break;
8089+
8090+ case GDLM_REMCMD_LOCKREPLY:
8091+ reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate);
8092+ reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid);
8093+ reply->rl_status = cpu_to_le32(reply->rl_status);
8094+ break;
8095+
8096+ case GDLM_REMCMD_RECOVERMESSAGE:
8097+ case GDLM_REMCMD_RECOVERREPLY:
8098+ rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8099+ rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8100+ break;
8101+
8102+ case GDLM_REMCMD_QUERY:
8103+ query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid);
8104+ query->rq_query = cpu_to_le32(query->rq_query);
8105+ query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks);
8106+ break;
8107+
8108+ case GDLM_REMCMD_QUERYREPLY:
8109+ queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks);
8110+ queryrep->rq_status = cpu_to_le32(queryrep->rq_status);
8111+ queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount);
8112+ queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount);
8113+ queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount);
8114+ break;
8115+
8116+ default:
8117+ printk("dlm: warning, unknown REMCMD type %u\n",
8118+ req->rr_header.rh_cmd);
8119+ }
8120+}
8121+
8122+static void network_to_host(void *msg)
8123+{
8124+ struct gd_req_header *head = msg;
8125+ struct gd_remlockrequest *req = msg;
8126+ struct gd_remlockreply *reply = msg;
8127+ struct gd_remquery *query = msg;
8128+ struct gd_remqueryreply *queryrep = msg;
8129+ gd_rcom_t *rc = msg;
8130+
8131+ /* Force into host byte order */
8132+
8133+ /*
8134+ * Do the common header first
8135+ */
8136+
8137+ head->rh_length = le16_to_cpu(head->rh_length);
8138+ head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8139+ /* Leave the lkid alone as it is transparent at the remote end */
8140+
8141+ /*
8142+ * Do the fields in the remlockrequest or remlockreply structs
8143+ */
8144+
8145+ switch (req->rr_header.rh_cmd) {
8146+
8147+ case GDLM_REMCMD_LOCKREQUEST:
8148+ case GDLM_REMCMD_CONVREQUEST:
8149+ req->rr_range_start = le64_to_cpu(req->rr_range_start);
8150+ req->rr_range_end = le64_to_cpu(req->rr_range_end);
8151+ case GDLM_REMCMD_LOOKUP:
8152+ case GDLM_REMCMD_UNLOCKREQUEST:
8153+ case GDLM_REMCMD_LOCKGRANT:
8154+ case GDLM_REMCMD_SENDBAST:
8155+ case GDLM_REMCMD_SENDCAST:
8156+ case GDLM_REMCMD_REM_RESDATA:
8157+ /* Actually, not much to do here as the remote lock IDs are
8158+ * transparent too */
8159+ req->rr_flags = le32_to_cpu(req->rr_flags);
8160+ req->rr_status = le32_to_cpu(req->rr_status);
8161+ break;
8162+
8163+ case GDLM_REMCMD_LOCKREPLY:
8164+ reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate);
8165+ reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid);
8166+ reply->rl_status = le32_to_cpu(reply->rl_status);
8167+ break;
8168+
8169+ case GDLM_REMCMD_RECOVERMESSAGE:
8170+ case GDLM_REMCMD_RECOVERREPLY:
8171+ rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8172+ rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8173+ break;
8174+
8175+
8176+ case GDLM_REMCMD_QUERY:
8177+ query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid);
8178+ query->rq_query = le32_to_cpu(query->rq_query);
8179+ query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks);
8180+ break;
8181+
8182+ case GDLM_REMCMD_QUERYREPLY:
8183+ queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks);
8184+ queryrep->rq_status = le32_to_cpu(queryrep->rq_status);
8185+ queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount);
8186+ queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount);
8187+ queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount);
8188+ break;
8189+
8190+ default:
8191+ printk("dlm: warning, unknown REMCMD type %u\n",
8192+ req->rr_header.rh_cmd);
8193+ }
8194+}
8195+
8196+static void copy_from_cb(void *dst, const void *base, unsigned offset,
8197+ unsigned len, unsigned limit)
8198+{
8199+ unsigned copy = len;
8200+
8201+ if ((copy + offset) > limit)
8202+ copy = limit - offset;
8203+ memcpy(dst, base + offset, copy);
8204+ len -= copy;
8205+ if (len)
8206+ memcpy(dst + copy, base, len);
8207+}
8208+
8209+static void khexdump(const unsigned char *c, int len)
8210+{
8211+ while (len > 16) {
8212+ printk(KERN_INFO
8213+ "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8214+ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8215+ c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8216+ len -= 16;
8217+ }
8218+ while (len > 4) {
8219+ printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8220+ c[3]);
8221+ len -= 4;
8222+ }
8223+ while (len > 0) {
8224+ printk(KERN_INFO "%02x\n", c[0]);
8225+ len--;
8226+ }
8227+}
8228+
8229+/*
8230+ * Called from the low-level comms layer to process a buffer of
8231+ * commands.
8232+ *
8233+ * Only complete messages are processed here; any "spare" bytes from
8234+ * the end of a buffer are saved and tacked onto the front of the next
8235+ * message that comes in. I doubt this will happen very often but we
8236+ * need to be able to cope with it and I don't want the task to be waiting
8237+ * for packets to come in when there is useful work to be done.
8238+ *
8239+ */
8240+int midcomms_process_incoming_buffer(int nodeid, const void *base,
8241+ unsigned offset, unsigned len,
8242+ unsigned limit)
8243+{
8244+ unsigned char __tmp[sizeof(struct gd_req_header) + 64];
8245+ struct gd_req_header *msg = (struct gd_req_header *) __tmp;
8246+ int ret = 0;
8247+ int err = 0;
8248+ unsigned msglen;
8249+ __u32 id, space;
8250+
8251+ while (len > sizeof(struct gd_req_header)) {
8252+ /* Get message header and check it over */
8253+ copy_from_cb(msg, base, offset, sizeof(struct gd_req_header),
8254+ limit);
8255+ msglen = le16_to_cpu(msg->rh_length);
8256+ id = msg->rh_lkid;
8257+ space = msg->rh_lockspace;
8258+
8259+ /* Check message size */
8260+ err = -EINVAL;
8261+ if (msglen < sizeof(struct gd_req_header))
8262+ break;
8263+ err = -E2BIG;
8264+ if (msglen > dlm_config.buffer_size) {
8265+ printk("dlm: message size too big %d\n", msglen);
8266+ break;
8267+ }
8268+ err = 0;
8269+
8270+ /* Not enough in buffer yet? wait for some more */
8271+ if (msglen > len)
8272+ break;
8273+
8274+ /* Make sure our temp buffer is large enough */
8275+ if (msglen > sizeof(__tmp) &&
8276+ msg == (struct gd_req_header *) __tmp) {
8277+ msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8278+ if (msg == NULL)
8279+ return ret;
8280+ }
8281+
8282+ copy_from_cb(msg, base, offset, msglen, limit);
8283+ BUG_ON(id != msg->rh_lkid);
8284+ BUG_ON(space != msg->rh_lockspace);
8285+ ret += msglen;
8286+ offset += msglen;
8287+ offset &= (limit - 1);
8288+ len -= msglen;
8289+ network_to_host(msg);
8290+
8291+ if ((msg->rh_cmd > 32) ||
8292+ (msg->rh_cmd == 0) ||
8293+ (msg->rh_length < sizeof(struct gd_req_header)) ||
8294+ (msg->rh_length > dlm_config.buffer_size)) {
8295+
8296+ printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8297+ "lkid=%u, lockspace=%u\n",
8298+ msg->rh_cmd, msg->rh_flags, msg->rh_length,
8299+ msg->rh_lkid, msg->rh_lockspace);
8300+
8301+ printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8302+ "ret=%u, limit=%08x newbuf=%d\n",
8303+ base, offset, len, ret, limit,
8304+ ((struct gd_req_header *) __tmp == msg));
8305+
8306+ khexdump((const unsigned char *) msg, msg->rh_length);
8307+
8308+ return -EBADMSG;
8309+ }
8310+
8311+ switch (msg->rh_cmd) {
8312+ case GDLM_REMCMD_RECOVERMESSAGE:
8313+ case GDLM_REMCMD_RECOVERREPLY:
8314+ process_recovery_comm(nodeid, msg);
8315+ break;
8316+ default:
8317+ process_cluster_request(nodeid, msg, FALSE);
8318+ }
8319+ }
8320+
8321+ if (msg != (struct gd_req_header *) __tmp)
8322+ kfree(msg);
8323+
8324+ return err ? err : ret;
8325+}
8326+
8327+/*
8328+ * Send a lowcomms buffer
8329+ */
8330+
8331+void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e)
8332+{
8333+ host_to_network(msg);
8334+ lowcomms_commit_buffer(e);
8335+}
8336+
8337+/*
8338+ * Make the message into network byte order and send it
8339+ */
8340+
8341+int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg,
8342+ int allocation)
8343+{
8344+ int len = msg->rh_length;
8345+
8346+ host_to_network(msg);
8347+
8348+ /*
8349+ * Loopback. In fact, the locking code pretty much prevents this from
8350+ * being needed but it can happen when the directory node is also the
8351+ * local node.
8352+ */
8353+
8354+ if (nodeid == our_nodeid())
8355+ return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
8356+ len, len);
8357+
8358+ return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
8359+}
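copy_from_cb() above reassembles a message from a circular receive buffer: limit is the size of the ring, so a message that straddles the end is copied in two pieces, first the tail of the ring and then the bytes that wrapped to the start. The same routine can be exercised stand-alone; the sketch below is user-space only and the 8-byte ring is purely illustrative.

#include <stdio.h>
#include <string.h>

static void copy_from_cb(void *dst, const void *base, unsigned offset,
			 unsigned len, unsigned limit)
{
	unsigned copy = len;

	if ((copy + offset) > limit)
		copy = limit - offset;
	memcpy(dst, (const char *) base + offset, copy);
	len -= copy;
	if (len)
		memcpy((char *) dst + copy, base, len);
}

int main(void)
{
	/* 8-byte ring; the message "ABCDEFGH" wrapped around the end */
	char ring[8] = { 'E', 'F', 'G', 'H', 'A', 'B', 'C', 'D' };
	char msg[9] = { 0 };

	copy_from_cb(msg, ring, 4, 8, sizeof(ring));
	printf("%s\n", msg);	/* prints ABCDEFGH */
	return 0;
}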
8360diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
8361--- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8362+++ linux-patched/cluster/dlm/midcomms.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 8363@@ -0,0 +1,24 @@
8364+/******************************************************************************
8365+*******************************************************************************
8366+**
8367+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8368+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8369+**
8370+** This copyrighted material is made available to anyone wishing to use,
8371+** modify, copy, or redistribute it subject to the terms and conditions
8372+** of the GNU General Public License v.2.
8373+**
8374+*******************************************************************************
8375+******************************************************************************/
8376+
8377+#ifndef __MIDCOMMS_DOT_H__
8378+#define __MIDCOMMS_DOT_H__
8379+
8380+int midcomms_send_message(uint32_t csid, struct gd_req_header *msg,
8381+ int allocation);
8382+int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
8383+ unsigned len, unsigned limit);
8384+void midcomms_send_buffer(struct gd_req_header *msg,
8385+ struct writequeue_entry *e);
8386+
8387+#endif /* __MIDCOMMS_DOT_H__ */
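One detail worth noting about the midcomms conversion routines: despite the host_to_network()/network_to_host() names, the on-wire format is little-endian (cpu_to_le*/le*_to_cpu), not traditional network byte order. A user-space sketch of the same convention for the two common header fields is shown below; the wire_header struct is an illustrative stand-in for gd_req_header, and glibc's <endian.h> helpers are assumed.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct wire_header {
	uint16_t rh_length;
	uint32_t rh_lockspace;
};

static void header_to_wire(struct wire_header *h)
{
	h->rh_length = htole16(h->rh_length);
	h->rh_lockspace = htole32(h->rh_lockspace);
}

static void header_from_wire(struct wire_header *h)
{
	h->rh_length = le16toh(h->rh_length);
	h->rh_lockspace = le32toh(h->rh_lockspace);
}

int main(void)
{
	struct wire_header h = { .rh_length = 24, .rh_lockspace = 0x1234 };

	header_to_wire(&h);	/* as it would be written to the socket */
	header_from_wire(&h);	/* as the receiver undoes it */
	printf("length=%u lockspace=0x%x\n", h.rh_length, h.rh_lockspace);
	return 0;
}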
8388diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
8389--- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8390+++ linux-patched/cluster/dlm/nodes.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 8391@@ -0,0 +1,325 @@
8392+/******************************************************************************
8393+*******************************************************************************
8394+**
8395+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8396+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8397+**
8398+** This copyrighted material is made available to anyone wishing to use,
8399+** modify, copy, or redistribute it subject to the terms and conditions
8400+** of the GNU General Public License v.2.
8401+**
8402+*******************************************************************************
8403+******************************************************************************/
8404+
8405+#include <net/sock.h>
8406+#include <cluster/cnxman.h>
8407+
8408+#include "dlm_internal.h"
8409+#include "lowcomms.h"
8410+#include "nodes.h"
8411+#include "recover.h"
8412+#include "reccomms.h"
8413+#include "util.h"
8414+
8415+static struct list_head cluster_nodes;
8416+static spinlock_t node_lock;
8417+static uint32_t local_nodeid;
8418+static struct semaphore local_init_lock;
8419+
8420+
8421+void dlm_nodes_init(void)
8422+{
8423+ INIT_LIST_HEAD(&cluster_nodes);
8424+ spin_lock_init(&node_lock);
8425+ local_nodeid = 0;
8426+ init_MUTEX(&local_init_lock);
8427+}
8428+
8429+static gd_node_t *search_node(uint32_t nodeid)
8430+{
8431+ gd_node_t *node;
8432+
8433+ list_for_each_entry(node, &cluster_nodes, gn_list) {
8434+ if (node->gn_nodeid == nodeid)
8435+ goto out;
8436+ }
8437+ node = NULL;
8438+ out:
8439+ return node;
8440+}
8441+
8442+static void put_node(gd_node_t *node)
8443+{
8444+ spin_lock(&node_lock);
8445+ node->gn_refcount--;
8446+ if (node->gn_refcount == 0) {
8447+ list_del(&node->gn_list);
8448+ spin_unlock(&node_lock);
8449+ kfree(node);
8450+ return;
8451+ }
8452+ spin_unlock(&node_lock);
8453+}
8454+
8455+static int get_node(uint32_t nodeid, gd_node_t **ndp)
8456+{
8457+ gd_node_t *node, *node2;
8458+ int error = -ENOMEM;
8459+
8460+ spin_lock(&node_lock);
8461+ node = search_node(nodeid);
8462+ if (node)
8463+ node->gn_refcount++;
8464+ spin_unlock(&node_lock);
8465+
8466+ if (node)
8467+ goto out;
8468+
8469+ node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL);
8470+ if (!node)
8471+ goto fail;
8472+
8473+ memset(node, 0, sizeof(gd_node_t));
8474+ node->gn_nodeid = nodeid;
8475+
8476+ spin_lock(&node_lock);
8477+ node2 = search_node(nodeid);
8478+ if (node2) {
8479+ node2->gn_refcount++;
8480+ spin_unlock(&node_lock);
8481+ kfree(node);
8482+ node = node2;
8483+ goto out;
8484+ }
8485+
8486+ node->gn_refcount = 1;
8487+ list_add_tail(&node->gn_list, &cluster_nodes);
8488+ spin_unlock(&node_lock);
8489+
8490+ out:
8491+ *ndp = node;
8492+ return 0;
8493+
8494+ fail:
8495+ return error;
8496+}
8497+
8498+int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb)
8499+{
8500+ gd_csb_t *csb;
8501+ gd_node_t *node;
8502+ int error = -ENOMEM;
8503+
8504+ csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL);
8505+ if (!csb)
8506+ goto fail;
8507+
8508+ memset(csb, 0, sizeof(gd_csb_t));
8509+
8510+ error = get_node(nodeid, &node);
8511+ if (error)
8512+ goto fail_free;
8513+
8514+ csb->csb_node = node;
8515+
8516+ down(&local_init_lock);
8517+
8518+ if (!local_nodeid) {
8519+ if (nodeid == our_nodeid()) {
8520+ local_nodeid = node->gn_nodeid;
8521+ }
8522+ }
8523+ up(&local_init_lock);
8524+
8525+ *ret_csb = csb;
8526+ return 0;
8527+
8528+ fail_free:
8529+ kfree(csb);
8530+ fail:
8531+ return error;
8532+}
8533+
8534+void release_csb(gd_csb_t *csb)
8535+{
8536+ put_node(csb->csb_node);
8537+ kfree(csb);
8538+}
8539+
8540+uint32_t our_nodeid(void)
8541+{
8542+ return lowcomms_our_nodeid();
8543+}
8544+
8545+int nodes_reconfig_wait(gd_ls_t *ls)
8546+{
8547+ int error;
8548+
8549+ if (ls->ls_low_nodeid == our_nodeid()) {
8550+ error = gdlm_wait_status_all(ls, NODES_VALID);
8551+ if (!error)
8552+ set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
8553+
8554+ /* Experimental: this delay should allow any final messages
8555+ * from the previous node to be received before beginning
8556+ * recovery. */
8557+
8558+ if (ls->ls_num_nodes == 1) {
8559+ current->state = TASK_UNINTERRUPTIBLE;
8560+ schedule_timeout((2) * HZ);
8561+ }
8562+
8563+ } else
8564+ error = gdlm_wait_status_low(ls, NODES_ALL_VALID);
8565+
8566+ return error;
8567+}
8568+
8569+static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new)
8570+{
8571+ gd_csb_t *csb = NULL;
8572+ struct list_head *tmp;
8573+ struct list_head *newlist = &new->csb_list;
8574+ struct list_head *head = &ls->ls_nodes;
8575+
8576+ list_for_each(tmp, head) {
8577+ csb = list_entry(tmp, gd_csb_t, csb_list);
8578+
8579+ if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid)
8580+ break;
8581+ }
8582+
8583+ if (!csb)
8584+ list_add_tail(newlist, head);
8585+ else {
8586+ /* FIXME: can use list macro here */
8587+ newlist->prev = tmp->prev;
8588+ newlist->next = tmp;
8589+ tmp->prev->next = newlist;
8590+ tmp->prev = newlist;
8591+ }
8592+}
8593+
8594+int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out)
8595+{
8596+ gd_csb_t *csb, *safe;
8597+ int error, i, found, pos = 0, neg = 0;
8598+ uint32_t low = (uint32_t) (-1);
8599+
8600+ /*
8601+ * Remove (and save) departed nodes from lockspace's nodes list
8602+ */
8603+
8604+ list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) {
8605+ found = FALSE;
8606+ for (i = 0; i < gr->gr_node_count; i++) {
8607+ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8608+ found = TRUE;
8609+ break;
8610+ }
8611+ }
8612+
8613+ if (!found) {
8614+ neg++;
8615+ csb->csb_gone_event = gr->gr_event_id;
8616+ list_del(&csb->csb_list);
8617+ list_add_tail(&csb->csb_list, &ls->ls_nodes_gone);
8618+ ls->ls_num_nodes--;
8619+ log_all(ls, "remove node %u", csb->csb_node->gn_nodeid);
8620+ }
8621+ }
8622+
8623+ /*
8624+ * Add new nodes to lockspace's nodes list
8625+ */
8626+
8627+ for (i = 0; i < gr->gr_node_count; i++) {
8628+ found = FALSE;
8629+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8630+ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
8631+ found = TRUE;
8632+ break;
8633+ }
8634+ }
8635+
8636+ if (!found) {
8637+ pos++;
8638+
8639+ error = init_new_csb(gr->gr_nodeids[i], &csb);
8640+ GDLM_ASSERT(!error,);
8641+
8642+ add_ordered_node(ls, csb);
8643+ ls->ls_num_nodes++;
8644+ log_all(ls, "add node %u", csb->csb_node->gn_nodeid);
8645+ }
8646+ }
8647+
8648+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
8649+ if (csb->csb_node->gn_nodeid < low)
8650+ low = csb->csb_node->gn_nodeid;
8651+ }
8652+
8653+ rcom_log_clear(ls);
8654+ ls->ls_low_nodeid = low;
8655+ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8656+ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8657+ *neg_out = neg;
8658+
8659+ error = nodes_reconfig_wait(ls);
8660+
8661+ log_all(ls, "total nodes %d", ls->ls_num_nodes);
8662+
8663+ return error;
8664+}
8665+
8666+int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr)
8667+{
8668+ gd_csb_t *csb;
8669+ int i, error;
8670+ uint32_t low = (uint32_t) (-1);
8671+
8672+ log_all(ls, "add nodes");
8673+
8674+ for (i = 0; i < gr->gr_node_count; i++) {
8675+ error = init_new_csb(gr->gr_nodeids[i], &csb);
8676+ if (error)
8677+ goto fail;
8678+
8679+ add_ordered_node(ls, csb);
8680+ ls->ls_num_nodes++;
8681+
8682+ if (csb->csb_node->gn_nodeid < low)
8683+ low = csb->csb_node->gn_nodeid;
8684+ }
8685+
8686+ ls->ls_low_nodeid = low;
8687+ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
8688+ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8689+
8690+ error = nodes_reconfig_wait(ls);
8691+
8692+ log_all(ls, "total nodes %d", ls->ls_num_nodes);
8693+
8694+ return error;
8695+
8696+ fail:
8697+ while (!list_empty(&ls->ls_nodes)) {
8698+ csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list);
8699+ list_del(&csb->csb_list);
8700+ release_csb(csb);
8701+ }
8702+ ls->ls_num_nodes = 0;
8703+
8704+ return error;
8705+}
8706+
8707+int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid)
8708+{
8709+ gd_csb_t *csb;
8710+
8711+ list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) {
8712+ if (csb->csb_node->gn_nodeid == nodeid)
8713+ return TRUE;
8714+ }
8715+ return FALSE;
8716+}
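get_node() above follows a lookup/allocate/recheck pattern: search under the spinlock, drop the lock to allocate (kmalloc with GFP_KERNEL may sleep), then search again and throw the new node away if another caller inserted one in the meantime. Here is a user-space sketch of the same pattern with a pthread mutex standing in for node_lock; the types and names are illustrative, not from the patch.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	unsigned int nodeid;
	int refcount;
};

static struct node *nodes;
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *search_node(unsigned int nodeid)
{
	struct node *n;

	for (n = nodes; n; n = n->next)
		if (n->nodeid == nodeid)
			return n;
	return NULL;
}

static struct node *get_node(unsigned int nodeid)
{
	struct node *n, *n2;

	pthread_mutex_lock(&node_lock);
	n = search_node(nodeid);
	if (n)
		n->refcount++;
	pthread_mutex_unlock(&node_lock);
	if (n)
		return n;

	/* Allocate outside the lock, then recheck for a racing insert */
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;
	n->nodeid = nodeid;

	pthread_mutex_lock(&node_lock);
	n2 = search_node(nodeid);
	if (n2) {
		n2->refcount++;
		pthread_mutex_unlock(&node_lock);
		free(n);	/* lost the race; use the existing node */
		return n2;
	}
	n->refcount = 1;
	n->next = nodes;
	nodes = n;
	pthread_mutex_unlock(&node_lock);
	return n;
}

int main(void)
{
	struct node *a = get_node(3);
	struct node *b = get_node(3);	/* same node, refcount bumped */

	printf("same=%d refcount=%d\n", a == b, a->refcount);
	return 0;
}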
8717diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
8718--- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8719+++ linux-patched/cluster/dlm/nodes.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 8720@@ -0,0 +1,25 @@
8721+/******************************************************************************
8722+*******************************************************************************
8723+**
8724+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8725+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8726+**
8727+** This copyrighted material is made available to anyone wishing to use,
8728+** modify, copy, or redistribute it subject to the terms and conditions
8729+** of the GNU General Public License v.2.
8730+**
8731+*******************************************************************************
8732+******************************************************************************/
8733+
8734+#ifndef __NODES_DOT_H__
8735+#define __NODES_DOT_H__
8736+
8737+void dlm_nodes_init(void);
8738+int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb);
8739+void release_csb(gd_csb_t * csb);
8740+uint32_t our_nodeid(void);
8741+int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg);
8742+int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr);
8743+int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid);
8744+
8745+#endif /* __NODES_DOT_H__ */
8746diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
8747--- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8748+++ linux-patched/cluster/dlm/proc.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 8749@@ -0,0 +1,469 @@
8750+/******************************************************************************
8751+*******************************************************************************
8752+**
8753+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8754+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8755+**
8756+** This copyrighted material is made available to anyone wishing to use,
8757+** modify, copy, or redistribute it subject to the terms and conditions
8758+** of the GNU General Public License v.2.
8759+**
8760+*******************************************************************************
8761+******************************************************************************/
8762+
8763+#include <linux/init.h>
8764+#include <linux/proc_fs.h>
8765+#include <linux/ctype.h>
8766+#include <linux/seq_file.h>
8767+#include <linux/module.h>
8768+
8769+#include "dlm_internal.h"
8770+#include "lockspace.h"
8771+
8772+#if defined(DLM_DEBUG)
8773+#define DLM_DEBUG_SIZE (1024)
8774+#define MAX_DEBUG_MSG_LEN (64)
8775+#else
8776+#define DLM_DEBUG_SIZE (0)
8777+#define MAX_DEBUG_MSG_LEN (0)
8778+#endif
8779+
8780+static char * debug_buf;
8781+static unsigned int debug_size;
8782+static unsigned int debug_point;
8783+static int debug_wrap;
8784+static spinlock_t debug_lock;
8785+static struct proc_dir_entry * debug_proc_entry = NULL;
8786+static struct proc_dir_entry * rcom_proc_entry = NULL;
8787+static char proc_ls_name[255] = "";
8788+
8789+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
8790+static struct proc_dir_entry * locks_proc_entry = NULL;
8791+static struct seq_operations locks_info_op;
8792+
8793+
8794+static int locks_open(struct inode *inode, struct file *file)
8795+{
8796+ return seq_open(file, &locks_info_op);
8797+}
8798+
8799+/* Write simply sets the lockspace to use */
8800+static ssize_t locks_write(struct file *file, const char *buf,
8801+ size_t count, loff_t * ppos)
8802+{
8803+ if (count < sizeof(proc_ls_name)) {
8804+ copy_from_user(proc_ls_name, buf, count);
8805+ proc_ls_name[count] = '\0';
8806+
8807+ /* Remove any trailing LF so that lazy users
8808+ can just echo "lsname" > /proc/cluster/dlm_locks */
8809+ if (proc_ls_name[count - 1] == '\n')
8810+ proc_ls_name[count - 1] = '\0';
8811+
8812+ return count;
8813+ }
8814+ return 0;
8815+}
8816+
8817+static struct file_operations locks_fops = {
8818+ open:locks_open,
8819+ write:locks_write,
8820+ read:seq_read,
8821+ llseek:seq_lseek,
8822+ release:seq_release,
8823+};
8824+
8825+struct ls_dumpinfo {
8826+ int entry;
8827+ struct list_head *next;
8828+ gd_ls_t *ls;
8829+ gd_res_t *rsb;
8830+};
8831+
8832+static int print_resource(gd_res_t * res, struct seq_file *s);
8833+
8834+static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
8835+{
8836+ read_lock(&di->ls->ls_reshash_lock);
8837+ if (!di->next) {
8838+ /* Find the next non-empty hash bucket */
8839+ while (di->entry < di->ls->ls_hashsize &&
8840+ list_empty(&di->ls->ls_reshashtbl[di->entry])) {
8841+ di->entry++;
8842+ }
8843+ if (di->entry >= di->ls->ls_hashsize) {
8844+ read_unlock(&di->ls->ls_reshash_lock);
8845+ return NULL; /* End of hash list */
8846+ }
8847+
8848+ di->next = di->ls->ls_reshashtbl[di->entry].next;
8849+ } else { /* Find the next entry in the list */
8850+
8851+ di->next = di->next->next;
8852+ if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) {
8853+ /* End of list - move to next bucket */
8854+ di->next = NULL;
8855+ di->entry++;
8856+ read_unlock(&di->ls->ls_reshash_lock);
8857+
8858+ return next_rsb(di); /* do the top half of this conditional */
8859+ }
8860+ }
8861+ di->rsb = list_entry(di->next, gd_res_t, res_hashchain);
8862+ read_unlock(&di->ls->ls_reshash_lock);
8863+
8864+ return di;
8865+}
8866+
8867+static void *s_start(struct seq_file *m, loff_t * pos)
8868+{
8869+ struct ls_dumpinfo *di;
8870+ gd_ls_t *ls;
8871+ int i;
8872+
8873+ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
8874+ if (!ls)
8875+ return NULL;
8876+
8877+ di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
8878+ if (!di)
8879+ return NULL;
8880+
8881+ if (*pos == 0)
8882+ seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
8883+
8884+ di->entry = 0;
8885+ di->next = NULL;
8886+ di->ls = ls;
8887+
8888+ for (i = 0; i < *pos; i++)
8889+ if (next_rsb(di) == NULL)
8890+ return NULL;
8891+
8892+ return next_rsb(di);
8893+}
8894+
8895+static void *s_next(struct seq_file *m, void *p, loff_t * pos)
8896+{
8897+ struct ls_dumpinfo *di = p;
8898+
8899+ *pos += 1;
8900+
8901+ return next_rsb(di);
8902+}
8903+
8904+static int s_show(struct seq_file *m, void *p)
8905+{
8906+ struct ls_dumpinfo *di = p;
8907+ return print_resource(di->rsb, m);
8908+}
8909+
8910+static void s_stop(struct seq_file *m, void *p)
8911+{
8912+ kfree(p);
8913+}
8914+
8915+static struct seq_operations locks_info_op = {
8916+ start:s_start,
8917+ next:s_next,
8918+ stop:s_stop,
8919+ show:s_show
8920+};
8921+
8922+static char *print_lockmode(int mode)
8923+{
8924+ switch (mode) {
8925+ case DLM_LOCK_IV:
8926+ return "--";
8927+ case DLM_LOCK_NL:
8928+ return "NL";
8929+ case DLM_LOCK_CR:
8930+ return "CR";
8931+ case DLM_LOCK_CW:
8932+ return "CW";
8933+ case DLM_LOCK_PR:
8934+ return "PR";
8935+ case DLM_LOCK_PW:
8936+ return "PW";
8937+ case DLM_LOCK_EX:
8938+ return "EX";
8939+ default:
8940+ return "??";
8941+ }
8942+}
8943+
8944+static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res)
8945+{
8946+
8947+ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
8948+
8949+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8950+ || lkb->lkb_status == GDLM_LKSTS_WAITING)
8951+ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
8952+
8953+ if (lkb->lkb_range) {
8954+ /* This warns on Alpha. Tough. Only I see it */
8955+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8956+ || lkb->lkb_status == GDLM_LKSTS_GRANTED)
8957+ seq_printf(s, " %" PRIx64 "-%" PRIx64,
8958+ lkb->lkb_range[GR_RANGE_START],
8959+ lkb->lkb_range[GR_RANGE_END]);
8960+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
8961+ || lkb->lkb_status == GDLM_LKSTS_WAITING)
8962+ seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
8963+ lkb->lkb_range[RQ_RANGE_START],
8964+ lkb->lkb_range[RQ_RANGE_END]);
8965+ }
8966+
8967+ if (lkb->lkb_nodeid) {
8968+ if (lkb->lkb_nodeid != res->res_nodeid)
8969+ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
8970+ lkb->lkb_remid);
8971+ else
8972+ seq_printf(s, " Master: %08x", lkb->lkb_remid);
8973+ }
8974+
8975+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
8976+ seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
8977+
8978+ seq_printf(s, "\n");
8979+}
8980+
8981+static int print_resource(gd_res_t *res, struct seq_file *s)
8982+{
8983+ int i;
8984+ struct list_head *locklist;
8985+
8986+ seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
8987+ res->res_parent, res->res_length);
8988+ for (i = 0; i < res->res_length; i++) {
8989+ if (isprint(res->res_name[i]))
8990+ seq_printf(s, "%c", res->res_name[i]);
8991+ else
8992+ seq_printf(s, "%c", '.');
8993+ }
8994+ if (res->res_nodeid)
8995+ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
8996+ res->res_nodeid);
8997+ else
8998+ seq_printf(s, "\" \nMaster Copy\n");
8999+
9000+ /* Print the LVB: */
9001+ if (res->res_lvbptr) {
9002+ seq_printf(s, "LVB: ");
9003+ for (i = 0; i < DLM_LVB_LEN; i++) {
9004+ if (i == DLM_LVB_LEN / 2)
9005+ seq_printf(s, "\n ");
9006+ seq_printf(s, "%02x ",
9007+ (unsigned char) res->res_lvbptr[i]);
9008+ }
9009+ seq_printf(s, "\n");
9010+ }
9011+
9012+ /* Print the locks attached to this resource */
9013+ seq_printf(s, "Granted Queue\n");
9014+ list_for_each(locklist, &res->res_grantqueue) {
9015+ gd_lkb_t *this_lkb =
9016+ list_entry(locklist, gd_lkb_t, lkb_statequeue);
9017+ print_lock(s, this_lkb, res);
9018+ }
9019+
9020+ seq_printf(s, "Conversion Queue\n");
9021+ list_for_each(locklist, &res->res_convertqueue) {
9022+ gd_lkb_t *this_lkb =
9023+ list_entry(locklist, gd_lkb_t, lkb_statequeue);
9024+ print_lock(s, this_lkb, res);
9025+ }
9026+
9027+ seq_printf(s, "Waiting Queue\n");
9028+ list_for_each(locklist, &res->res_waitqueue) {
9029+ gd_lkb_t *this_lkb =
9030+ list_entry(locklist, gd_lkb_t, lkb_statequeue);
9031+ print_lock(s, this_lkb, res);
9032+ }
9033+ return 0;
9034+}
9035+#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9036+
9037+void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...)
9038+{
9039+ va_list va;
9040+ int i, n, size, len;
9041+ char buf[MAX_DEBUG_MSG_LEN+1];
9042+
9043+ spin_lock(&debug_lock);
9044+
9045+ if (!debug_buf)
9046+ goto out;
9047+
9048+ size = MAX_DEBUG_MSG_LEN;
9049+ memset(buf, 0, size+1);
9050+
9051+ n = snprintf(buf, size, "%s ", ls->ls_name);
9052+ size -= n;
9053+
9054+ va_start(va, fmt);
9055+ vsnprintf(buf+n, size, fmt, va);
9056+ va_end(va);
9057+
9058+ len = strlen(buf);
9059+ if (len > MAX_DEBUG_MSG_LEN-1)
9060+ len = MAX_DEBUG_MSG_LEN-1;
9061+ buf[len] = '\n';
9062+ buf[len+1] = '\0';
9063+
9064+ for (i = 0; i < strlen(buf); i++) {
9065+ debug_buf[debug_point++] = buf[i];
9066+
9067+ if (debug_point == debug_size) {
9068+ debug_point = 0;
9069+ debug_wrap = 1;
9070+ }
9071+ }
9072+ out:
9073+ spin_unlock(&debug_lock);
9074+}
9075+
9076+void dlm_debug_dump(void)
9077+{
9078+ int i;
9079+
9080+ spin_lock(&debug_lock);
9081+ if (debug_wrap) {
9082+ for (i = debug_point; i < debug_size; i++)
9083+ printk("%c", debug_buf[i]);
9084+ }
9085+ for (i = 0; i < debug_point; i++)
9086+ printk("%c", debug_buf[i]);
9087+ spin_unlock(&debug_lock);
9088+}
9089+
9090+void dlm_debug_setup(int size)
9091+{
9092+ char *b = NULL;
9093+
9094+ if (size > PAGE_SIZE)
9095+ size = PAGE_SIZE;
9096+ if (size)
9097+ b = kmalloc(size, GFP_KERNEL);
9098+
9099+ spin_lock(&debug_lock);
9100+ if (debug_buf)
9101+ kfree(debug_buf);
9102+ if (!size || !b)
9103+ goto out;
9104+ debug_size = size;
9105+ debug_point = 0;
9106+ debug_wrap = 0;
9107+ debug_buf = b;
9108+ memset(debug_buf, 0, debug_size);
9109+ out:
9110+ spin_unlock(&debug_lock);
9111+}
9112+
9113+static void dlm_debug_init(void)
9114+{
9115+ debug_buf = NULL;
9116+ debug_size = 0;
9117+ debug_point = 0;
9118+ debug_wrap = 0;
9119+ spin_lock_init(&debug_lock);
9120+
9121+ dlm_debug_setup(DLM_DEBUG_SIZE);
9122+}
9123+
9124+#ifdef CONFIG_PROC_FS
9125+int dlm_debug_info(char *b, char **start, off_t offset, int length)
9126+{
9127+ int i, n = 0;
9128+
9129+ spin_lock(&debug_lock);
9130+
9131+ if (debug_wrap) {
9132+ for (i = debug_point; i < debug_size; i++)
9133+ n += sprintf(b + n, "%c", debug_buf[i]);
9134+ }
9135+ for (i = 0; i < debug_point; i++)
9136+ n += sprintf(b + n, "%c", debug_buf[i]);
9137+
9138+ spin_unlock(&debug_lock);
9139+
9140+ return n;
9141+}
9142+
9143+int dlm_rcom_info(char *b, char **start, off_t offset, int length)
9144+{
9145+ gd_ls_t *ls;
9146+ gd_csb_t *csb;
9147+ int n = 0;
9148+
9149+ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9150+ if (!ls)
9151+ return 0;
9152+
9153+ n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
9154+ "names_recv_count names_recv_msgid "
9155+ "locks_send_count locks_send_msgid "
9156+ "locks_recv_count locks_recv_msgid\n");
9157+
9158+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
9159+ n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
9160+ csb->csb_node->gn_nodeid,
9161+ csb->csb_names_send_count,
9162+ csb->csb_names_send_msgid,
9163+ csb->csb_names_recv_count,
9164+ csb->csb_names_recv_msgid,
9165+ csb->csb_locks_send_count,
9166+ csb->csb_locks_send_msgid,
9167+ csb->csb_locks_recv_count,
9168+ csb->csb_locks_recv_msgid);
9169+ }
9170+ return n;
9171+}
9172+#endif
9173+
9174+void dlm_proc_init(void)
9175+{
9176+#ifdef CONFIG_PROC_FS
9177+ debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
9178+ NULL);
9179+ if (!debug_proc_entry)
9180+ return;
9181+
9182+ debug_proc_entry->get_info = &dlm_debug_info;
9183+
9184+ rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
9185+ if (!rcom_proc_entry)
9186+ return;
9187+
9188+ rcom_proc_entry->get_info = &dlm_rcom_info;
9189+#endif
9190+ dlm_debug_init();
9191+
9192+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9193+ locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
9194+ S_IFREG | 0400,
9195+ NULL, NULL, NULL);
9196+ if (!locks_proc_entry)
9197+ return;
9198+ locks_proc_entry->proc_fops = &locks_fops;
9199+#endif
9200+}
9201+
9202+void dlm_proc_exit(void)
9203+{
9204+#ifdef CONFIG_PROC_FS
9205+ if (debug_proc_entry) {
9206+ remove_proc_entry("cluster/dlm_debug", NULL);
9207+ dlm_debug_setup(0);
9208+ }
9209+
9210+ if (rcom_proc_entry)
9211+ remove_proc_entry("cluster/dlm_rcom", NULL);
9212+#endif
9213+
9214+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9215+ if (locks_proc_entry)
9216+ remove_proc_entry("cluster/dlm_locks", NULL);
9217+#endif
9218+}
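dlm_debug_log() above appends into a fixed-size ring: debug_point is the write cursor and debug_wrap records that the buffer has cycled, so dlm_debug_dump() prints the oldest bytes (point to end) before the newest (start to point). A tiny user-space sketch of that ring follows; the 16-byte size and the function names are illustrative, the kernel buffer is up to a page.

#include <stdio.h>

#define RING_SIZE 16	/* illustrative; the kernel buffer is up to a page */

static char ring[RING_SIZE];
static unsigned int point;
static int wrapped;

static void ring_log(const char *s)
{
	while (*s) {
		ring[point++] = *s++;
		if (point == RING_SIZE) {
			point = 0;
			wrapped = 1;
		}
	}
}

static void ring_dump(void)
{
	unsigned int i;

	if (wrapped)
		for (i = point; i < RING_SIZE; i++)
			putchar(ring[i]);
	for (i = 0; i < point; i++)
		putchar(ring[i]);
	putchar('\n');
}

int main(void)
{
	ring_log("one ");
	ring_log("two ");
	ring_log("three ");
	ring_log("four ");
	ring_dump();	/* "one " has been overwritten by the wrap */
	return 0;
}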
9219diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
9220--- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9221+++ linux-patched/cluster/dlm/queries.c 2004-06-29 20:01:20.000000000 +0800
9222@@ -0,0 +1,696 @@
4bf12011 9223+/******************************************************************************
9224+*******************************************************************************
9225+**
9226+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9227+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9228+**
9229+** This copyrighted material is made available to anyone wishing to use,
9230+** modify, copy, or redistribute it subject to the terms and conditions
9231+** of the GNU General Public License v.2.
9232+**
9233+*******************************************************************************
9234+******************************************************************************/
9235+
9236+/*
9237+ * queries.c
9238+ *
9239+ * This file provides the kernel query interface to the DLM.
9240+ *
9241+ */
9242+
9243+#define EXPORT_SYMTAB
9244+#include <linux/module.h>
9245+
9246+#include "dlm_internal.h"
5cdbd17b 9247+#include "lockspace.h"
4bf12011 9248+#include "lockqueue.h"
9249+#include "locking.h"
9250+#include "lkb.h"
9251+#include "nodes.h"
9252+#include "dir.h"
9253+#include "ast.h"
9254+#include "memory.h"
9255+#include "lowcomms.h"
9256+#include "midcomms.h"
9257+#include "rsb.h"
9258+
9259+static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo);
9260+static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo);
9261+
9262+/*
9263+ * API entry point.
9264+ */
9265+int dlm_query(void *lockspace,
9266+ struct dlm_lksb *lksb,
9267+ int query,
9268+ struct dlm_queryinfo *qinfo,
9269+ void (ast_routine(void *)),
9270+ void *astarg)
9271+{
9272+ int status = -EINVAL;
9273+ gd_lkb_t *target_lkb;
9274+ gd_lkb_t *query_lkb = NULL; /* Our temporary LKB */
9275+ gd_ls_t *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace);
9276+
9277+
9278+ if (!qinfo)
9279+ goto out;
9280+ if (!ls)
9281+ goto out;
9282+ if (!ast_routine)
9283+ goto out;
9284+ if (!lksb)
9285+ goto out;
9286+
9287+ if (!qinfo->gqi_lockinfo)
9288+ qinfo->gqi_locksize = 0;
9289+
9290+ /* Find the lkid */
9291+ target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
9292+ if (!target_lkb)
9293+ goto out;
9294+
9295+ /* If the user wants a list of locks that are blocking or
9296+ not blocking this lock, then it must be waiting
9297+ for something
9298+ */
9299+ if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
9300+ (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
9301+ target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
9302+ return -EINVAL;
9303+
9304+ /* We now allocate an LKB for our own use (so we can hang
9305+ * things like the AST routine and the lksb from it) */
9306+ lksb->sb_status = -EBUSY;
9307+ query_lkb = create_lkb(ls);
9308+ if (!query_lkb) {
9309+ status = -ENOMEM;
9310+ goto out;
9311+ }
9312+ query_lkb->lkb_astaddr = ast_routine;
9313+ query_lkb->lkb_astparam = (long)astarg;
9314+ query_lkb->lkb_resource = target_lkb->lkb_resource;
9315+ query_lkb->lkb_lksb = lksb;
9316+
9317+ /* Don't free the resource while we are querying it. This ref
9318+ * will be dropped when the LKB is freed */
9319+ hold_rsb(query_lkb->lkb_resource);
9320+
9321+ /* Fill in the stuff that's always local */
9322+ if (qinfo->gqi_resinfo) {
9323+ if (target_lkb->lkb_resource->res_nodeid)
9324+ qinfo->gqi_resinfo->rsi_masternode =
9325+ target_lkb->lkb_resource->res_nodeid;
9326+ else
9327+ qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
9328+ qinfo->gqi_resinfo->rsi_length =
9329+ target_lkb->lkb_resource->res_length;
9330+ memcpy(qinfo->gqi_resinfo->rsi_name,
9331+ target_lkb->lkb_resource->res_name,
9332+ qinfo->gqi_resinfo->rsi_length);
9333+ }
9334+
9335+ /* If the master is local (or the user doesn't want the overhead of a
9336+ * remote call) - fill in the details here */
9337+ if (target_lkb->lkb_resource->res_nodeid == 0 ||
9338+ (query & DLM_QUERY_LOCAL)) {
9339+
9340+ status = 0;
9341+ /* Resource info */
9342+ if (qinfo->gqi_resinfo) {
9343+ query_resource(target_lkb->lkb_resource,
9344+ qinfo->gqi_resinfo);
9345+ }
9346+
9347+ /* Lock lists */
9348+ if (qinfo->gqi_lockinfo) {
9349+ status = query_locks(query, target_lkb, qinfo);
9350+ }
9351+
9352+ query_lkb->lkb_retstatus = status;
5cdbd17b 9353+ queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
4bf12011 9354+ wake_astd();
9355+
9356+ /* An AST will be delivered so we must return success here */
9357+ status = 0;
9358+ goto out;
9359+ }
9360+
9361+ /* Remote master */
9362+ if (target_lkb->lkb_resource->res_nodeid != 0)
9363+ {
9364+ struct gd_remquery *remquery;
9365+ struct writequeue_entry *e;
9366+
9367+ /* Clear this because the receiving end adds to it with
9368+ each incoming packet */
9369+ qinfo->gqi_lockcount = 0;
9370+
9371+ /* Squirrel a pointer to the query info struct
9372+ somewhere illegal */
9373+ query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo;
9374+
9375+ e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
9376+ sizeof(struct gd_remquery),
9377+ ls->ls_allocation,
9378+ (char **) &remquery);
9379+ if (!e) {
9380+ status = -ENOBUFS;
9381+ goto out;
9382+ }
9383+
9384+ /* Build remote packet */
9385+ memset(remquery, 0, sizeof(struct gd_remquery));
9386+
9387+ remquery->rq_maxlocks = qinfo->gqi_locksize;
9388+ remquery->rq_query = query;
9389+ remquery->rq_mstlkid = target_lkb->lkb_remid;
9390+ if (qinfo->gqi_lockinfo)
9391+ remquery->rq_maxlocks = qinfo->gqi_locksize;
9392+
9393+ remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
9394+ remquery->rq_header.rh_flags = 0;
9395+ remquery->rq_header.rh_length = sizeof(struct gd_remquery);
9396+ remquery->rq_header.rh_lkid = query_lkb->lkb_id;
9397+ remquery->rq_header.rh_lockspace = ls->ls_global_id;
9398+
9399+ midcomms_send_buffer(&remquery->rq_header, e);
9400+ status = 0;
9401+ }
9402+
9403+ out:
9404+
9405+ return status;
9406+}
9407+
9408+static inline int valid_range(struct dlm_range *r)
9409+{
9410+ if (r->ra_start != 0ULL ||
9411+ r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
9412+ return 1;
9413+ else
9414+ return 0;
9415+}
9416+
9417+static void put_int(int x, char *buf, int *offp)
9418+{
9419+ x = cpu_to_le32(x);
9420+ memcpy(buf + *offp, &x, sizeof(int));
9421+ *offp += sizeof(int);
9422+}
9423+
9424+static void put_int64(uint64_t x, char *buf, int *offp)
9425+{
9426+ x = cpu_to_le64(x);
9427+ memcpy(buf + *offp, &x, sizeof(uint64_t));
9428+ *offp += sizeof(uint64_t);
9429+}
9430+
9431+static int get_int(char *buf, int *offp)
9432+{
9433+ int value;
9434+ memcpy(&value, buf + *offp, sizeof(int));
9435+ *offp += sizeof(int);
9436+ return le32_to_cpu(value);
9437+}
9438+
9439+static uint64_t get_int64(char *buf, int *offp)
9440+{
9441+ uint64_t value;
9442+
9443+ memcpy(&value, buf + *offp, sizeof(uint64_t));
9444+ *offp += sizeof(uint64_t);
9445+ return le64_to_cpu(value);
9446+}
9447+
9448+#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
9449+
9450+/* Called from recvd to get lock info for a remote node */
9451+int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9452+{
9453+ struct gd_remquery *query = (struct gd_remquery *) msg;
9454+ struct gd_remqueryreply *reply;
9455+ struct dlm_resinfo resinfo;
9456+ struct dlm_queryinfo qinfo;
9457+ struct writequeue_entry *e;
9458+ char *buf;
9459+ gd_lkb_t *lkb;
9460+ int status = 0;
9461+ int bufidx;
9462+ int finished = 0;
9463+ int cur_lock = 0;
9464+ int start_lock = 0;
9465+
9466+ lkb = find_lock_by_id(ls, query->rq_mstlkid);
9467+ if (!lkb) {
9468+ status = -EINVAL;
9469+ goto send_error;
9470+ }
9471+
9472+ qinfo.gqi_resinfo = &resinfo;
9473+ qinfo.gqi_locksize = query->rq_maxlocks;
9474+
9475+ /* Get the resource bits */
9476+ query_resource(lkb->lkb_resource, &resinfo);
9477+
9478+ /* Now get the locks if wanted */
9479+ if (query->rq_maxlocks) {
9480+ qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
9481+ GFP_KERNEL);
9482+ if (!qinfo.gqi_lockinfo) {
9483+ status = -ENOMEM;
9484+ goto send_error;
9485+ }
9486+
9487+ status = query_locks(query->rq_query, lkb, &qinfo);
9488+ if (status && status != -E2BIG) {
9489+ kfree(qinfo.gqi_lockinfo);
9490+ goto send_error;
9491+ }
9492+ }
9493+ else {
9494+ qinfo.gqi_lockinfo = NULL;
9495+ qinfo.gqi_lockcount = 0;
9496+ }
9497+
9498+ /* Send as many blocks as needed for all the locks */
9499+ do {
9500+ int i;
9501+ int msg_len = sizeof(struct gd_remqueryreply);
9502+ int last_msg_len = msg_len; /* keeps compiler quiet */
9503+ int last_lock;
9504+
9505+ /* First work out how many locks we can fit into a block */
9506+ for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
9507+
9508+ last_msg_len = msg_len;
9509+
9510+ msg_len += LOCK_LEN;
9511+ if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
9512+ valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
9513+
9514+ msg_len += sizeof(uint64_t) * 4;
9515+ }
9516+ }
9517+
9518+ /* There must be a neater way of doing this... */
9519+ if (msg_len > PAGE_SIZE) {
9520+ last_lock = i-1;
9521+ msg_len = last_msg_len;
9522+ }
9523+ else {
9524+ last_lock = i;
9525+ }
9526+
9527+ e = lowcomms_get_buffer(nodeid,
9528+ msg_len,
9529+ ls->ls_allocation,
9530+ (char **) &reply);
9531+ if (!e) {
9532+ kfree(qinfo.gqi_lockinfo);
9533+ status = -ENOBUFS;
9534+ goto out;
9535+ }
9536+
9537+ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9538+ reply->rq_header.rh_length = msg_len;
9539+ reply->rq_header.rh_lkid = msg->rh_lkid;
9540+ reply->rq_header.rh_lockspace = msg->rh_lockspace;
9541+
9542+ reply->rq_status = status;
9543+ reply->rq_startlock = cur_lock;
9544+ reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
9545+ reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
9546+ reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
9547+ memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
9548+
9549+ buf = (char *)reply;
9550+ bufidx = sizeof(struct gd_remqueryreply);
9551+
9552+ for (; cur_lock < last_lock; cur_lock++) {
9553+
9554+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
9555+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
9556+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
9557+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
9558+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
9559+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
9560+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
9561+
9562+ if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
9563+ valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
9564+
9565+ buf[bufidx++] = 1;
9566+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
9567+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
9568+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
9569+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
9570+ }
9571+ else {
9572+ buf[bufidx++] = 0;
9573+ }
9574+ }
9575+
9576+ if (cur_lock == qinfo.gqi_lockcount) {
9577+ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
9578+ finished = 1;
9579+ }
9580+ else {
9581+ reply->rq_header.rh_flags = 0;
9582+ }
9583+
9584+ reply->rq_numlocks = cur_lock - start_lock;
9585+ start_lock = cur_lock;
9586+
9587+ midcomms_send_buffer(&reply->rq_header, e);
9588+ } while (!finished);
9589+
9590+ kfree(qinfo.gqi_lockinfo);
9591+ out:
9592+ return status;
9593+
9594+ send_error:
9595+ e = lowcomms_get_buffer(nodeid,
9596+ sizeof(struct gd_remqueryreply),
9597+ ls->ls_allocation,
9598+ (char **) &reply);
9599+ if (!e) {
9600+ status = -ENOBUFS;
9601+ goto out;
9602+ }
9603+ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9604+ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
9605+ reply->rq_header.rh_length = sizeof(struct gd_remqueryreply);
9606+ reply->rq_header.rh_lkid = msg->rh_lkid;
9607+ reply->rq_header.rh_lockspace = msg->rh_lockspace;
9608+ reply->rq_status = status;
9609+ reply->rq_numlocks = 0;
9610+ reply->rq_startlock = 0;
9611+ reply->rq_grantcount = 0;
9612+ reply->rq_convcount = 0;
9613+ reply->rq_waitcount = 0;
9614+
9615+ midcomms_send_buffer(&reply->rq_header, e);
9616+
9617+ return status;
9618+}
9619+
9620+/* Reply to a remote query */
9621+int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
9622+{
9623+ gd_lkb_t *query_lkb;
9624+ struct dlm_queryinfo *qinfo;
9625+ struct gd_remqueryreply *reply;
9626+ char *buf;
9627+ int i;
9628+ int bufidx;
9629+
9630+ query_lkb = find_lock_by_id(ls, msg->rh_lkid);
9631+ if (!query_lkb)
9632+ return -EINVAL;
9633+
9634+ qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
9635+ reply = (struct gd_remqueryreply *) msg;
9636+
9637+ /* Copy the easy bits first */
9638+ qinfo->gqi_lockcount += reply->rq_numlocks;
9639+ if (qinfo->gqi_resinfo) {
9640+ qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
9641+ qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
9642+ qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
9643+ memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
9644+ DLM_LVB_LEN);
9645+ }
9646+
9647+ /* Now unpack the locks */
9648+ bufidx = sizeof(struct gd_remqueryreply);
9649+ buf = (char *) msg;
9650+
9651+ GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
9652+ printk("start = %d, num + %d. Max= %d\n",
9653+ reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
9654+
9655+ for (i = reply->rq_startlock;
9656+ i < reply->rq_startlock + reply->rq_numlocks; i++) {
9657+ qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
9658+ qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
9659+ qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
9660+ qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
9661+ qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
9662+ qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
9663+ qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
9664+ if (buf[bufidx++]) {
9665+ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
9666+ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
9667+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
9668+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
9669+ }
9670+ else {
9671+ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
9672+ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9673+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
9674+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9675+ }
9676+ }
9677+
 9678+ /* If this was the last block then tell the user */
9679+ if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
9680+ query_lkb->lkb_retstatus = reply->rq_status;
5cdbd17b 9681+ queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
4bf12011 9682+ wake_astd();
9683+ }
9684+
9685+ return 0;
9686+}
9687+
9688+/* Aggregate resource information */
9689+static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo)
9690+{
9691+ struct list_head *tmp;
9692+
9693+
9694+ if (rsb->res_lvbptr)
9695+ memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
9696+
9697+ resinfo->rsi_grantcount = 0;
9698+ list_for_each(tmp, &rsb->res_grantqueue) {
9699+ resinfo->rsi_grantcount++;
9700+ }
9701+
9702+ resinfo->rsi_waitcount = 0;
9703+ list_for_each(tmp, &rsb->res_waitqueue) {
9704+ resinfo->rsi_waitcount++;
9705+ }
9706+
9707+ resinfo->rsi_convcount = 0;
9708+ list_for_each(tmp, &rsb->res_convertqueue) {
9709+ resinfo->rsi_convcount++;
9710+ }
9711+
9712+ return 0;
9713+}
9714+
9715+static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9716+{
9717+ int entry;
9718+
9719+ /* Don't fill it in if the buffer is full */
9720+ if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
9721+ return -E2BIG;
9722+
9723+ /* gqi_lockcount contains the number of locks we have returned */
9724+ entry = qinfo->gqi_lockcount++;
9725+
9726+ /* Fun with master copies */
9727+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
9728+ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
9729+ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
9730+ }
9731+ else {
9732+ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
9733+ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
9734+ }
9735+
9736+ /* Also make sure we always have a valid nodeid in there, the
9737+ calling end may not know which node "0" is */
9738+ if (lkb->lkb_nodeid)
9739+ qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
9740+ else
9741+ qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
9742+
9743+ if (lkb->lkb_parent)
9744+ qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
9745+ else
9746+ qinfo->gqi_lockinfo[entry].lki_parent = 0;
9747+
9748+ qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
9749+ qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
9750+ qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
9751+
9752+ if (lkb->lkb_range) {
9753+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
9754+ lkb->lkb_range[GR_RANGE_START];
9755+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
9756+ lkb->lkb_range[GR_RANGE_END];
9757+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
9758+ lkb->lkb_range[RQ_RANGE_START];
9759+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
9760+ lkb->lkb_range[RQ_RANGE_END];
9761+ } else {
 9762+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
 9763+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
 9764+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
 9765+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
9766+ }
9767+ return 0;
9768+}
9769+
9770+static int query_lkb_queue(struct list_head *queue, int query,
9771+ struct dlm_queryinfo *qinfo)
9772+{
9773+ struct list_head *tmp;
9774+ int status = 0;
9775+ int mode = query & DLM_QUERY_MODE_MASK;
9776+
9777+ list_for_each(tmp, queue) {
9778+ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9779+ int lkmode;
9780+
9781+ if (query & DLM_QUERY_RQMODE)
9782+ lkmode = lkb->lkb_rqmode;
9783+ else
9784+ lkmode = lkb->lkb_grmode;
9785+
9786+ /* Add the LKB info to the list if it matches the criteria in
9787+ * the query bitmap */
9788+ switch (query & DLM_QUERY_MASK) {
9789+ case DLM_QUERY_LOCKS_ALL:
9790+ status = add_lock(lkb, qinfo);
9791+ break;
9792+
9793+ case DLM_QUERY_LOCKS_HIGHER:
9794+ if (lkmode > mode)
9795+ status = add_lock(lkb, qinfo);
9796+ break;
9797+
9798+ case DLM_QUERY_LOCKS_EQUAL:
9799+ if (lkmode == mode)
9800+ status = add_lock(lkb, qinfo);
9801+ break;
9802+
9803+ case DLM_QUERY_LOCKS_LOWER:
9804+ if (lkmode < mode)
9805+ status = add_lock(lkb, qinfo);
9806+ break;
9807+ }
9808+ }
9809+ return status;
9810+}
9811+
9812+/*
9813+ * Return 1 if the locks' ranges overlap
9814+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
9815+ */
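+/*
+ * e.g. a request over [0,99] does not overlap a grant held over [100,199],
+ * while a request over [50,150] does.
+ */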
9816+static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
9817+{
9818+ if (!lkb1->lkb_range || !lkb2->lkb_range)
9819+ return 1;
9820+
9821+ if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
9822+ lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
9823+ return 0;
9824+
9825+ return 1;
9826+}
9827+extern const int __dlm_compat_matrix[8][8];
9828+
9829+
9830+static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9831+{
9832+ struct list_head *tmp;
9833+ int status = 0;
9834+
9835+ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9836+ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9837+
9838+ if (ranges_overlap(lkb, qlkb) &&
9839+ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
9840+ status = add_lock(lkb, qinfo);
9841+ }
9842+
9843+ return status;
9844+}
9845+
9846+static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
9847+{
9848+ struct list_head *tmp;
9849+ int status = 0;
9850+
9851+ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9852+ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
9853+
9854+ if (!(ranges_overlap(lkb, qlkb) &&
9855+ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
9856+ status = add_lock(lkb, qinfo);
9857+ }
9858+
9859+ return status;
9860+}
9861+
9862+/* Gather a list of appropriate locks */
9863+static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
9864+{
9865+ int status = 0;
9866+
9867+
 9868+ /* Mask in the actual granted/requested mode of the lock if LOCK_THIS
9869+ * was requested as the mode
9870+ */
9871+ if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
9872+ query &= ~DLM_QUERY_MODE_MASK;
9873+ if (query & DLM_QUERY_RQMODE)
9874+ query |= lkb->lkb_rqmode;
9875+ else
9876+ query |= lkb->lkb_grmode;
9877+ }
9878+
9879+ qinfo->gqi_lockcount = 0;
9880+
9881+ /* BLOCKING/NOTBLOCK only look at the granted queue */
9882+ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
9883+ return get_blocking_locks(lkb, qinfo);
9884+
9885+ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
9886+ return get_nonblocking_locks(lkb, qinfo);
9887+
9888+ /* Do the lock queues that were requested */
9889+ if (query & DLM_QUERY_QUEUE_GRANT) {
9890+ status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
9891+ query, qinfo);
9892+ }
9893+
9894+ if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
9895+ status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
9896+ query, qinfo);
9897+ }
9898+
9899+ if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
9900+ status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
9901+ query, qinfo);
9902+ }
9903+
9904+
9905+ return status;
9906+}
9907+
9908+EXPORT_SYMBOL(dlm_query);
9909+/*
9910+ * Overrides for Emacs so that we follow Linus's tabbing style.
9911+ * Emacs will notice this stuff at the end of the file and automatically
9912+ * adjust the settings for this buffer only. This must remain at the end
9913+ * of the file.
9914+ * ---------------------------------------------------------------------------
9915+ * Local variables:
9916+ * c-file-style: "linux"
9917+ * End:
9918+ */
9919diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
9920--- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9921+++ linux-patched/cluster/dlm/queries.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 9922@@ -0,0 +1,20 @@
9923+/******************************************************************************
9924+*******************************************************************************
9925+**
9926+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9927+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9928+**
9929+** This copyrighted material is made available to anyone wishing to use,
9930+** modify, copy, or redistribute it subject to the terms and conditions
9931+** of the GNU General Public License v.2.
9932+**
9933+*******************************************************************************
9934+******************************************************************************/
9935+
9936+#ifndef __QUERIES_DOT_H__
9937+#define __QUERIES_DOT_H__
9938+
9939+extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9940+extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
9941+
9942+#endif /* __QUERIES_DOT_H__ */
9943diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
9944--- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
9945+++ linux-patched/cluster/dlm/rebuild.c 2004-06-29 20:01:20.000000000 +0800
9946@@ -0,0 +1,1245 @@
4bf12011 9947+/******************************************************************************
9948+*******************************************************************************
9949+**
9950+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9951+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9952+**
9953+** This copyrighted material is made available to anyone wishing to use,
9954+** modify, copy, or redistribute it subject to the terms and conditions
9955+** of the GNU General Public License v.2.
9956+**
9957+*******************************************************************************
9958+******************************************************************************/
9959+
9960+/*
9961+ * Rebuild RSB's on new masters. Functions for transferring locks and
9962+ * subresources to new RSB masters during recovery.
9963+ */
9964+
9965+#include "dlm_internal.h"
9966+#include "reccomms.h"
9967+#include "lkb.h"
9968+#include "rsb.h"
9969+#include "nodes.h"
9970+#include "config.h"
9971+#include "memory.h"
9972+#include "recover.h"
9973+
9974+
9975+/* Types of entity serialised in remastering messages */
9976+#define REMASTER_ROOTRSB 1
9977+#define REMASTER_RSB 2
9978+#define REMASTER_LKB 3
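+/*
+ * A remastering buffer is a stream of tagged records: a one-byte type
+ * (REMASTER_ROOTRSB, REMASTER_RSB or REMASTER_LKB) followed by an rsb
+ * serialised by serialise_rsb() or an lkb serialised by serialise_lkb().
+ * The stream is built by pack_rsb_tree() and friends and unpacked again in
+ * rebuild_rsbs_recv().
+ */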
9979+
9980+struct rcom_fill {
9981+ char * outbuf; /* Beginning of data */
9982+ int offset; /* Current offset into outbuf */
9983+ int maxlen; /* Max value of offset */
9984+ int remasterid;
9985+ int count;
9986+ gd_res_t * rsb;
9987+ gd_res_t * subrsb;
9988+ gd_lkb_t * lkb;
9989+ struct list_head * lkbqueue;
9990+ char more;
9991+};
9992+typedef struct rcom_fill rcom_fill_t;
9993+
9994+
9995+struct rebuild_node {
9996+ struct list_head list;
9997+ int nodeid;
9998+ gd_res_t * rootrsb;
9999+};
10000+typedef struct rebuild_node rebuild_node_t;
10001+
10002+
10003+/*
10004+ * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10005+ * master. The rsb will be "done" with recovery when the new master has
10006+ * replied with all the new remote lockid's for this rsb's lkb's.
10007+ */
10008+
10009+void expect_new_lkids(gd_res_t *rsb)
10010+{
10011+ rsb->res_newlkid_expect = 0;
10012+ recover_list_add(rsb);
10013+}
10014+
10015+/*
10016+ * This function is called on root rsb or subrsb when another lkb is being sent
10017+ * to the new master for which we expect to receive a corresponding remote lkid
10018+ */
10019+
10020+void need_new_lkid(gd_res_t *rsb)
10021+{
10022+ gd_res_t *root = rsb;
10023+
10024+ if (rsb->res_parent)
10025+ root = rsb->res_root;
10026+
10027+ if (!root->res_newlkid_expect)
10028+ recover_list_add(root);
10029+ else
10030+ GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10031+
10032+ root->res_newlkid_expect++;
10033+}
10034+
10035+/*
10036+ * This function is called for each lkb for which a new remote lkid is
10037+ * received. Decrement the expected number of remote lkids expected for the
10038+ * root rsb.
10039+ */
10040+
10041+void have_new_lkid(gd_lkb_t *lkb)
10042+{
10043+ gd_res_t *root = lkb->lkb_resource;
10044+
10045+ if (root->res_parent)
10046+ root = root->res_root;
10047+
10048+ down_write(&root->res_lock);
10049+
10050+ GDLM_ASSERT(root->res_newlkid_expect,
10051+ printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10052+
10053+ root->res_newlkid_expect--;
10054+
10055+ if (!root->res_newlkid_expect) {
10056+ clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10057+ recover_list_del(root);
10058+ }
10059+ up_write(&root->res_lock);
10060+}
10061+
10062+/*
10063+ * Return the rebuild struct for a node - will create an entry on the rootrsb
10064+ * list if necessary.
10065+ *
10066+ * Currently no locking is needed here as it all happens in the gdlm_recvd
10067+ * thread
10068+ */
10069+
10070+static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid)
10071+{
10072+ rebuild_node_t *node = NULL;
10073+
10074+ list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10075+ if (node->nodeid == nodeid)
10076+ return node;
10077+ }
10078+
10079+ /* Not found, add one */
10080+ node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10081+ if (!node)
10082+ return NULL;
10083+
10084+ node->nodeid = nodeid;
10085+ node->rootrsb = NULL;
10086+ list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10087+
10088+ return node;
10089+}
10090+
10091+/*
10092+ * Tidy up after a rebuild run. Called when all recovery has finished
10093+ */
10094+
10095+void rebuild_freemem(gd_ls_t *ls)
10096+{
10097+ rebuild_node_t *node = NULL, *s;
10098+
10099+ list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10100+ list_del(&node->list);
10101+ kfree(node);
10102+ }
10103+}
10104+
10105+static void put_int(int x, char *buf, int *offp)
10106+{
10107+ x = cpu_to_le32(x);
10108+ memcpy(buf + *offp, &x, sizeof(int));
10109+ *offp += sizeof(int);
10110+}
10111+
10112+static void put_int64(uint64_t x, char *buf, int *offp)
10113+{
10114+ x = cpu_to_le64(x);
10115+ memcpy(buf + *offp, &x, sizeof(uint64_t));
10116+ *offp += sizeof(uint64_t);
10117+}
10118+
10119+static void put_bytes(char *x, int len, char *buf, int *offp)
10120+{
10121+ put_int(len, buf, offp);
10122+ memcpy(buf + *offp, x, len);
10123+ *offp += len;
10124+}
10125+
10126+static void put_char(char x, char *buf, int *offp)
10127+{
10128+ buf[*offp] = x;
10129+ *offp += 1;
10130+}
10131+
10132+static int get_int(char *buf, int *offp)
10133+{
10134+ int value;
10135+ memcpy(&value, buf + *offp, sizeof(int));
10136+ *offp += sizeof(int);
10137+ return le32_to_cpu(value);
10138+}
10139+
10140+static uint64_t get_int64(char *buf, int *offp)
10141+{
10142+ uint64_t value;
10143+
10144+ memcpy(&value, buf + *offp, sizeof(uint64_t));
10145+ *offp += sizeof(uint64_t);
10146+ return le64_to_cpu(value);
10147+}
10148+
10149+static char get_char(char *buf, int *offp)
10150+{
10151+ char x = buf[*offp];
10152+
10153+ *offp += 1;
10154+ return x;
10155+}
10156+
10157+static void get_bytes(char *bytes, int *len, char *buf, int *offp)
10158+{
10159+ *len = get_int(buf, offp);
10160+ memcpy(bytes, buf + *offp, *len);
10161+ *offp += *len;
10162+}
10163+
10164+static int lkb_length(gd_lkb_t *lkb)
10165+{
10166+ int len = 0;
10167+
10168+ len += sizeof(int); /* lkb_id */
 10169+ len += sizeof(int); /* lkb_resource->res_remasterid */
10170+ len += sizeof(int); /* lkb_flags */
10171+ len += sizeof(int); /* lkb_status */
10172+ len += sizeof(char); /* lkb_rqmode */
10173+ len += sizeof(char); /* lkb_grmode */
10174+ len += sizeof(int); /* lkb_childcnt */
10175+ len += sizeof(int); /* lkb_parent->lkb_id */
10176+ len += sizeof(int); /* lkb_bastaddr */
10177+
10178+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10179+ len += sizeof(int); /* number of lvb bytes */
10180+ len += DLM_LVB_LEN;
10181+ }
10182+
10183+ if (lkb->lkb_range) {
10184+ len += sizeof(uint64_t);
10185+ len += sizeof(uint64_t);
10186+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
10187+ len += sizeof(uint64_t);
10188+ len += sizeof(uint64_t);
10189+ }
10190+ }
10191+
10192+ return len;
10193+}
10194+
10195+/*
10196+ * It's up to the caller to be sure there's enough space in the buffer.
10197+ */
10198+
10199+static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp)
10200+{
10201+ int flags;
10202+
10203+ /* Need to tell the remote end if we have a range */
10204+ flags = lkb->lkb_flags;
10205+ if (lkb->lkb_range)
10206+ flags |= GDLM_LKFLG_RANGE;
10207+
10208+ /*
10209+ * See lkb_length()
 10210+ * Total: 30 (no lvb) or 66 (with lvb) bytes, plus 16 or 32 bytes if a range is sent
10211+ */
10212+
10213+ put_int(lkb->lkb_id, buf, offp);
10214+ put_int(lkb->lkb_resource->res_remasterid, buf, offp);
10215+ put_int(flags, buf, offp);
10216+ put_int(lkb->lkb_status, buf, offp);
10217+ put_char(lkb->lkb_rqmode, buf, offp);
10218+ put_char(lkb->lkb_grmode, buf, offp);
10219+ put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
10220+
10221+ if (lkb->lkb_parent)
10222+ put_int(lkb->lkb_parent->lkb_id, buf, offp);
10223+ else
10224+ put_int(0, buf, offp);
10225+
10226+ if (lkb->lkb_bastaddr)
10227+ put_int(1, buf, offp);
10228+ else
10229+ put_int(0, buf, offp);
10230+
10231+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10232+ GDLM_ASSERT(lkb->lkb_lvbptr,);
10233+ put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
10234+ }
10235+
10236+ /* Only send the range we actually need */
10237+ if (lkb->lkb_range) {
10238+ switch (lkb->lkb_status) {
10239+ case GDLM_LKSTS_CONVERT:
10240+ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10241+ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10242+ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10243+ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10244+ break;
10245+ case GDLM_LKSTS_WAITING:
10246+ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10247+ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10248+ break;
10249+ case GDLM_LKSTS_GRANTED:
10250+ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10251+ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10252+ break;
10253+ default:
10254+ GDLM_ASSERT(0,);
10255+ }
10256+ }
10257+}
10258+
10259+static int rsb_length(gd_res_t *rsb)
10260+{
10261+ int len = 0;
10262+
10263+ len += sizeof(int); /* number of res_name bytes */
10264+ len += rsb->res_length; /* res_name */
10265+ len += sizeof(int); /* res_remasterid */
10266+ len += sizeof(int); /* res_parent->res_remasterid */
10267+
10268+ return len;
10269+}
10270+
10271+static inline gd_res_t *next_subrsb(gd_res_t *subrsb)
10272+{
10273+ struct list_head *tmp;
10274+ gd_res_t *r;
10275+
10276+ tmp = subrsb->res_subreslist.next;
10277+ r = list_entry(tmp, gd_res_t, res_subreslist);
10278+
10279+ return r;
10280+}
10281+
10282+static inline int last_in_list(gd_res_t *r, struct list_head *head)
10283+{
10284+ gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist);
10285+
10286+ if (last == r)
10287+ return 1;
10288+ return 0;
10289+}
10290+
10291+/*
10292+ * Used to decide if an rsb should be rebuilt on a new master. An rsb only
 10293+ * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's on the
10294+ * wait queue are not rebuilt.
10295+ */
10296+
10297+static int lkbs_to_remaster(gd_res_t *r)
10298+{
10299+ gd_lkb_t *lkb;
10300+ gd_res_t *sub;
10301+
10302+ if (!list_empty(&r->res_grantqueue) ||
10303+ !list_empty(&r->res_convertqueue))
10304+ return TRUE;
10305+
10306+ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
10307+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10308+ continue;
10309+ return TRUE;
10310+ }
10311+
10312+ list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
10313+ if (!list_empty(&sub->res_grantqueue) ||
10314+ !list_empty(&sub->res_convertqueue))
10315+ return TRUE;
10316+
10317+ list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
10318+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10319+ continue;
10320+ return TRUE;
10321+ }
10322+ }
10323+
10324+ return FALSE;
10325+}
10326+
10327+static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp)
10328+{
10329+ /*
10330+ * See rsb_length()
10331+ * Total: 36 bytes (4 + 24 + 4 + 4)
10332+ */
10333+
10334+ put_bytes(rsb->res_name, rsb->res_length, buf, offp);
10335+ put_int(rsb->res_remasterid, buf, offp);
10336+
10337+ if (rsb->res_parent)
10338+ put_int(rsb->res_parent->res_remasterid, buf, offp);
10339+ else
10340+ put_int(0, buf, offp);
10341+
10342+ GDLM_ASSERT(!rsb->res_lvbptr,);
10343+}
10344+
10345+/*
10346+ * Flatten an LKB into a buffer for sending to the new RSB master. As a
10347+ * side-effect the nodeid of the lock is set to the nodeid of the new RSB
10348+ * master.
10349+ */
10350+
10351+static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill)
10352+{
10353+ if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
10354+ goto nospace;
10355+
10356+ lkb->lkb_nodeid = r->res_nodeid;
10357+
10358+ put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
10359+ serialise_lkb(lkb, fill->outbuf, &fill->offset);
10360+
10361+ fill->count++;
10362+ need_new_lkid(r);
10363+ return 0;
10364+
10365+ nospace:
10366+ return -ENOSPC;
10367+}
10368+
10369+/*
10370+ * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
10371+ */
10372+
10373+static int pack_lkb_queue(gd_res_t *r, struct list_head *queue,
10374+ rcom_fill_t *fill)
10375+{
10376+ gd_lkb_t *lkb;
10377+ int error;
10378+
10379+ list_for_each_entry(lkb, queue, lkb_statequeue) {
10380+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10381+ continue;
10382+
10383+ error = pack_one_lkb(r, lkb, fill);
10384+ if (error)
10385+ goto nospace;
10386+ }
10387+
10388+ return 0;
10389+
10390+ nospace:
10391+ fill->lkb = lkb;
10392+ fill->lkbqueue = queue;
10393+
10394+ return error;
10395+}
10396+
10397+static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill)
10398+{
10399+ int error;
10400+
10401+ error = pack_lkb_queue(r, &r->res_grantqueue, fill);
10402+ if (error)
10403+ goto nospace;
10404+
10405+ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10406+ if (error)
10407+ goto nospace;
10408+
10409+ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10410+
10411+ nospace:
10412+ return error;
10413+}
10414+
10415+/*
10416+ * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
10417+ * queue and full lkb queues.
10418+ */
10419+
10420+static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill)
10421+{
10422+ struct list_head *tmp, *start, *end;
10423+ gd_lkb_t *lkb;
10424+ int error;
10425+
10426+ /*
10427+ * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
10428+ */
10429+
10430+ error = pack_one_lkb(r, fill->lkb, fill);
10431+ if (error)
10432+ goto out;
10433+
10434+ start = fill->lkb->lkb_statequeue.next;
10435+ end = fill->lkbqueue;
10436+
10437+ for (tmp = start; tmp != end; tmp = tmp->next) {
10438+ lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
10439+
10440+ error = pack_one_lkb(r, lkb, fill);
10441+ if (error) {
10442+ fill->lkb = lkb;
10443+ goto out;
10444+ }
10445+ }
10446+
10447+ /*
10448+ * Pack all lkb's on r's queues following fill->lkbqueue.
10449+ */
10450+
10451+ if (fill->lkbqueue == &r->res_waitqueue)
10452+ goto out;
10453+ if (fill->lkbqueue == &r->res_convertqueue)
10454+ goto skip;
10455+
10456+ GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
10457+
10458+ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10459+ if (error)
10460+ goto out;
10461+ skip:
10462+ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10463+
10464+ out:
10465+ return error;
10466+}
10467+
10468+static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill)
10469+{
10470+ int error;
10471+
10472+ down_write(&subrsb->res_lock);
10473+
10474+ if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
10475+ goto nospace;
10476+
10477+ subrsb->res_nodeid = rsb->res_nodeid;
10478+ subrsb->res_remasterid = ++fill->remasterid;
10479+
10480+ put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
10481+ serialise_rsb(subrsb, fill->outbuf, &fill->offset);
10482+
10483+ error = pack_lkb_queues(subrsb, fill);
10484+ if (error)
10485+ goto nospace;
10486+
10487+ up_write(&subrsb->res_lock);
10488+
10489+ return 0;
10490+
10491+ nospace:
10492+ up_write(&subrsb->res_lock);
10493+ fill->subrsb = subrsb;
10494+
10495+ return -ENOSPC;
10496+}
10497+
10498+static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill)
10499+{
10500+ gd_res_t *subrsb;
10501+ int error = 0;
10502+
10503+ /*
10504+ * When an initial subrsb is given, we know it needs to be packed.
10505+ * When no initial subrsb is given, begin with the first (if any exist).
10506+ */
10507+
10508+ if (!in_subrsb) {
10509+ if (list_empty(&rsb->res_subreslist))
10510+ goto out;
10511+
10512+ subrsb = list_entry(rsb->res_subreslist.next, gd_res_t,
10513+ res_subreslist);
10514+ } else
10515+ subrsb = in_subrsb;
10516+
10517+ for (;;) {
10518+ error = pack_one_subrsb(rsb, subrsb, fill);
10519+ if (error)
10520+ goto out;
10521+
10522+ if (last_in_list(subrsb, &rsb->res_subreslist))
10523+ break;
10524+
10525+ subrsb = next_subrsb(subrsb);
10526+ }
10527+
10528+ out:
10529+ return error;
10530+}
10531+
10532+/*
10533+ * Finish packing whatever is left in an rsb tree. If space runs out while
10534+ * finishing, save subrsb/lkb and this will be called again for the same rsb.
10535+ *
10536+ * !subrsb && lkb, we left off part way through root rsb's lkbs.
10537+ * subrsb && !lkb, we left off just before starting a new subrsb.
10538+ * subrsb && lkb, we left off part way through a subrsb's lkbs.
10539+ * !subrsb && !lkb, we shouldn't be in this function, but starting
10540+ * a new rsb in pack_rsb_tree().
10541+ */
10542+
10543+static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb,
10544+ rcom_fill_t *fill)
10545+{
10546+ gd_res_t *subrsb = NULL;
10547+ int error = 0;
10548+
10549+ if (!fill->subrsb && fill->lkb) {
10550+ error = pack_lkb_remaining(rsb, fill);
10551+ if (error)
10552+ goto out;
10553+
10554+ error = pack_subrsbs(rsb, NULL, fill);
10555+ if (error)
10556+ goto out;
10557+ }
10558+
10559+ else if (fill->subrsb && !fill->lkb) {
10560+ error = pack_subrsbs(rsb, fill->subrsb, fill);
10561+ if (error)
10562+ goto out;
10563+ }
10564+
10565+ else if (fill->subrsb && fill->lkb) {
10566+ error = pack_lkb_remaining(fill->subrsb, fill);
10567+ if (error)
10568+ goto out;
10569+
10570+ if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
10571+ goto out;
10572+
10573+ subrsb = next_subrsb(fill->subrsb);
10574+
10575+ error = pack_subrsbs(rsb, subrsb, fill);
10576+ if (error)
10577+ goto out;
10578+ }
10579+
10580+ fill->subrsb = NULL;
10581+ fill->lkb = NULL;
10582+
10583+ out:
10584+ return error;
10585+}
10586+
10587+/*
10588+ * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
10589+ * buffer. When the buffer runs out of space, save the place to restart (the
10590+ * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
10591+ */
10592+
10593+static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill)
10594+{
10595+ int error = -ENOSPC;
10596+
10597+ fill->remasterid = 0;
10598+
10599+ /*
10600+ * Pack the root rsb itself. A 1 byte type precedes the serialised
10601+ * rsb. Then pack the lkb's for the root rsb.
10602+ */
10603+
10604+ down_write(&rsb->res_lock);
10605+
10606+ if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
10607+ goto out;
10608+
10609+ rsb->res_remasterid = ++fill->remasterid;
10610+ put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
10611+ serialise_rsb(rsb, fill->outbuf, &fill->offset);
10612+
10613+ error = pack_lkb_queues(rsb, fill);
10614+ if (error)
10615+ goto out;
10616+
10617+ up_write(&rsb->res_lock);
10618+
10619+ /*
10620+ * Pack subrsb/lkb's under the root rsb.
10621+ */
10622+
10623+ error = pack_subrsbs(rsb, NULL, fill);
10624+
10625+ return error;
10626+
10627+ out:
10628+ up_write(&rsb->res_lock);
10629+ return error;
10630+}
10631+
10632+/*
10633+ * Given an RSB, return the next RSB that should be sent to a new master.
10634+ */
10635+
10636+static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb)
10637+{
10638+ struct list_head *tmp, *start, *end;
10639+ gd_res_t *r;
10640+
10641+ if (!rsb)
10642+ start = ls->ls_rootres.next;
10643+ else
10644+ start = rsb->res_rootlist.next;
10645+
10646+ end = &ls->ls_rootres;
10647+
10648+ for (tmp = start; tmp != end; tmp = tmp->next) {
10649+ r = list_entry(tmp, gd_res_t, res_rootlist);
10650+
10651+ if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
10652+ if (r->res_nodeid && lkbs_to_remaster(r)) {
10653+ expect_new_lkids(r);
10654+ return r;
10655+ } else
10656+ clear_bit(RESFL_NEW_MASTER, &r->res_flags);
10657+ }
10658+ }
10659+
10660+ return NULL;
10661+}
10662+
10663+/*
10664+ * Given an rcom buffer, fill it with RSB's that need to be sent to a single
10665+ * new master node. In the case where all the data to send to one node
10666+ * requires multiple messages, this function needs to resume filling each
10667+ * successive buffer from the point where it left off when the previous buffer
10668+ * filled up.
10669+ */
10670+
10671+static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid)
10672+{
10673+ gd_res_t *rsb, *prev_rsb = fill->rsb;
10674+ int error;
10675+
10676+ fill->offset = 0;
10677+
10678+ if (!prev_rsb) {
10679+
10680+ /*
10681+ * The first time this function is called.
10682+ */
10683+
10684+ rsb = next_remastered_rsb(ls, NULL);
10685+ if (!rsb)
10686+ goto no_more;
10687+
10688+ } else if (fill->subrsb || fill->lkb) {
10689+
10690+ /*
10691+ * Continue packing an rsb tree that was partially packed last
10692+ * time (fill->subrsb/lkb indicates where packing of last block
10693+ * left off)
10694+ */
10695+
10696+ rsb = prev_rsb;
10697+ *nodeid = rsb->res_nodeid;
10698+
10699+ error = pack_rsb_tree_remaining(ls, rsb, fill);
10700+ if (error == -ENOSPC)
10701+ goto more;
10702+
10703+ rsb = next_remastered_rsb(ls, prev_rsb);
10704+ if (!rsb)
10705+ goto no_more;
10706+
10707+ if (rsb->res_nodeid != prev_rsb->res_nodeid)
10708+ goto more;
10709+ } else {
10710+ rsb = prev_rsb;
10711+ }
10712+
10713+ /*
10714+ * Pack rsb trees into the buffer until we run out of space, run out of
10715+ * new rsb's or hit a new nodeid.
10716+ */
10717+
10718+ *nodeid = rsb->res_nodeid;
10719+
10720+ for (;;) {
10721+ error = pack_rsb_tree(ls, rsb, fill);
10722+ if (error == -ENOSPC)
10723+ goto more;
10724+
10725+ prev_rsb = rsb;
10726+
10727+ rsb = next_remastered_rsb(ls, prev_rsb);
10728+ if (!rsb)
10729+ goto no_more;
10730+
10731+ if (rsb->res_nodeid != prev_rsb->res_nodeid)
10732+ goto more;
10733+ }
10734+
10735+ more:
10736+ fill->more = 1;
10737+ fill->rsb = rsb;
10738+ return;
10739+
10740+ no_more:
10741+ fill->more = 0;
10742+}
10743+
10744+/*
10745+ * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
10746+ */
10747+
10748+int rebuild_rsbs_send(gd_ls_t *ls)
10749+{
10750+ gd_rcom_t *rc;
10751+ rcom_fill_t fill;
10752+ uint32_t nodeid;
10753+ int error;
10754+
10755+ GDLM_ASSERT(recover_list_empty(ls),);
10756+
10757+ log_all(ls, "rebuild locks");
10758+
10759+ error = -ENOMEM;
10760+ rc = allocate_rcom_buffer(ls);
10761+ if (!rc)
10762+ goto ret;
10763+
10764+ error = 0;
10765+ memset(&fill, 0, sizeof(rcom_fill_t));
10766+ fill.outbuf = rc->rc_buf;
10767+ fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
10768+
10769+ do {
10770+ fill_rcom_buffer(ls, &fill, &nodeid);
10771+ if (!fill.offset)
10772+ break;
10773+
10774+ rc->rc_datalen = fill.offset;
10775+ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
10776+ if (error)
10777+ goto out;
10778+
10779+ schedule();
10780+ error = gdlm_recovery_stopped(ls);
10781+ if (error)
10782+ goto out;
10783+ }
10784+ while (fill.more);
10785+
10786+ error = gdlm_wait_function(ls, &recover_list_empty);
10787+
10788+ log_all(ls, "rebuilt %d locks", fill.count);
10789+
10790+ out:
10791+ rebuild_freemem(ls);
10792+ free_rcom_buffer(rc);
10793+
10794+ ret:
10795+ return error;
10796+}
10797+
10798+static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid,
10799+ gd_res_t *rootrsb)
10800+{
10801+ gd_res_t *rsb;
10802+
10803+ GDLM_ASSERT(rootrsb,);
10804+
10805+ if (rootrsb->res_remasterid == remasterid) {
10806+ rsb = rootrsb;
10807+ goto out;
10808+ }
10809+
10810+ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10811+ if (rsb->res_remasterid == remasterid)
10812+ goto out;
10813+ }
10814+ rsb = NULL;
10815+
10816+ out:
10817+ return rsb;
10818+}
10819+
10820+/*
10821+ * Search a queue for the given remote lock id (remlkid).
10822+ */
10823+
10824+static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid,
10825+ int remid)
10826+{
10827+ gd_lkb_t *lkb;
10828+
10829+ list_for_each_entry(lkb, statequeue, lkb_statequeue) {
10830+ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
10831+ return lkb;
10832+ }
10833+ }
10834+
10835+ return NULL;
10836+}
10837+
10838+/*
10839+ * Given a remote lock ID (and a parent resource), return the local LKB for it
 10840+ * Hopefully we don't need to do this too often on deep lock trees. This is
10841+ * VERY suboptimal for anything but the smallest lock trees. It searches the
10842+ * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
10843+ * returns the LKB address. OPTIMISATION: we should keep a list of these while
10844+ * we are building up the remastered LKBs
10845+ */
10846+
10847+static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid)
10848+{
10849+ gd_lkb_t *lkb;
10850+ gd_res_t *rsb;
10851+
10852+ lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
10853+ if (lkb)
10854+ goto out;
10855+
10856+ lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
10857+ if (lkb)
10858+ goto out;
10859+
10860+ lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
10861+ if (lkb)
10862+ goto out;
10863+
10864+ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10865+ lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
10866+ if (lkb)
10867+ goto out;
10868+
10869+ lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
10870+ if (lkb)
10871+ goto out;
10872+
10873+ lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
10874+ if (lkb)
10875+ goto out;
10876+ }
10877+ lkb = NULL;
10878+
10879+ out:
10880+ return lkb;
10881+}
10882+
10883+/*
10884+ * Unpack an LKB from a remaster operation
10885+ */
10886+
10887+static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb,
10888+ char *buf, int *ptr, char *outbuf, int *outoffp)
10889+{
10890+ gd_lkb_t *lkb;
10891+ gd_res_t *rsb;
10892+ int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
10893+
10894+ remote_lkid = get_int(buf, ptr);
10895+
10896+ rsb_rmid = get_int(buf, ptr);
10897+ rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
10898+ GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
10899+
10900+ /*
10901+ * We could have received this lkb already from a previous recovery
10902+ * that was interrupted. If so, just return the lkid to the remote
10903+ * node.
10904+ */
10905+ lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
10906+ if (lkb)
10907+ goto put_lkid;
10908+
10909+ lkb = create_lkb(rsb->res_ls);
10910+ if (!lkb)
10911+ goto out;
10912+
10913+ lkb->lkb_remid = remote_lkid;
10914+ lkb->lkb_flags = get_int(buf, ptr);
10915+ status = get_int(buf, ptr);
10916+ lkb->lkb_rqmode = get_char(buf, ptr);
10917+ lkb->lkb_grmode = get_char(buf, ptr);
10918+ atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
10919+
10920+ parentid = get_int(buf, ptr);
10921+ lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
10922+
10923+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10924+ lkb->lkb_lvbptr = allocate_lvb(ls);
10925+ if (!lkb->lkb_lvbptr)
10926+ goto out;
10927+ get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
10928+ }
10929+
10930+ if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
10931+ uint64_t start, end;
10932+
10933+ /* Don't need to keep the range flag, for comms use only */
10934+ lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
10935+ start = get_int64(buf, ptr);
10936+ end = get_int64(buf, ptr);
10937+
10938+ lkb->lkb_range = allocate_range(rsb->res_ls);
10939+ if (!lkb->lkb_range)
10940+ goto out;
10941+
10942+ switch (status) {
10943+ case GDLM_LKSTS_CONVERT:
10944+ lkb->lkb_range[RQ_RANGE_START] = start;
10945+ lkb->lkb_range[RQ_RANGE_END] = end;
10946+ start = get_int64(buf, ptr);
10947+ end = get_int64(buf, ptr);
10948+ lkb->lkb_range[GR_RANGE_START] = start;
10949+ lkb->lkb_range[GR_RANGE_END] = end;
 10950+ break;
10951+ case GDLM_LKSTS_WAITING:
10952+ lkb->lkb_range[RQ_RANGE_START] = start;
10953+ lkb->lkb_range[RQ_RANGE_END] = end;
10954+ break;
10955+
10956+ case GDLM_LKSTS_GRANTED:
10957+ lkb->lkb_range[GR_RANGE_START] = start;
10958+ lkb->lkb_range[GR_RANGE_END] = end;
10959+ break;
10960+ default:
10961+ GDLM_ASSERT(0,);
10962+ }
10963+ }
10964+
10965+ /* Resolve local lock LKB address from parent ID */
10966+ if (parentid)
10967+ lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
10968+ parentid);
10969+
10970+ atomic_inc(&rsb->res_ref);
10971+ lkb->lkb_resource = rsb;
10972+
10973+ lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
10974+ lkb->lkb_nodeid = rem_nodeid;
10975+
10976+ /*
10977+ * Put the lkb on an RSB queue. An lkb that's in the midst of a
10978+ * conversion request (on the requesting node's lockqueue and has
10979+ * LQCONVERT set) should be put on the granted queue. The convert
10980+ * request will be resent by the requesting node.
10981+ */
10982+
10983+ if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
10984+ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
10985+ GDLM_ASSERT(status == GDLM_LKSTS_CONVERT,
10986+ printk("status=%d\n", status););
10987+ lkb->lkb_rqmode = DLM_LOCK_IV;
10988+ status = GDLM_LKSTS_GRANTED;
10989+ }
10990+
10991+ lkb_enqueue(rsb, lkb, status);
10992+
10993+ /*
10994+ * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
10995+ */
10996+
10997+ if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
10998+ && lkb->lkb_grmode > DLM_LOCK_NL) {
10999+ if (!rsb->res_lvbptr)
11000+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
11001+ if (!rsb->res_lvbptr)
11002+ goto out;
11003+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11004+ }
11005+
11006+ /*
11007+ * Clear flags that may have been sent over that are only relevant in
11008+ * the context of the sender.
11009+ */
11010+
11011+ lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11012+ GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
4bf12011 11013+
11014+ put_lkid:
11015+ /* Return the new LKID to the caller's buffer */
11016+ put_int(lkb->lkb_id, outbuf, outoffp);
11017+ put_int(lkb->lkb_remid, outbuf, outoffp);
11018+ error = 0;
11019+
11020+ out:
11021+ return error;
11022+}
11023+
11024+static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb,
11025+ char *buf, int *ptr)
11026+{
11027+ int length;
11028+ int remasterid;
11029+ int parent_remasterid;
11030+ char name[DLM_RESNAME_MAXLEN];
11031+ int error;
11032+ gd_res_t *parent = NULL;
11033+ gd_res_t *rsb;
11034+
11035+ get_bytes(name, &length, buf, ptr);
11036+ remasterid = get_int(buf, ptr);
11037+ parent_remasterid = get_int(buf, ptr);
11038+
11039+ if (parent_remasterid)
11040+ parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11041+
11042+ /*
11043+ * The rsb reference from this find_or_create_rsb() will keep the rsb
11044+ * around while we add new lkb's to it from deserialise_lkb. Each of
11045+ * the lkb's will add an rsb reference. The reference added here is
11046+ * removed by release_rsb() after all lkb's are added.
11047+ */
11048+
11049+ error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
11050+ GDLM_ASSERT(!error,);
11051+
11052+ /* There is a case where the above needs to create the RSB. */
11053+ if (rsb->res_nodeid == -1)
11054+ rsb->res_nodeid = our_nodeid();
11055+
11056+ rsb->res_remasterid = remasterid;
11057+
11058+ return rsb;
11059+}
11060+
11061+/*
11062+ * Processing at the receiving end of a NEWLOCKS message from a node in
11063+ * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11064+ * node whose locks we are now mastering. For a reply we need to send back the
11065+ * new lockids of the remastered locks so that remote ops can find them.
11066+ */
11067+
11068+int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11069+{
11070+ gd_rcom_t *rc;
11071+ gd_res_t *rsb = NULL;
11072+ rebuild_node_t *rnode;
11073+ char *outbuf;
11074+ int outptr, ptr = 0, error = -ENOMEM;
11075+
11076+ rnode = find_rebuild_root(ls, nodeid);
11077+ if (!rnode)
11078+ goto out;
11079+
11080+ /*
11081+ * Allocate a buffer for the reply message which is a list of remote
11082+ * lock IDs and their (new) local lock ids. It will always be big
11083+ * enough to fit <n> ID pairs if it already fit <n> LKBs.
11084+ */
11085+
11086+ rc = allocate_rcom_buffer(ls);
11087+ if (!rc)
11088+ goto out;
11089+ outbuf = rc->rc_buf;
11090+ outptr = 0;
11091+
11092+ /*
11093+ * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
11094+ * created. Each deserialise_rsb adds an rsb reference that must be
11095+ * removed with release_rsb once all new lkb's for an rsb have been
11096+ * added.
11097+ */
11098+
11099+ while (ptr < len) {
11100+ int type;
11101+
11102+ type = get_char(buf, &ptr);
11103+
11104+ switch (type) {
11105+ case REMASTER_ROOTRSB:
11106+ if (rsb)
11107+ release_rsb(rsb);
11108+ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11109+ &ptr);
11110+ rnode->rootrsb = rsb;
11111+ break;
11112+
11113+ case REMASTER_RSB:
11114+ if (rsb)
11115+ release_rsb(rsb);
11116+ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11117+ &ptr);
11118+ break;
11119+
11120+ case REMASTER_LKB:
11121+ deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
11122+ outbuf, &outptr);
11123+ break;
11124+
11125+ default:
11126+ GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
11127+ "len=%d\n", type, nodeid, ptr,
11128+ len););
11129+ }
11130+ }
11131+
11132+ if (rsb)
11133+ release_rsb(rsb);
11134+
11135+ /*
11136+ * Reply with the new lock IDs.
11137+ */
11138+
11139+ rc->rc_datalen = outptr;
11140+ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
11141+
11142+ free_rcom_buffer(rc);
11143+
11144+ out:
11145+ return error;
11146+}
11147+
11148+/*
11149+ * Processing for a NEWLOCKIDS message. Called when we get the reply from the
11150+ * new master telling us what the new remote lock IDs are for the remastered
11151+ * locks
11152+ */
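+/*
+ * The payload is a sequence of little-endian int pairs written by
+ * deserialise_lkb() on the new master: the master's new lock id followed by
+ * our original local lock id, 8 bytes per remastered lock.
+ */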
11153+
11154+int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
11155+{
11156+ int offset = 0;
11157+
11158+ if (len == 1)
11159+ len = 0;
11160+
11161+ while (offset < len) {
11162+ int remote_id;
11163+ int local_id;
11164+ gd_lkb_t *lkb;
11165+
11166+ if (offset + 8 > len) {
11167+ log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
11168+ "length nodeid=%d offset=%d len=%d",
11169+ nodeid, offset, len);
11170+ break;
11171+ }
11172+
11173+ remote_id = get_int(buf, &offset);
11174+ local_id = get_int(buf, &offset);
11175+
11176+ lkb = find_lock_by_id(ls, local_id);
11177+ if (lkb) {
11178+ lkb->lkb_remid = remote_id;
11179+ have_new_lkid(lkb);
11180+ } else {
11181+ log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
11182+ "nodeid=%d id=%x remid=%x offset=%d len=%d",
11183+ nodeid, local_id, remote_id, offset, len);
11184+ }
11185+ }
11186+
11187+ if (recover_list_empty(ls))
11188+ wake_up(&ls->ls_wait_general);
11189+
11190+ return 0;
11191+}
11192diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
11193--- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11194+++ linux-patched/cluster/dlm/rebuild.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 11195@@ -0,0 +1,22 @@
11196+/******************************************************************************
11197+*******************************************************************************
11198+**
11199+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11200+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11201+**
11202+** This copyrighted material is made available to anyone wishing to use,
11203+** modify, copy, or redistribute it subject to the terms and conditions
11204+** of the GNU General Public License v.2.
11205+**
11206+*******************************************************************************
11207+******************************************************************************/
11208+
11209+#ifndef __REBUILD_DOT_H__
11210+#define __REBUILD_DOT_H__
11211+
11212+int rebuild_rsbs_send(gd_ls_t * ls);
11213+int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
11214+int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
 11215+void rebuild_freemem(gd_ls_t * ls);
11216+
11217+#endif /* __REBUILD_DOT_H__ */
11218diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
11219--- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11220+++ linux-patched/cluster/dlm/reccomms.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 11221@@ -0,0 +1,502 @@
11222+/******************************************************************************
11223+*******************************************************************************
11224+**
11225+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11226+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11227+**
11228+** This copyrighted material is made available to anyone wishing to use,
11229+** modify, copy, or redistribute it subject to the terms and conditions
11230+** of the GNU General Public License v.2.
11231+**
11232+*******************************************************************************
11233+******************************************************************************/
11234+
11235+#include "dlm_internal.h"
11236+#include "lowcomms.h"
11237+#include "midcomms.h"
11238+#include "reccomms.h"
11239+#include "nodes.h"
11240+#include "lockspace.h"
11241+#include "recover.h"
11242+#include "dir.h"
11243+#include "config.h"
11244+#include "rebuild.h"
11245+#include "memory.h"
11246+
11247+/* Running on the basis that only a single recovery communication will be done
11248+ * at a time per lockspace */
11249+
11250+static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc);
11251+
11252+/*
11253+ * Track per-node progress/stats during recovery to help debugging.
11254+ */
11255+
11256+void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send)
11257+{
11258+ gd_csb_t *csb;
11259+ int found = 0;
11260+
11261+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11262+ if (csb->csb_node->gn_nodeid == nodeid) {
11263+ found = TRUE;
11264+ break;
11265+ }
11266+ }
11267+
11268+ if (!found)
11269+ return;
11270+
11271+ if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
11272+ if (send) {
11273+ csb->csb_names_send_count++;
11274+ csb->csb_names_send_msgid = rc->rc_msgid;
11275+ } else {
11276+ csb->csb_names_recv_count++;
11277+ csb->csb_names_recv_msgid = rc->rc_msgid;
11278+ }
11279+ } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
11280+ if (send) {
11281+ csb->csb_locks_send_count++;
11282+ csb->csb_locks_send_msgid = rc->rc_msgid;
11283+ } else {
11284+ csb->csb_locks_recv_count++;
11285+ csb->csb_locks_recv_msgid = rc->rc_msgid;
11286+ }
11287+ }
11288+}
11289+
11290+void rcom_log_clear(gd_ls_t *ls)
11291+{
11292+ gd_csb_t *csb;
11293+
11294+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11295+ csb->csb_names_send_count = 0;
11296+ csb->csb_names_send_msgid = 0;
11297+ csb->csb_names_recv_count = 0;
11298+ csb->csb_names_recv_msgid = 0;
11299+ csb->csb_locks_send_count = 0;
11300+ csb->csb_locks_send_msgid = 0;
11301+ csb->csb_locks_recv_count = 0;
11302+ csb->csb_locks_recv_msgid = 0;
11303+ }
11304+}
11305+
11306+static int rcom_response(gd_ls_t *ls)
11307+{
11308+ return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11309+}
11310+
11311+/**
11312+ * rcom_send_message - send or request recovery data
11313+ * @ls: the lockspace
11314+ * @nodeid: node to which the message is sent
11315+ * @type: type of recovery message
11316+ * @rc: the rc buffer to send
11317+ * @need_reply: wait for reply if this is set
11318+ *
11319+ * Using this interface
11320+ * i) Allocate an rc buffer:
11321+ * rc = allocate_rcom_buffer(ls);
11322+ * ii) Copy data to send beginning at rc->rc_buf:
11323+ * memcpy(rc->rc_buf, mybuf, mylen);
11324+ * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
11325+ * rc->rc_datalen = mylen
11326+ * iv) Submit the rc to this function:
 11327+ * rcom_send_message(ls, nodeid, type, rc, need_reply);
11328+ *
11329+ * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t). If
11330+ * more data must be passed in one send, use rcom_expand_buffer() which
11331+ * incrementally increases the size of the rc buffer by dlm_config.buffer_size
11332+ * bytes.
11333+ *
 11334+ * Any data returned for the message (when need_reply is set) will be saved in
11335+ * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
11336+ * number of bytes copied into rc->rc_buf.
11337+ *
11338+ * Returns: 0 on success, -EXXX on failure
11339+ */
11340+
11341+int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc,
11342+ int need_reply)
11343+{
11344+ int error = 0;
11345+
11346+ if (!rc->rc_datalen)
11347+ rc->rc_datalen = 1;
11348+
11349+ /*
11350+ * Fill in the header.
11351+ */
11352+
11353+ rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
11354+ rc->rc_header.rh_lockspace = ls->ls_global_id;
11355+ rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1;
11356+ rc->rc_subcmd = type;
11357+ rc->rc_msgid = ++ls->ls_rcom_msgid;
11358+
11359+ rcom_log(ls, nodeid, rc, 1);
11360+
11361+ /*
11362+ * When a reply is received, the reply data goes back into this buffer.
11363+ * Synchronous rcom requests (need_reply=1) are serialised because of
11364+ * the single ls_rcom.
11365+ */
11366+
11367+ if (need_reply) {
11368+ down(&ls->ls_rcom_lock);
11369+ ls->ls_rcom = rc;
11370+ }
11371+
11372+ /*
11373+ * After sending the message we'll wait at the end of this function to
11374+ * get a reply. The READY flag will be set when the reply has been
11375+ * received and requested data has been copied into
11376+	 * ls->ls_rcom->rc_buf.
11377+ */
11378+
11379+ GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
11380+
11381+ /*
11382+ * The WAIT bit indicates that we're waiting for and willing to accept a
11383+ * reply. Any replies are ignored unless this bit is set.
11384+ */
11385+
11386+ set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11387+
11388+ /*
11389+ * Process the message locally.
11390+ */
11391+
11392+ if (nodeid == our_nodeid()) {
11393+ rcom_process_message(ls, nodeid, rc);
11394+ goto out;
11395+ }
11396+
11397+ /*
11398+ * Send the message.
11399+ */
11400+
11401+ log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
11402+
11403+ error = midcomms_send_message(nodeid, (struct gd_req_header *) rc,
11404+ GFP_KERNEL);
11405+ GDLM_ASSERT(error >= 0, printk("error = %d\n", error););
11406+ error = 0;
11407+
11408+ /*
11409+ * Wait for a reply. Once a reply is processed from midcomms, the
11410+ * READY bit will be set and we'll be awoken (gdlm_wait_function will
11411+ * return 0).
11412+ */
11413+
11414+ if (need_reply) {
11415+ error = gdlm_wait_function(ls, &rcom_response);
11416+ if (error)
11417+ log_debug(ls, "rcom wait error %d", error);
11418+ }
11419+
11420+ out:
11421+ clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11422+ clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11423+
11424+ if (need_reply)
11425+ up(&ls->ls_rcom_lock);
11426+
11427+ return error;
11428+}
11429+
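A minimal usage sketch of the rcom_send_message() interface documented above, following steps (i)-(iv). The caller name and the mybuf/mylen payload are hypothetical, RECCOMM_RECOVERNAMES is just one possible message type, and error handling is kept to the bare minimum.

/* Hypothetical caller illustrating steps (i)-(iv) above. */
static int example_rcom_request(gd_ls_t *ls, uint32_t nodeid,
				char *mybuf, int mylen)
{
	gd_rcom_t *rc;
	int error;

	rc = allocate_rcom_buffer(ls);		/* (i) allocate the rc */
	if (!rc)
		return -ENOMEM;

	/* mylen must fit in dlm_config.buffer_size - sizeof(gd_rcom_t) */
	memcpy(rc->rc_buf, mybuf, mylen);	/* (ii) copy the payload */
	rc->rc_datalen = mylen;			/* (iii) record its length */

	/* (iv) send and wait; the reply lands in rc->rc_buf / rc_datalen */
	error = rcom_send_message(ls, nodeid, RECCOMM_RECOVERNAMES, rc, 1);

	free_rcom_buffer(rc);
	return error;
}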
11430+/*
11431+ * Runs in same context as midcomms.
11432+ */
11433+
11434+static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc)
11435+{
11436+ gd_rcom_t rc_stack;
11437+ gd_rcom_t *reply = NULL;
11438+ gd_resdata_t *rd;
11439+ int status, datalen, maxlen;
11440+ uint32_t be_nodeid;
11441+
11442+ if (!ls)
11443+ return;
11444+
11445+ rcom_log(ls, nodeid, rc, 0);
11446+
11447+ if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
11448+ log_error(ls, "ignoring recovery message %x from %u",
11449+ rc->rc_subcmd, nodeid);
11450+ return;
11451+ }
11452+
11453+ switch (rc->rc_subcmd) {
11454+
11455+ case RECCOMM_STATUS:
11456+
11457+ memset(&rc_stack, 0, sizeof(gd_rcom_t));
11458+ reply = &rc_stack;
11459+
11460+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11461+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11462+ reply->rc_subcmd = rc->rc_subcmd;
11463+ reply->rc_msgid = rc->rc_msgid;
11464+ reply->rc_buf[0] = 0;
11465+
11466+ if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
11467+ reply->rc_buf[0] |= RESDIR_VALID;
11468+
11469+ if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
11470+ reply->rc_buf[0] |= RESDIR_ALL_VALID;
11471+
11472+ if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
11473+ reply->rc_buf[0] |= NODES_VALID;
11474+
11475+ if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
11476+ reply->rc_buf[0] |= NODES_ALL_VALID;
11477+
11478+ reply->rc_datalen = 1;
11479+ reply->rc_header.rh_length =
11480+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11481+
11482+ log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
11483+ break;
11484+
11485+ case RECCOMM_RECOVERNAMES:
11486+
11487+ reply = allocate_rcom_buffer(ls);
11488+ GDLM_ASSERT(reply,);
11489+ maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
11490+
11491+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11492+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11493+ reply->rc_subcmd = rc->rc_subcmd;
11494+ reply->rc_msgid = rc->rc_msgid;
11495+
11496+ /*
11497+ * The other node wants a bunch of resource names. The name of
11498+ * the resource to begin with is in rc->rc_buf.
11499+ */
11500+
11501+ datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
11502+ reply->rc_buf, maxlen, nodeid);
11503+
11504+ reply->rc_datalen = datalen;
11505+ reply->rc_header.rh_length =
11506+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11507+
11508+ log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
11509+ reply->rc_msgid);
11510+ break;
11511+
11512+ case RECCOMM_GETMASTER:
11513+
11514+ reply = allocate_rcom_buffer(ls);
11515+ GDLM_ASSERT(reply,);
11516+
11517+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11518+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11519+ reply->rc_subcmd = rc->rc_subcmd;
11520+ reply->rc_msgid = rc->rc_msgid;
11521+
11522+ /*
11523+ * The other node wants to know the master of a named resource.
11524+ */
11525+
11526+ status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen,
11527+ &rd, 1);
11528+ if (status != 0) {
11529+ free_rcom_buffer(reply);
11530+ reply = NULL;
11531+ return;
11532+ }
11533+ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
11534+ memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
11535+ reply->rc_datalen = sizeof(uint32_t);
11536+ reply->rc_header.rh_length =
11537+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11538+ break;
11539+
11540+ case RECCOMM_BULKLOOKUP:
11541+
11542+ reply = allocate_rcom_buffer(ls);
11543+ GDLM_ASSERT(reply,);
11544+
11545+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11546+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11547+ reply->rc_subcmd = rc->rc_subcmd;
11548+ reply->rc_msgid = rc->rc_msgid;
11549+
11550+ /*
11551+ * This is a bulk version of the above and just returns a
11552+		 * buffer full of node ids to match the resources.
11553+ */
11554+
11555+ datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
11556+ rc->rc_datalen, reply->rc_buf);
11557+ if (datalen < 0) {
11558+ free_rcom_buffer(reply);
11559+ reply = NULL;
11560+ return;
11561+ }
11562+
11563+ reply->rc_datalen = datalen;
11564+ reply->rc_header.rh_length =
11565+ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11566+ break;
11567+
11568+ /*
11569+ * These RECCOMM messages don't need replies.
11570+ */
11571+
11572+ case RECCOMM_NEWLOCKS:
11573+ rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11574+ break;
11575+
11576+ case RECCOMM_NEWLOCKIDS:
11577+ rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11578+ break;
11579+
11580+ case RECCOMM_REMRESDATA:
11581+ remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1);
11582+ break;
11583+
11584+ default:
11585+ GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
11586+ }
11587+
11588+ if (reply) {
11589+ if (nodeid == our_nodeid()) {
11590+ GDLM_ASSERT(rc == ls->ls_rcom,);
11591+ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11592+ rc->rc_datalen = reply->rc_datalen;
11593+ } else {
11594+ midcomms_send_message(nodeid,
11595+ (struct gd_req_header *) reply,
11596+ GFP_KERNEL);
11597+ }
11598+
11599+ if (reply != &rc_stack)
11600+ free_rcom_buffer(reply);
11601+ }
11602+}
11603+
11604+static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11605+{
11606+ gd_rcom_t *rc = ls->ls_rcom;
11607+
11608+ if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
11609+ log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
11610+ return;
11611+ }
11612+
11613+ if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
11614+ log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
11615+ reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
11616+ return;
11617+ }
11618+
11619+ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11620+ rc->rc_datalen = reply->rc_datalen;
11621+
11622+ /*
11623+ * Tell the thread waiting in rcom_send_message() that it can go ahead.
11624+ */
11625+
11626+ set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11627+ wake_up(&ls->ls_wait_general);
11628+}
11629+
11630+static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11631+{
11632+ restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
11633+ reply->rc_msgid);
11634+}
11635+
11636+/*
11637+ * Runs in same context as midcomms.
11638+ */
11639+
11640+static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
11641+{
11642+ if (gdlm_recovery_stopped(ls)) {
11643+ log_error(ls, "ignoring recovery reply %x from %u",
11644+ reply->rc_subcmd, nodeid);
11645+ return;
11646+ }
11647+
11648+ switch (reply->rc_subcmd) {
11649+ case RECCOMM_GETMASTER:
11650+ process_reply_async(ls, nodeid, reply);
11651+ break;
11652+ case RECCOMM_STATUS:
11653+ case RECCOMM_NEWLOCKS:
11654+ case RECCOMM_NEWLOCKIDS:
11655+ case RECCOMM_RECOVERNAMES:
11656+ process_reply_sync(ls, nodeid, reply);
11657+ break;
11658+ default:
11659+ log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
11660+ reply->rc_subcmd, nodeid);
11661+ }
11662+}
11663+
11664+
11665+static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header)
11666+{
11667+ struct writequeue_entry *wq;
11668+ gd_rcom_t *rc = (gd_rcom_t *) header;
11669+ gd_rcom_t *reply;
11670+
11671+ wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL,
11672+ (char **)&reply);
11673+ if (!wq)
11674+ return -ENOMEM;
11675+
11676+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11677+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11678+ reply->rc_subcmd = rc->rc_subcmd;
11679+ reply->rc_msgid = rc->rc_msgid;
11680+ reply->rc_buf[0] = 0;
11681+
11682+ reply->rc_datalen = 1;
11683+ reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1;
11684+
11685+ midcomms_send_buffer((struct gd_req_header *)reply, wq);
11686+ return 0;
11687+}
11688+
11689+
11690+/*
11691+ * Runs in same context as midcomms. Both recovery requests and recovery
11692+ * replies come through this function.
11693+ */
11694+
11695+void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header)
11696+{
11697+ gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace);
11698+ gd_rcom_t *rc = (gd_rcom_t *) header;
11699+
11700+	/* If the lockspace doesn't exist then still send a status message
11701+	   back; it's possible that it just doesn't have its global_id
11702+	   yet. */
11703+ if (!ls) {
11704+ send_ls_not_ready(nodeid, header);
11705+ return;
11706+ }
11707+
11708+ switch (header->rh_cmd) {
11709+ case GDLM_REMCMD_RECOVERMESSAGE:
11710+ down_read(&ls->ls_rec_rsblist);
11711+ rcom_process_message(ls, nodeid, rc);
11712+ up_read(&ls->ls_rec_rsblist);
11713+ break;
11714+
11715+ case GDLM_REMCMD_RECOVERREPLY:
11716+ rcom_process_reply(ls, nodeid, rc);
11717+ break;
11718+
11719+ default:
11720+ GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
11721+ }
11722+}
11723+
11724diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
11725--- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11726+++ linux-patched/cluster/dlm/reccomms.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 11727@@ -0,0 +1,37 @@
11728+/******************************************************************************
11729+*******************************************************************************
11730+**
11731+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11732+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11733+**
11734+** This copyrighted material is made available to anyone wishing to use,
11735+** modify, copy, or redistribute it subject to the terms and conditions
11736+** of the GNU General Public License v.2.
11737+**
11738+*******************************************************************************
11739+******************************************************************************/
11740+
11741+#ifndef __RECCOMMS_DOT_H__
11742+#define __RECCOMMS_DOT_H__
11743+
11744+/* Bit flags */
11745+
11746+#define RESDIR_VALID (1)
11747+#define RESDIR_ALL_VALID (2)
11748+#define NODES_VALID (4)
11749+#define NODES_ALL_VALID (8)
11750+
11751+#define RECCOMM_STATUS (1)
11752+#define RECCOMM_RECOVERNAMES (2)
11753+#define RECCOMM_GETMASTER (3)
11754+#define RECCOMM_BULKLOOKUP (4)
11755+#define RECCOMM_NEWLOCKS (5)
11756+#define RECCOMM_NEWLOCKIDS (6)
11757+#define RECCOMM_REMRESDATA (7)
11758+
11759+int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc,
11760+ int need_reply);
11761+void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header);
11762+void rcom_log_clear(gd_ls_t *ls);
11763+
11764+#endif
11765diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
11766--- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11767+++ linux-patched/cluster/dlm/recover.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 11768@@ -0,0 +1,632 @@
11769+/******************************************************************************
11770+*******************************************************************************
11771+**
11772+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11773+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11774+**
11775+** This copyrighted material is made available to anyone wishing to use,
11776+** modify, copy, or redistribute it subject to the terms and conditions
11777+** of the GNU General Public License v.2.
11778+**
11779+*******************************************************************************
11780+******************************************************************************/
11781+
11782+#include "dlm_internal.h"
11783+#include "reccomms.h"
11784+#include "dir.h"
11785+#include "locking.h"
11786+#include "rsb.h"
11787+#include "lockspace.h"
11788+#include "lkb.h"
11789+#include "nodes.h"
11790+#include "config.h"
11791+#include "ast.h"
11792+#include "memory.h"
11793+
11794+/*
11795+ * Called in recovery routines to check whether the recovery process has been
11796+ * interrupted/stopped by another transition. A recovery in-process will abort
11797+ * if the lockspace is "stopped" so that a new recovery process can start from
11798+ * the beginning when the lockspace is "started" again.
11799+ */
11800+
11801+int gdlm_recovery_stopped(gd_ls_t *ls)
11802+{
11803+ return test_bit(LSFL_LS_STOP, &ls->ls_flags);
11804+}
11805+
11806+static void gdlm_wait_timer_fn(unsigned long data)
11807+{
11808+ gd_ls_t *ls = (gd_ls_t *) data;
11809+
11810+ wake_up(&ls->ls_wait_general);
11811+}
11812+
11813+/*
11814+ * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
11815+ * set due to failure of a node in ls_nodes). When another function thinks it
11816+ * could have completed the waited-on task, it should wake up ls_wait_general
11817+ * to get an immediate response rather than waiting for the timer to detect the
11818+ * result. A timer wakes us up periodically while waiting to see if we should
11819+ * abort due to a node failure.
11820+ */
11821+
11822+int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls))
11823+{
11824+ struct timer_list timer;
11825+ int error = 0;
11826+
11827+ init_timer(&timer);
11828+ timer.function = gdlm_wait_timer_fn;
11829+ timer.data = (long) ls;
11830+
11831+ for (;;) {
11832+ mod_timer(&timer, jiffies + (5 * HZ));
11833+
11834+ wchan_cond_sleep_intr(ls->ls_wait_general,
11835+ !testfn(ls) &&
11836+ !test_bit(LSFL_LS_STOP, &ls->ls_flags));
11837+
11838+ if (timer_pending(&timer))
11839+ del_timer(&timer);
11840+
11841+ if (testfn(ls))
11842+ break;
11843+
11844+ if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
11845+ error = -1;
11846+ break;
11847+ }
11848+ }
11849+
11850+ return error;
11851+}
11852+
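As a sketch of the wait pattern described above, a caller supplies a test function over the lockspace flags and blocks in gdlm_wait_function() until the condition becomes true or the lockspace is stopped. The function names below are hypothetical; the condition bit shown (LSFL_RECCOMM_READY) is the one reccomms.c actually waits on.

/* Hypothetical test function: done once the awaited reply has arrived. */
static int example_condition(gd_ls_t *ls)
{
	return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
}

/* Returns 0 once example_condition() is true, -1 if the lockspace stops;
 * other code wakes ls_wait_general to get the condition re-tested early. */
static int example_wait(gd_ls_t *ls)
{
	return gdlm_wait_function(ls, &example_condition);
}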
11853+int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status)
11854+{
11855+ gd_rcom_t rc_stack, *rc;
11856+ gd_csb_t *csb;
11857+ int status;
11858+ int error = 0;
11859+
11860+ memset(&rc_stack, 0, sizeof(gd_rcom_t));
11861+ rc = &rc_stack;
11862+ rc->rc_datalen = 0;
11863+
11864+ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
11865+ for (;;) {
11866+ error = gdlm_recovery_stopped(ls);
11867+ if (error)
11868+ goto out;
11869+
11870+ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
11871+ RECCOMM_STATUS, rc, 1);
11872+ if (error)
11873+ goto out;
11874+
11875+ status = rc->rc_buf[0];
11876+ if (status & wait_status)
11877+ break;
11878+ else {
11879+ set_current_state(TASK_INTERRUPTIBLE);
11880+ schedule_timeout(HZ >> 1);
11881+ }
11882+ }
11883+ }
11884+
11885+ out:
11886+ return error;
11887+}
11888+
11889+int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status)
11890+{
11891+ gd_rcom_t rc_stack, *rc;
11892+ uint32_t nodeid = ls->ls_low_nodeid;
11893+ int status;
11894+ int error = 0;
11895+
11896+ memset(&rc_stack, 0, sizeof(gd_rcom_t));
11897+ rc = &rc_stack;
11898+ rc->rc_datalen = 0;
11899+
11900+ for (;;) {
11901+ error = gdlm_recovery_stopped(ls);
11902+ if (error)
11903+ goto out;
11904+
11905+ error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
11906+ if (error)
11907+ break;
11908+
11909+ status = rc->rc_buf[0];
11910+ if (status & wait_status)
11911+ break;
11912+ else {
11913+ set_current_state(TASK_INTERRUPTIBLE);
11914+ schedule_timeout(HZ >> 1);
11915+ }
11916+ }
11917+
11918+ out:
11919+ return error;
11920+}
11921+
11922+static int purge_queue(gd_ls_t *ls, struct list_head *queue)
11923+{
11924+ gd_lkb_t *lkb, *safe;
11925+ gd_res_t *rsb;
11926+ int count = 0;
11927+
11928+ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
11929+ if (!lkb->lkb_nodeid)
11930+ continue;
11931+
11932+ GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
11933+
11934+ if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
11935+ list_del(&lkb->lkb_statequeue);
11936+
11937+ rsb = lkb->lkb_resource;
11938+			/* check the pre-purge status before clearing it */
11939+			if (lkb->lkb_status == GDLM_LKSTS_CONVERT)
11940+				remove_from_deadlockqueue(lkb);
11941+
11942+			lkb->lkb_status = 0;
11943+
11944+ release_lkb(ls, lkb);
11945+ release_rsb(rsb);
11946+ count++;
11947+ }
11948+ }
11949+
11950+ return count;
11951+}
11952+
11953+/*
11954+ * Go through local restbl and for each rsb we're master of, clear out any
11955+ * lkb's held by departed nodes.
11956+ */
11957+
11958+int restbl_lkb_purge(gd_ls_t *ls)
11959+{
11960+ struct list_head *tmp2, *safe2;
11961+ int count = 0;
11962+ gd_res_t *rootrsb, *safe, *rsb;
11963+
11964+ log_all(ls, "purge locks of departed nodes");
11965+
11966+ list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
11967+
11968+ rootrsb->res_resdir_seq = 1;
11969+
11970+ if (rootrsb->res_nodeid)
11971+ continue;
11972+
11973+ hold_rsb(rootrsb);
11974+ down_write(&rootrsb->res_lock);
11975+
11976+ /* This traverses the subreslist in reverse order so we purge
11977+ * the children before their parents. */
11978+
11979+ for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
11980+ tmp2 != &rootrsb->res_subreslist;
11981+ tmp2 = safe2, safe2 = safe2->prev) {
11982+ rsb = list_entry(tmp2, gd_res_t, res_subreslist);
11983+
11984+ hold_rsb(rsb);
11985+ purge_queue(ls, &rsb->res_grantqueue);
11986+ purge_queue(ls, &rsb->res_convertqueue);
11987+ purge_queue(ls, &rsb->res_waitqueue);
11988+ release_rsb(rsb);
11989+ }
11990+ count += purge_queue(ls, &rootrsb->res_grantqueue);
11991+ count += purge_queue(ls, &rootrsb->res_convertqueue);
11992+ count += purge_queue(ls, &rootrsb->res_waitqueue);
11993+
11994+ up_write(&rootrsb->res_lock);
11995+ release_rsb(rootrsb);
11996+ }
11997+
11998+ log_all(ls, "purged %d locks", count);
11999+
12000+ return 0;
12001+}
12002+
12003+/*
12004+ * Grant any locks that have become grantable after a purge
12005+ */
12006+
12007+int restbl_grant_after_purge(gd_ls_t *ls)
12008+{
12009+ gd_res_t *root, *rsb, *safe;
12010+ int error = 0;
12011+
12012+ down_write(&ls->ls_gap_rsblist);
12013+
12014+ list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12015+ /* only the rsb master grants locks */
12016+ if (root->res_nodeid)
12017+ continue;
12018+
12019+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12020+ log_debug(ls, "restbl_grant_after_purge aborted");
12021+ error = -EINTR;
12022+ up_write(&ls->ls_gap_rsblist);
12023+ goto out;
12024+ }
12025+
12026+ down_write(&root->res_lock);
12027+ grant_pending_locks(root);
12028+ up_write(&root->res_lock);
12029+
12030+ list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12031+ down_write(&rsb->res_lock);
12032+ grant_pending_locks(rsb);
12033+ up_write(&rsb->res_lock);
12034+ }
12035+ }
12036+ up_write(&ls->ls_gap_rsblist);
12037+ wake_astd();
12038+ out:
12039+ return error;
12040+}
12041+
12042+/*
12043+ * Set the lock master for all LKBs in a lock queue
12044+ */
12045+
12046+static void set_lock_master(struct list_head *queue, int nodeid)
12047+{
12048+ gd_lkb_t *lkb;
12049+
12050+ list_for_each_entry(lkb, queue, lkb_statequeue) {
12051+		/* Don't muck around with pre-existing sublocks */
12052+ if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12053+ lkb->lkb_nodeid = nodeid;
12054+ }
12055+}
12056+
12057+static void set_master_lkbs(gd_res_t *rsb)
12058+{
12059+ set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12060+ set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12061+ set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12062+}
12063+
12064+/*
12065+ * This rsb struct is now the master, so it is responsible for keeping the
12066+ * latest lvb. Find whether any current lkb's have an up-to-date copy of the
12067+ * lvb to be used as the rsb's copy. An equivalent step occurs as new lkb's
12068+ * arrive for this rsb in deserialise_lkb.
12069+ */
12070+
12071+static void set_rsb_lvb(gd_res_t *rsb)
12072+{
12073+ gd_lkb_t *lkb;
12074+
12075+ list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12076+
12077+ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12078+ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12079+ (lkb->lkb_grmode > DLM_LOCK_NL))
12080+ {
12081+ if (!rsb->res_lvbptr)
12082+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12083+
12084+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12085+ return;
12086+ }
12087+ }
12088+
12089+ list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12090+
12091+ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12092+ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12093+ (lkb->lkb_grmode > DLM_LOCK_NL))
12094+ {
12095+ if (!rsb->res_lvbptr)
12096+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12097+
12098+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12099+ return;
12100+ }
12101+ }
12102+}
12103+
12104+/*
12105+ * Propagate the new master nodeid to locks, subrsbs, sublocks.
12106+ * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12107+ */
12108+
12109+static void set_new_master(gd_res_t *rsb)
12110+{
12111+ gd_res_t *subrsb;
12112+
12113+ down_write(&rsb->res_lock);
12114+
12115+ if (rsb->res_nodeid == our_nodeid()) {
12116+ rsb->res_nodeid = 0;
12117+ set_rsb_lvb(rsb);
12118+ }
12119+
12120+ set_master_lkbs(rsb);
12121+
12122+ list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
12123+ subrsb->res_nodeid = rsb->res_nodeid;
12124+ set_master_lkbs(subrsb);
12125+ }
12126+
12127+ up_write(&rsb->res_lock);
12128+
12129+ set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
12130+}
12131+
12132+/*
12133+ * The recover_list contains all the rsb's for which we've requested the new
12134+ * master nodeid. As replies are returned from the resource directories the
12135+ * rsb's are removed from the list. When the list is empty we're done.
12136+ *
12137+ * The recover_list is later similarly used for all rsb's for which we've sent
12138+ * new lkb's and need to receive new corresponding lkid's.
12139+ */
12140+
12141+int recover_list_empty(gd_ls_t *ls)
12142+{
12143+ int empty;
12144+
12145+ spin_lock(&ls->ls_recover_list_lock);
12146+ empty = list_empty(&ls->ls_recover_list);
12147+ spin_unlock(&ls->ls_recover_list_lock);
12148+
12149+ return empty;
12150+}
12151+
12152+int recover_list_count(gd_ls_t *ls)
12153+{
12154+ int count;
12155+
12156+ spin_lock(&ls->ls_recover_list_lock);
12157+ count = ls->ls_recover_list_count;
12158+ spin_unlock(&ls->ls_recover_list_lock);
12159+
12160+ return count;
12161+}
12162+
12163+void recover_list_add(gd_res_t *rsb)
12164+{
12165+ gd_ls_t *ls = rsb->res_ls;
12166+
12167+ spin_lock(&ls->ls_recover_list_lock);
12168+ if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
12169+ list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
12170+ ls->ls_recover_list_count++;
12171+ hold_rsb(rsb);
12172+ }
12173+ spin_unlock(&ls->ls_recover_list_lock);
12174+}
12175+
12176+void recover_list_del(gd_res_t *rsb)
12177+{
12178+ gd_ls_t *ls = rsb->res_ls;
12179+
12180+ spin_lock(&ls->ls_recover_list_lock);
12181+ clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
12182+ list_del(&rsb->res_recover_list);
12183+ ls->ls_recover_list_count--;
12184+ spin_unlock(&ls->ls_recover_list_lock);
12185+
12186+ release_rsb(rsb);
12187+}
12188+
12189+static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid)
12190+{
12191+ gd_res_t *rsb = NULL;
12192+
12193+ spin_lock(&ls->ls_recover_list_lock);
12194+
12195+ list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
12196+ if (rsb->res_recover_msgid == msgid)
12197+ goto rec_found;
12198+ }
12199+ rsb = NULL;
12200+
12201+ rec_found:
12202+ spin_unlock(&ls->ls_recover_list_lock);
12203+ return rsb;
12204+}
12205+
12206+#if 0
12207+static void recover_list_clear(gd_ls_t *ls)
12208+{
12209+ gd_res_t *rsb;
12210+
12211+
12212+ spin_lock(&ls->ls_recover_list_lock);
12213+
12214+ while (!list_empty(&ls->ls_recover_list)) {
12215+ rsb = list_entry(ls->ls_recover_list.next, gd_res_t,
12216+ res_recover_list);
12217+ list_del(&rsb->res_recover_list);
12218+ ls->ls_recover_list_count--;
12219+ }
12220+ spin_unlock(&ls->ls_recover_list_lock);
12221+
12222+}
12223+#endif
12224+
12225+#if 0
12226+void recover_list_dump(gd_ls_t *ls)
12227+{
12228+ struct list_head *tmp;
12229+ gd_res_t *rsb;
12230+
12231+ spin_lock(&ls->ls_recover_list_lock);
12232+
12233+ printk("recover_list_count=%d\n", ls->ls_recover_list_count);
12234+
12235+ list_for_each(tmp, &ls->ls_recover_list) {
12236+ rsb = list_entry(tmp, gd_res_t, res_recover_list);
12237+ gdlm_res_dbprint(rsb);
12238+ }
12239+ spin_unlock(&ls->ls_recover_list_lock);
12240+}
12241+#endif
12242+
12243+static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc)
12244+{
12245+ gd_ls_t *ls = rsb->res_ls;
12246+ gd_resdata_t *rd;
12247+ uint32_t dir_nodeid;
12248+ int error;
12249+
12250+ dir_nodeid = get_directory_nodeid(rsb);
12251+
12252+ if (dir_nodeid == our_nodeid()) {
12253+ error = get_resdata(ls, dir_nodeid, rsb->res_name,
12254+ rsb->res_length, &rd, 1);
12255+ if (error)
12256+ goto fail;
12257+
12258+ rsb->res_nodeid = rd->rd_master_nodeid;
12259+ set_new_master(rsb);
12260+ } else {
12261+		/* As we are the only thread doing recovery this
12262+		   should be safe; if not, then we need to use a different
12263+		   ID somehow. We must set it in the rsb before
12264+		   rcom_send_message() completes because we may get a reply
12265+		   quite quickly. */
12266+ rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
12267+
12268+ recover_list_add(rsb);
12269+
12270+ memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
12271+ rc->rc_datalen = rsb->res_length;
12272+
12273+ error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
12274+ rc, 0);
12275+ if (error)
12276+ goto fail;
12277+ }
12278+
12279+ fail:
12280+ return error;
12281+}
12282+
12283+/*
12284+ * Go through local root resources and for each rsb which has a master which
12285+ * has departed, get the new master nodeid from the resdir. The resdir will
12286+ * assign mastery to the first node to look up the new master. That means
12287+ * we'll discover in this lookup if we're the new master of any rsb's.
12288+ *
12289+ * We fire off all the resdir requests individually and asynchronously to the
12290+ * correct resdir node. The replies are processed in rsb_master_recv().
12291+ */
12292+
12293+int restbl_rsb_update(gd_ls_t *ls)
12294+{
12295+ gd_res_t *rsb, *safe;
12296+ gd_rcom_t *rc;
12297+ int error = -ENOMEM;
12298+ int count = 0;
12299+
12300+ log_all(ls, "update remastered resources");
12301+
12302+ rc = allocate_rcom_buffer(ls);
12303+ if (!rc)
12304+ goto out;
12305+
12306+ list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
12307+ if (!rsb->res_nodeid)
12308+ continue;
12309+
12310+ error = gdlm_recovery_stopped(ls);
12311+ if (error)
12312+ goto out_free;
12313+
12314+ if (in_nodes_gone(ls, rsb->res_nodeid)) {
12315+ error = rsb_master_lookup(rsb, rc);
12316+ if (error)
12317+ goto out_free;
12318+ count++;
12319+ }
12320+ }
12321+
12322+ error = gdlm_wait_function(ls, &recover_list_empty);
12323+
12324+ log_all(ls, "updated %d resources", count);
12325+
12326+ out_free:
12327+ free_rcom_buffer(rc);
12328+
12329+ out:
12330+ return error;
12331+}
12332+
12333+int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length,
12334+ int msgid)
12335+{
12336+ gd_res_t *rsb;
12337+ uint32_t be_nodeid;
12338+
12339+ rsb = recover_list_find(ls, msgid);
12340+ if (!rsb) {
12341+ log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
12342+ goto out;
12343+ }
12344+
12345+ memcpy(&be_nodeid, buf, sizeof(uint32_t));
12346+ rsb->res_nodeid = be32_to_cpu(be_nodeid);
12347+ set_new_master(rsb);
12348+ recover_list_del(rsb);
12349+
12350+ if (recover_list_empty(ls))
12351+ wake_up(&ls->ls_wait_general);
12352+
12353+ out:
12354+ return 0;
12355+}
12356+
12357+/*
12358+ * This function is no longer used.
12359+ */
12360+
12361+int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen,
12362+ char *outbuf)
12363+{
12364+ char *inbufptr, *outbufptr;
12365+
12366+ /*
12367+ * The other node wants nodeids matching the resource names in inbuf.
12368+ * The resource names are packed into inbuf as
12369+ * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
12370+ * lenX bytes. Matching nodeids are packed into outbuf in order
12371+ * [nodeid1][nodeid2]...
12372+ */
12373+
12374+ inbufptr = inbuf;
12375+ outbufptr = outbuf;
12376+
12377+ while (inbufptr < inbuf + inlen) {
12378+ gd_resdata_t *rd;
12379+ uint32_t be_nodeid;
12380+ int status;
12381+
12382+ status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr,
12383+ &rd, 1);
12384+ if (status != 0)
12385+ goto fail;
12386+
12387+ inbufptr += *inbufptr + 1;
12388+
12389+ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
12390+ memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
12391+ outbufptr += sizeof(uint32_t);
12392+
12393+ /* add assertion that outbufptr - outbuf is not > than ... */
12394+ }
12395+
12396+ return (outbufptr - outbuf);
12397+
12398+ fail:
12399+ return -1;
12400+}
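Although the bulk lookup path above is marked as no longer used, the [len1][name1][len2][name2]... layout it consumes illustrates how such recovery payloads are packed: one length byte followed by that many name bytes, repeated. The helpers and resource names below are hypothetical, for illustration only.

/* Pack one name as a single length byte followed by the name bytes. */
static int example_pack_name(char *buf, char *name, uint8_t len)
{
	buf[0] = len;
	memcpy(buf + 1, name, len);
	return len + 1;			/* bytes consumed in buf */
}

/* Pack two made-up resource names; the result would become rc->rc_datalen. */
static int example_pack_lookup(char *buf)
{
	int offset = 0;

	offset += example_pack_name(buf + offset, "resourceA", 9);
	offset += example_pack_name(buf + offset, "resourceB", 9);
	return offset;
}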
12401diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
12402--- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12403+++ linux-patched/cluster/dlm/recover.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 12404@@ -0,0 +1,34 @@
12405+/******************************************************************************
12406+*******************************************************************************
12407+**
12408+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12409+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12410+**
12411+** This copyrighted material is made available to anyone wishing to use,
12412+** modify, copy, or redistribute it subject to the terms and conditions
12413+** of the GNU General Public License v.2.
12414+**
12415+*******************************************************************************
12416+******************************************************************************/
12417+
12418+#ifndef __RECOVER_DOT_H__
12419+#define __RECOVER_DOT_H__
12420+
12421+int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls));
12422+int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status);
12423+int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status);
12424+int gdlm_recovery_stopped(gd_ls_t * ls);
12425+int recover_list_empty(gd_ls_t * ls);
12426+int recover_list_count(gd_ls_t * ls);
12427+void recover_list_add(gd_res_t * rsb);
12428+void recover_list_del(gd_res_t * rsb);
12429+void recover_list_dump(gd_ls_t * ls);
12430+int restbl_lkb_purge(gd_ls_t * ls);
12431+void restbl_grant_after_purge(gd_ls_t * ls);
12432+int restbl_rsb_update(gd_ls_t * ls);
12433+int restbl_rsb_update_recv(gd_ls_t * ls, int nodeid, char *buf, int len,
12434+ int msgid);
12435+int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen,
12436+ char *outbuf);
12437+
12438+#endif /* __RECOVER_DOT_H__ */
12439diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
12440--- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12441+++ linux-patched/cluster/dlm/recoverd.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 12442@@ -0,0 +1,692 @@
12443+/******************************************************************************
12444+*******************************************************************************
12445+**
12446+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12447+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12448+**
12449+** This copyrighted material is made available to anyone wishing to use,
12450+** modify, copy, or redistribute it subject to the terms and conditions
12451+** of the GNU General Public License v.2.
12452+**
12453+*******************************************************************************
12454+******************************************************************************/
12455+
12456+#include "dlm_internal.h"
12457+#include "nodes.h"
12458+#include "dir.h"
12459+#include "ast.h"
12460+#include "recover.h"
12461+#include "lockspace.h"
12462+#include "lowcomms.h"
12463+#include "lockqueue.h"
12464+#include "lkb.h"
12465+#include "rebuild.h"
12466+
12467+/*
12468+ * next_move actions
12469+ */
12470+
12471+#define DO_STOP (1)
12472+#define DO_START (2)
12473+#define DO_FINISH (3)
12474+#define DO_FINISH_STOP (4)
12475+#define DO_FINISH_START (5)
12476+
12477+/*
12478+ * recoverd_flags for thread
12479+ */
12480+
12481+#define THREAD_STOP (0)
12482+
12483+/*
12484+ * local thread variables
12485+ */
12486+
12487+static unsigned long recoverd_flags;
12488+static struct completion recoverd_run;
12489+static wait_queue_head_t recoverd_wait;
12490+static struct task_struct *recoverd_task;
12491+
12492+/*
12493+ * Queue of lockspaces (gd_recover_t structs) which need to be
12494+ * started/recovered
12495+ */
12496+
12497+static struct list_head recoverd_start_queue;
12498+static atomic_t recoverd_start_count;
12499+
12500+extern struct list_head lslist;
12501+extern spinlock_t lslist_lock;
12502+
12503+void dlm_recoverd_init(void)
12504+{
12505+ INIT_LIST_HEAD(&recoverd_start_queue);
12506+ atomic_set(&recoverd_start_count, 0);
12507+
12508+ init_completion(&recoverd_run);
12509+ init_waitqueue_head(&recoverd_wait);
12510+ memset(&recoverd_flags, 0, sizeof(unsigned long));
12511+}
12512+
12513+static int enable_locking(gd_ls_t *ls, int event_id)
12514+{
12515+ int error = 0;
12516+
12517+ spin_lock(&ls->ls_recover_lock);
12518+ if (ls->ls_last_stop < event_id) {
12519+ set_bit(LSFL_LS_RUN, &ls->ls_flags);
12520+ up_write(&ls->ls_in_recovery);
12521+ } else {
12522+ error = -EINTR;
12523+ log_debug(ls, "enable_locking: abort %d", event_id);
12524+ }
12525+ spin_unlock(&ls->ls_recover_lock);
12526+ return error;
12527+}
12528+
12529+static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr)
12530+{
12531+ int error;
12532+
12533+ log_all(ls, "recover event %u (first)", gr->gr_event_id);
12534+
12535+ kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
12536+
12537+ error = ls_nodes_init(ls, gr);
12538+ if (error) {
12539+ log_error(ls, "nodes_init failed %d", error);
12540+ goto out;
12541+ }
12542+
12543+ error = resdir_rebuild_local(ls);
12544+ if (error) {
12545+ log_error(ls, "resdir_rebuild_local failed %d", error);
12546+ goto out;
12547+ }
12548+
12549+ error = resdir_rebuild_wait(ls);
12550+ if (error) {
12551+ log_error(ls, "resdir_rebuild_wait failed %d", error);
12552+ goto out;
12553+ }
12554+
12555+ log_all(ls, "recover event %u done", gr->gr_event_id);
12556+ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12557+
12558+ out:
12559+ return error;
12560+}
12561+
12562+/*
12563+ * We are given here a new group of nodes which are in the lockspace. We first
12564+ * figure out the differences in ls membership from when we were last running.
12565+ * If nodes from before are gone, then there will be some lock recovery to do.
12566+ * If there are only nodes which have joined, then there's no lock recovery.
12567+ *
12568+ * note: cman requires an rc to finish starting on an revent (where nodes die)
12569+ * before it allows an sevent (where nodes join) to be processed. This means
12570+ * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
12571+ * joined.
12572+ */
12573+
12574+static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr)
12575+{
12576+ int error, neg = 0;
12577+
12578+ log_all(ls, "recover event %u", gr->gr_event_id);
12579+
12580+ /*
12581+ * Add or remove nodes from the lockspace's ls_nodes list.
12582+ */
12583+
12584+ error = ls_nodes_reconfig(ls, gr, &neg);
12585+ if (error) {
12586+ log_error(ls, "nodes_reconfig failed %d", error);
12587+ goto fail;
12588+ }
12589+
12590+ /*
12591+ * Rebuild our own share of the resdir by collecting from all other
12592+ * nodes rsb name/master pairs for which the name hashes to us.
12593+ */
12594+
12595+ error = resdir_rebuild_local(ls);
12596+ if (error) {
12597+ log_error(ls, "resdir_rebuild_local failed %d", error);
12598+ goto fail;
12599+ }
12600+
12601+ /*
12602+ * Purge resdir-related requests that are being held in requestqueue.
12603+ * All resdir requests from before recovery started are invalid now due
12604+ * to the resdir rebuild and will be resent by the requesting nodes.
12605+ */
12606+
12607+ purge_requestqueue(ls);
12608+ set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12609+
12610+ /*
12611+ * Wait for all nodes to complete resdir rebuild.
12612+ */
12613+
12614+ error = resdir_rebuild_wait(ls);
12615+ if (error) {
12616+ log_error(ls, "resdir_rebuild_wait failed %d", error);
12617+ goto fail;
12618+ }
12619+
12620+ /*
12621+ * Mark our own lkb's waiting in the lockqueue for remote replies from
12622+ * nodes that are now departed. These will be resent to the new
12623+ * masters in resend_cluster_requests. Also mark resdir lookup
12624+ * requests for resending.
12625+ */
12626+
12627+ lockqueue_lkb_mark(ls);
12628+
12629+ error = gdlm_recovery_stopped(ls);
12630+ if (error)
12631+ goto fail;
12632+
12633+ if (neg) {
12634+ /*
12635+ * Clear lkb's for departed nodes. This can't fail since it
12636+ * doesn't involve communicating with other nodes.
12637+ */
12638+
12639+ down_write(&ls->ls_rec_rsblist);
12640+ restbl_lkb_purge(ls);
12641+ up_write(&ls->ls_rec_rsblist);
12642+
12643+ down_read(&ls->ls_rec_rsblist);
12644+
12645+ /*
12646+ * Get new master id's for rsb's of departed nodes. This fails
12647+ * if we can't communicate with other nodes.
12648+ */
12649+
12650+ error = restbl_rsb_update(ls);
12651+ if (error) {
12652+ log_error(ls, "restbl_rsb_update failed %d", error);
12653+ goto fail_up;
12654+ }
12655+
12656+ /*
12657+ * Send our lkb info to new masters. This fails if we can't
12658+ * communicate with a node.
12659+ */
12660+
12661+ error = rebuild_rsbs_send(ls);
12662+ if (error) {
12663+ log_error(ls, "rebuild_rsbs_send failed %d", error);
12664+ goto fail_up;
12665+ }
12666+ up_read(&ls->ls_rec_rsblist);
12667+ }
12668+
12669+ clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12670+
12671+ log_all(ls, "recover event %u done", gr->gr_event_id);
12672+ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
12673+ return 0;
12674+
12675+ fail_up:
12676+ up_read(&ls->ls_rec_rsblist);
12677+ fail:
12678+ log_all(ls, "recover event %d error %d", gr->gr_event_id, error);
12679+ return error;
12680+}
12681+
12682+static void clear_finished_nodes(gd_ls_t *ls, int finish_event)
12683+{
12684+ gd_csb_t *csb, *safe;
12685+
12686+ list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) {
12687+ if (csb->csb_gone_event <= finish_event) {
12688+ list_del(&csb->csb_list);
12689+ release_csb(csb);
12690+ }
12691+ }
12692+}
12693+
12694+/*
12695+ * Between calls to this routine for a ls, there can be multiple stop/start
12696+ * events from cman where every start but the latest is cancelled by stops.
12697+ * There can only be a single finish from cman because every finish requires us
12698+ * to call start_done. A single finish event could be followed by multiple
12699+ * stop/start events. This routine takes any combination of events from cman
12700+ * and boils them down to one course of action.
12701+ */
12702+
12703+int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out)
12704+{
12705+ LIST_HEAD(events);
12706+ unsigned int cmd = 0, stop, start, finish;
12707+ unsigned int last_stop, last_start, last_finish;
12708+ gd_recover_t *gr = NULL, *start_gr = NULL;
12709+
12710+ /*
12711+ * Grab the current state of cman/sm events.
12712+ */
12713+
12714+ spin_lock(&ls->ls_recover_lock);
12715+
12716+ stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
12717+ start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
12718+ finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
12719+
12720+ last_stop = ls->ls_last_stop;
12721+ last_start = ls->ls_last_start;
12722+ last_finish = ls->ls_last_finish;
12723+
12724+ while (!list_empty(&ls->ls_recover)) {
12725+ gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list);
12726+ list_del(&gr->gr_list);
12727+ list_add_tail(&gr->gr_list, &events);
12728+ }
12729+ spin_unlock(&ls->ls_recover_lock);
12730+
12731+ log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
12732+ last_stop, last_start, last_finish);
12733+
12734+ /*
12735+ * Toss start events which have since been cancelled.
12736+ */
12737+
12738+ while (!list_empty(&events)) {
12739+ GDLM_ASSERT(start,);
12740+ gr = list_entry(events.next, gd_recover_t, gr_list);
12741+ list_del(&gr->gr_list);
12742+
12743+ if (gr->gr_event_id <= last_stop) {
12744+ log_debug(ls, "move skip event %u", gr->gr_event_id);
12745+ kfree(gr->gr_nodeids);
12746+ free_dlm_recover(gr);
12747+ gr = NULL;
12748+ } else {
12749+ log_debug(ls, "move use event %u", gr->gr_event_id);
12750+ GDLM_ASSERT(!start_gr,);
12751+ start_gr = gr;
12752+ }
12753+ }
12754+
12755+ /*
12756+ * Eight possible combinations of events.
12757+ */
12758+
12759+ /* 0 */
12760+ if (!stop && !start && !finish) {
12761+ GDLM_ASSERT(!start_gr,);
12762+ cmd = 0;
12763+ goto out;
12764+ }
12765+
12766+ /* 1 */
12767+ if (!stop && !start && finish) {
12768+ GDLM_ASSERT(!start_gr,);
12769+ GDLM_ASSERT(last_start > last_stop,);
12770+ GDLM_ASSERT(last_finish == last_start,);
12771+ cmd = DO_FINISH;
12772+ *finish_out = last_finish;
12773+ goto out;
12774+ }
12775+
12776+ /* 2 */
12777+ if (!stop && start && !finish) {
12778+ GDLM_ASSERT(start_gr,);
12779+ GDLM_ASSERT(last_start > last_stop,);
12780+ cmd = DO_START;
12781+ *gr_out = start_gr;
12782+ goto out;
12783+ }
12784+
12785+ /* 3 */
12786+ if (!stop && start && finish) {
12787+ GDLM_ASSERT(0, printk("finish and start with no stop\n"););
12788+ }
12789+
12790+ /* 4 */
12791+ if (stop && !start && !finish) {
12792+ GDLM_ASSERT(!start_gr,);
12793+ GDLM_ASSERT(last_start == last_stop,);
12794+ cmd = DO_STOP;
12795+ goto out;
12796+ }
12797+
12798+ /* 5 */
12799+ if (stop && !start && finish) {
12800+ GDLM_ASSERT(!start_gr,);
12801+ GDLM_ASSERT(last_finish == last_start,);
12802+ GDLM_ASSERT(last_stop == last_start,);
12803+ cmd = DO_FINISH_STOP;
12804+ *finish_out = last_finish;
12805+ goto out;
12806+ }
12807+
12808+ /* 6 */
12809+ if (stop && start && !finish) {
12810+ if (start_gr) {
12811+ GDLM_ASSERT(last_start > last_stop,);
12812+ cmd = DO_START;
12813+ *gr_out = start_gr;
12814+ } else {
12815+ GDLM_ASSERT(last_stop == last_start,);
12816+ cmd = DO_STOP;
12817+ }
12818+ goto out;
12819+ }
12820+
12821+ /* 7 */
12822+ if (stop && start && finish) {
12823+ if (start_gr) {
12824+ GDLM_ASSERT(last_start > last_stop,);
12825+ GDLM_ASSERT(last_start > last_finish,);
12826+ cmd = DO_FINISH_START;
12827+ *finish_out = last_finish;
12828+ *gr_out = start_gr;
12829+ } else {
12830+ GDLM_ASSERT(last_start == last_stop,);
12831+ GDLM_ASSERT(last_start > last_finish,);
12832+ cmd = DO_FINISH_STOP;
12833+ *finish_out = last_finish;
12834+ }
12835+ goto out;
12836+ }
12837+
12838+ out:
12839+ return cmd;
12840+}
12841+
12842+/*
12843+ * This function decides what to do given every combination of current
12844+ * lockspace state and next lockspace state.
12845+ */
12846+
12847+static void do_ls_recovery(gd_ls_t *ls)
12848+{
12849+ gd_recover_t *gr = NULL;
12850+ int error, cur_state, next_state = 0, do_now, finish_event = 0;
12851+
12852+ do_now = next_move(ls, &gr, &finish_event);
12853+ if (!do_now)
12854+ goto out;
12855+
12856+ cur_state = ls->ls_state;
12857+ next_state = 0;
12858+
12859+ GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
12860+ log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
12861+
12862+ /*
12863+ * LSST_CLEAR - we're not in any recovery state. We can get a stop or
12864+ * a stop and start which equates with a START.
12865+ */
12866+
12867+ if (cur_state == LSST_CLEAR) {
12868+ switch (do_now) {
12869+ case DO_STOP:
12870+ next_state = LSST_WAIT_START;
12871+ break;
12872+
12873+ case DO_START:
12874+ error = ls_reconfig(ls, gr);
12875+ if (error)
12876+ next_state = LSST_WAIT_START;
12877+ else
12878+ next_state = LSST_RECONFIG_DONE;
12879+ break;
12880+
12881+ case DO_FINISH: /* invalid */
12882+ case DO_FINISH_STOP: /* invalid */
12883+ case DO_FINISH_START: /* invalid */
12884+ default:
12885+ GDLM_ASSERT(0,);
12886+ }
12887+ goto out;
12888+ }
12889+
12890+ /*
12891+ * LSST_WAIT_START - we're not running because of getting a stop or
12892+ * failing a start. We wait in this state for another stop/start or
12893+ * just the next start to begin another reconfig attempt.
12894+ */
12895+
12896+ if (cur_state == LSST_WAIT_START) {
12897+ switch (do_now) {
12898+ case DO_STOP:
12899+ break;
12900+
12901+ case DO_START:
12902+ error = ls_reconfig(ls, gr);
12903+ if (error)
12904+ next_state = LSST_WAIT_START;
12905+ else
12906+ next_state = LSST_RECONFIG_DONE;
12907+ break;
12908+
12909+ case DO_FINISH: /* invalid */
12910+ case DO_FINISH_STOP: /* invalid */
12911+ case DO_FINISH_START: /* invalid */
12912+ default:
12913+ GDLM_ASSERT(0,);
12914+ }
12915+ goto out;
12916+ }
12917+
12918+ /*
12919+ * LSST_RECONFIG_DONE - we entered this state after successfully
12920+ * completing ls_reconfig and calling kcl_start_done. We expect to get
12921+ * a finish if everything goes ok. A finish could be followed by stop
12922+ * or stop/start before we get here to check it. Or a finish may never
12923+ * happen, only stop or stop/start.
12924+ */
12925+
12926+ if (cur_state == LSST_RECONFIG_DONE) {
12927+ switch (do_now) {
12928+ case DO_FINISH:
12929+ clear_finished_nodes(ls, finish_event);
12930+ next_state = LSST_CLEAR;
12931+
12932+ error = enable_locking(ls, finish_event);
12933+ if (error)
12934+ break;
12935+
12936+ error = process_requestqueue(ls);
12937+ if (error)
12938+ break;
12939+
12940+ error = resend_cluster_requests(ls);
12941+ if (error)
12942+ break;
12943+
12944+ restbl_grant_after_purge(ls);
12945+
12946+ log_all(ls, "recover event %u finished", finish_event);
12947+ break;
12948+
12949+ case DO_STOP:
12950+ next_state = LSST_WAIT_START;
12951+ break;
12952+
12953+ case DO_FINISH_STOP:
12954+ clear_finished_nodes(ls, finish_event);
12955+ next_state = LSST_WAIT_START;
12956+ break;
12957+
12958+ case DO_FINISH_START:
12959+ clear_finished_nodes(ls, finish_event);
12960+ /* fall into DO_START */
12961+
12962+ case DO_START:
12963+ error = ls_reconfig(ls, gr);
12964+ if (error)
12965+ next_state = LSST_WAIT_START;
12966+ else
12967+ next_state = LSST_RECONFIG_DONE;
12968+ break;
12969+
12970+ default:
12971+ GDLM_ASSERT(0,);
12972+ }
12973+ goto out;
12974+ }
12975+
12976+ /*
12977+ * LSST_INIT - state after ls is created and before it has been
12978+ * started. A start operation will cause the ls to be started for the
12979+ * first time. A failed start will cause it to just wait in INIT for
12980+ * another stop/start.
12981+ */
12982+
12983+ if (cur_state == LSST_INIT) {
12984+ switch (do_now) {
12985+ case DO_START:
12986+ error = ls_first_start(ls, gr);
12987+ if (!error)
12988+ next_state = LSST_INIT_DONE;
12989+ break;
12990+
12991+ case DO_STOP:
12992+ break;
12993+
12994+ case DO_FINISH: /* invalid */
12995+ case DO_FINISH_STOP: /* invalid */
12996+ case DO_FINISH_START: /* invalid */
12997+ default:
12998+ GDLM_ASSERT(0,);
12999+ }
13000+ goto out;
13001+ }
13002+
13003+ /*
13004+ * LSST_INIT_DONE - after the first start operation is completed
13005+ * successfully and kcl_start_done() called. If there are no errors, a
13006+ * finish will arrive next and we'll move to LSST_CLEAR.
13007+ */
13008+
13009+ if (cur_state == LSST_INIT_DONE) {
13010+ switch (do_now) {
13011+ case DO_STOP:
13012+ case DO_FINISH_STOP:
13013+ next_state = LSST_WAIT_START;
13014+ break;
13015+
13016+ case DO_START:
13017+ case DO_FINISH_START:
13018+ error = ls_reconfig(ls, gr);
13019+ if (error)
13020+ next_state = LSST_WAIT_START;
13021+ else
13022+ next_state = LSST_RECONFIG_DONE;
13023+ break;
13024+
13025+ case DO_FINISH:
13026+ next_state = LSST_CLEAR;
13027+ enable_locking(ls, finish_event);
13028+ log_all(ls, "recover event %u finished", finish_event);
13029+ break;
13030+
13031+ default:
13032+ GDLM_ASSERT(0,);
13033+ }
13034+ goto out;
13035+ }
13036+
13037+ out:
13038+ if (next_state)
13039+ ls->ls_state = next_state;
13040+
13041+ if (gr) {
13042+ kfree(gr->gr_nodeids);
13043+ free_dlm_recover(gr);
13044+ }
13045+}
13046+
13047+static __inline__ gd_ls_t *get_work(int clear)
13048+{
13049+ gd_ls_t *ls;
13050+
13051+ spin_lock(&lslist_lock);
13052+
13053+ list_for_each_entry(ls, &lslist, ls_list) {
13054+ if (clear) {
13055+ if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
13056+ goto got_work;
13057+
13058+ } else {
13059+ if (test_bit(LSFL_WORK, &ls->ls_flags))
13060+ goto got_work;
13061+ }
13062+ }
13063+ ls = NULL;
13064+
13065+ got_work:
13066+ spin_unlock(&lslist_lock);
13067+
13068+ return ls;
13069+}
13070+
13071+/*
13072+ * Thread which does recovery for all lockspaces.
13073+ */
13074+
13075+static int dlm_recoverd(void *arg)
13076+{
13077+ gd_ls_t *ls;
13078+
13079+ daemonize("dlm_recoverd");
13080+ recoverd_task = current;
13081+ complete(&recoverd_run);
13082+
13083+ while (!test_bit(THREAD_STOP, &recoverd_flags)) {
13084+ wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
13085+ if ((ls = get_work(1)))
13086+ do_ls_recovery(ls);
13087+ }
13088+
13089+ complete(&recoverd_run);
13090+ return 0;
13091+}
13092+
13093+/*
13094+ * Mark a specific lockspace as needing work and wake up the thread to do it.
13095+ */
13096+
13097+void recoverd_kick(gd_ls_t *ls)
13098+{
13099+ set_bit(LSFL_WORK, &ls->ls_flags);
13100+ wake_up(&recoverd_wait);
13101+}
13102+
13103+/*
13104+ * Start the recoverd thread when gdlm is started (before any lockspaces).
13105+ */
13106+
13107+int recoverd_start(void)
13108+{
13109+ int error;
13110+
13111+ clear_bit(THREAD_STOP, &recoverd_flags);
13112+ error = kernel_thread(dlm_recoverd, NULL, 0);
13113+ if (error < 0)
13114+ goto out;
13115+
13116+ error = 0;
13117+ wait_for_completion(&recoverd_run);
13118+
13119+ out:
13120+ return error;
13121+}
13122+
13123+/*
13124+ * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone).
13125+ */
13126+
13127+int recoverd_stop(void)
13128+{
13129+ set_bit(THREAD_STOP, &recoverd_flags);
13130+ wake_up(&recoverd_wait);
13131+ wait_for_completion(&recoverd_run);
13132+
13133+ return 0;
13134+}
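A sketch of the intended lifecycle of the recovery daemon. The calling context below (a single function standing in for module init/exit and a lockspace that needs work) is hypothetical and shown only to make the ordering of the four entry points explicit.

/* Hypothetical caller showing the intended order of the recoverd API. */
static int example_recoverd_lifecycle(gd_ls_t *ls)
{
	int error;

	dlm_recoverd_init();		/* once, before the thread is started */

	error = recoverd_start();	/* at gdlm startup, before any lockspaces */
	if (error)
		return error;

	recoverd_kick(ls);		/* whenever a lockspace needs recovery work */

	recoverd_stop();		/* at shutdown, after all lockspaces are gone */
	return 0;
}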
13135diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
13136--- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13137+++ linux-patched/cluster/dlm/recoverd.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13138@@ -0,0 +1,22 @@
13139+/******************************************************************************
13140+*******************************************************************************
13141+**
13142+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13143+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13144+**
13145+** This copyrighted material is made available to anyone wishing to use,
13146+** modify, copy, or redistribute it subject to the terms and conditions
13147+** of the GNU General Public License v.2.
13148+**
13149+*******************************************************************************
13150+******************************************************************************/
13151+
13152+#ifndef __RECOVERD_DOT_H__
13153+#define __RECOVERD_DOT_H__
13154+
13155+void dlm_recoverd_init(void);
13156+void recoverd_kick(gd_ls_t * ls);
13157+int recoverd_start(void);
13158+int recoverd_stop(void);
13159+
13160+#endif /* __RECOVERD_DOT_H__ */
13161diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
13162--- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13163+++ linux-patched/cluster/dlm/rsb.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 13164@@ -0,0 +1,307 @@
13165+/******************************************************************************
13166+*******************************************************************************
13167+**
13168+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13169+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13170+**
13171+** This copyrighted material is made available to anyone wishing to use,
13172+** modify, copy, or redistribute it subject to the terms and conditions
13173+** of the GNU General Public License v.2.
13174+**
13175+*******************************************************************************
13176+******************************************************************************/
13177+
13178+#include "dlm_internal.h"
13179+#include "locking.h"
13180+#include "memory.h"
13181+#include "lockqueue.h"
13182+#include "nodes.h"
13183+#include "dir.h"
13184+#include "util.h"
13185+
13186+static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent,
13187+ char *name, int namelen)
13188+{
13189+ gd_res_t *r;
13190+
13191+ list_for_each_entry(r, head, res_hashchain) {
13192+ if ((parent == r->res_parent) && (namelen == r->res_length) &&
13193+ (memcmp(name, r->res_name, namelen) == 0)) {
13194+ atomic_inc(&r->res_ref);
13195+ return r;
13196+ }
13197+ }
13198+
13199+ return NULL;
13200+}
13201+
13202+/*
13203+ * A way to arbitrarily hold onto an rsb which we already have a reference to,
13204+ * to make sure it doesn't go away.  Opposite of release_rsb().
13205+ */
13206+
13207+void hold_rsb(gd_res_t *r)
13208+{
13209+ atomic_inc(&r->res_ref);
13210+}
13211+
13212+/*
13213+ * release_rsb() - Decrement reference count on rsb struct. Free the rsb
13214+ * struct when there are zero references. Every lkb for the rsb adds a
13215+ * reference. When ref is zero there can be no more lkb's for the rsb, on the
13216+ * queue's or anywhere else.
13217+ */
13218+
13219+void release_rsb(gd_res_t *r)
13220+{
13221+ gd_ls_t *ls = r->res_ls;
13222+ int removed = FALSE;
13223+
13224+ write_lock(&ls->ls_reshash_lock);
13225+ atomic_dec(&r->res_ref);
13226+
13227+ if (!atomic_read(&r->res_ref)) {
13228+ GDLM_ASSERT(list_empty(&r->res_grantqueue),);
13229+ GDLM_ASSERT(list_empty(&r->res_waitqueue),);
13230+ GDLM_ASSERT(list_empty(&r->res_convertqueue),);
13231+ removed = TRUE;
13232+ list_del(&r->res_hashchain);
13233+ }
13234+ write_unlock(&ls->ls_reshash_lock);
13235+
13236+ if (removed) {
13237+ down_read(&ls->ls_gap_rsblist);
13238+ if (r->res_parent)
13239+ list_del(&r->res_subreslist);
13240+ else
13241+ list_del(&r->res_rootlist);
13242+ up_read(&ls->ls_gap_rsblist);
13243+
13244+ /*
13245+ * Remove resdir entry if this was a locally mastered root rsb.
13246+ */
13247+ if (!r->res_parent && !r->res_nodeid) {
13248+ if (get_directory_nodeid(r) != our_nodeid())
13249+ remote_remove_resdata(r->res_ls,
13250+ get_directory_nodeid(r),
13251+ r->res_name,
13252+ r->res_length,
13253+ r->res_resdir_seq);
13254+ else
13255+ remove_resdata(r->res_ls, our_nodeid(),
13256+ r->res_name, r->res_length,
13257+ r->res_resdir_seq);
13258+ }
13259+
13260+ if (r->res_lvbptr)
13261+ free_lvb(r->res_lvbptr);
13262+
13263+ free_rsb(r);
13264+ }
13265+}
13266+
13267+/*
13268+ * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
13269+ * If the rsb exists, its ref count is incremented by this function. If it
13270+ * doesn't exist, it's created with a ref count of one.
13271+ */
13272+
13273+int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen,
13274+ int create, gd_res_t **rp)
13275+{
13276+ uint32_t hash;
13277+ gd_res_t *r, *tmp;
13278+ int error = -ENOMEM;
13279+
13280+ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
13281+
13282+ hash = gdlm_hash(name, namelen);
13283+ hash &= ls->ls_hashmask;
13284+
13285+ read_lock(&ls->ls_reshash_lock);
13286+ r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13287+ read_unlock(&ls->ls_reshash_lock);
13288+
13289+ if (r)
13290+ goto out_set;
13291+ if (!create) {
13292+ *rp = NULL;
13293+ goto out;
13294+ }
13295+
13296+ r = allocate_rsb(ls, namelen);
13297+ if (!r)
13298+ goto fail;
13299+
13300+ INIT_LIST_HEAD(&r->res_subreslist);
13301+ INIT_LIST_HEAD(&r->res_grantqueue);
13302+ INIT_LIST_HEAD(&r->res_convertqueue);
13303+ INIT_LIST_HEAD(&r->res_waitqueue);
13304+
13305+ memcpy(r->res_name, name, namelen);
13306+ r->res_length = namelen;
13307+ r->res_ls = ls;
13308+ init_rwsem(&r->res_lock);
13309+ atomic_set(&r->res_ref, 1);
13310+
13311+ if (parent) {
13312+ r->res_parent = parent;
13313+ r->res_depth = parent->res_depth + 1;
13314+ r->res_root = parent->res_root;
13315+ r->res_nodeid = parent->res_nodeid;
13316+ } else {
13317+ r->res_parent = NULL;
13318+ r->res_depth = 1;
13319+ r->res_root = r;
13320+ r->res_nodeid = -1;
13321+ }
13322+
13323+ write_lock(&ls->ls_reshash_lock);
13324+ tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
13325+ if (tmp) {
13326+ write_unlock(&ls->ls_reshash_lock);
13327+ free_rsb(r);
13328+ r = tmp;
13329+ } else {
13330+ list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]);
13331+ write_unlock(&ls->ls_reshash_lock);
13332+
13333+ down_read(&ls->ls_gap_rsblist);
13334+ if (parent)
13335+ list_add_tail(&r->res_subreslist,
13336+ &r->res_root->res_subreslist);
13337+ else
13338+ list_add(&r->res_rootlist, &ls->ls_rootres);
13339+ up_read(&ls->ls_gap_rsblist);
13340+ }
13341+
13342+ out_set:
13343+ *rp = r;
13344+
13345+ out:
13346+ error = 0;
13347+
13348+ fail:
13349+ return error;
13350+}
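
The reference-counting contract above can be illustrated with a minimal sketch (not part of the patch); the lockspace pointer, the resource name and the reduced error handling are assumptions for illustration only.

	/* Sketch only: look up (or create) a root rsb, pin it briefly, then
	 * drop the references.  "ls" and the name "myres" are illustrative. */
	static int example_touch_resource(gd_ls_t *ls)
	{
		gd_res_t *r;
		int error;

		/* on success the rsb is returned with one reference held */
		error = find_or_create_rsb(ls, NULL, "myres", 5, 1, &r);
		if (error)
			return error;

		hold_rsb(r);		/* extra reference while r is handed around */
		/* ... use r ... */
		release_rsb(r);		/* drop the extra reference */

		release_rsb(r);		/* drop the lookup reference; may free r */
		return 0;
	}
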
13351+
13352+/*
 13353+ * Add an lkb to a resource's grant/convert/wait queue, keeping the queue
 13354+ * ordered by lock mode.
13354+ */
13355+
13356+void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
13357+{
 13358+	struct list_head *tmp;
 13359+
 13360+	list_for_each(tmp, head) {
 13361+		gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
 13362+		if (lkb->lkb_rqmode < mode)
 13363+			break;
 13364+	}
 13365+
 13366+	/*
 13367+	 * If no entry with a lower requested mode was found, tmp is the list
 13368+	 * head itself and the insert degenerates to a tail add.
 13369+	 */
 13370+	__list_add(new, tmp->prev, tmp);
13371+}
13372+
13373+/*
13374+ * The rsb res_lock must be held in write when this function is called.
13375+ */
13376+
13377+void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13378+{
13379+
13380+ GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status););
13381+
13382+ lkb->lkb_status = type;
13383+
13384+ switch (type) {
13385+ case GDLM_LKSTS_WAITING:
13386+ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
13387+ break;
13388+
13389+ case GDLM_LKSTS_GRANTED:
13390+ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
13391+ lkb->lkb_grmode);
13392+ break;
13393+
13394+ case GDLM_LKSTS_CONVERT:
13395+ if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
13396+ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
 13397+		else if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
13400+ list_add_tail(&lkb->lkb_statequeue,
13401+ &r->res_convertqueue);
13402+ else
13403+ lkb_add_ordered(&lkb->lkb_statequeue,
13404+ &r->res_convertqueue, lkb->lkb_rqmode);
13405+ break;
13406+
13407+ default:
13408+ GDLM_ASSERT(0,);
13409+ }
13410+}
13411+
13412+void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13413+{
13414+ down_write(&r->res_lock);
13415+ lkb_enqueue(r, lkb, type);
13416+ up_write(&r->res_lock);
13417+}
13418+
13419+/*
13420+ * The rsb res_lock must be held in write when this function is called.
13421+ */
13422+
13423+int lkb_dequeue(gd_lkb_t *lkb)
13424+{
13425+ int status = lkb->lkb_status;
13426+
13427+ if (!status)
13428+ goto out;
13429+
13430+ lkb->lkb_status = 0;
13431+ list_del(&lkb->lkb_statequeue);
13432+
13433+ out:
13434+ return status;
13435+}
13436+
13437+int res_lkb_dequeue(gd_lkb_t *lkb)
13438+{
13439+ int status;
13440+
13441+ down_write(&lkb->lkb_resource->res_lock);
13442+ status = lkb_dequeue(lkb);
13443+ up_write(&lkb->lkb_resource->res_lock);
13444+
13445+ return status;
13446+}
13447+
13448+/*
13449+ * The rsb res_lock must be held in write when this function is called.
13450+ */
13451+
13452+int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13453+{
13454+ int status;
13455+
13456+ status = lkb_dequeue(lkb);
13457+ lkb_enqueue(r, lkb, type);
13458+
13459+ return status;
13460+}
13461+
13462+int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
13463+{
13464+ int status;
13465+
13466+ down_write(&r->res_lock);
13467+ status = lkb_swqueue(r, lkb, type);
13468+ up_write(&r->res_lock);
13469+
13470+ return status;
13471+}
13472diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
13473--- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13474+++ linux-patched/cluster/dlm/rsb.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13475@@ -0,0 +1,30 @@
13476+/******************************************************************************
13477+*******************************************************************************
13478+**
13479+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13480+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13481+**
13482+** This copyrighted material is made available to anyone wishing to use,
13483+** modify, copy, or redistribute it subject to the terms and conditions
13484+** of the GNU General Public License v.2.
13485+**
13486+*******************************************************************************
13487+******************************************************************************/
13488+
13489+#ifndef __RSB_DOT_H__
13490+#define __RSB_DOT_H__
13491+
13492+void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
13493+void _release_rsb(gd_res_t * r);
13494+void release_rsb(gd_res_t * r);
13495+void hold_rsb(gd_res_t * r);
13496+int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen,
13497+ int create, gd_res_t ** rp);
13498+void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13499+void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13500+int lkb_dequeue(gd_lkb_t * lkb);
13501+int res_lkb_dequeue(gd_lkb_t * lkb);
13502+int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13503+int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
13504+
13505+#endif /* __RSB_DOT_H__ */
13506diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
13507--- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13508+++ linux-patched/cluster/dlm/util.c 2004-06-29 20:01:20.000000000 +0800
4bf12011 13509@@ -0,0 +1,130 @@
13510+/******************************************************************************
13511+*******************************************************************************
13512+**
13513+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13514+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13515+**
13516+** This copyrighted material is made available to anyone wishing to use,
13517+** modify, copy, or redistribute it subject to the terms and conditions
13518+** of the GNU General Public License v.2.
13519+**
13520+*******************************************************************************
13521+******************************************************************************/
13522+
13523+#include "dlm_internal.h"
13524+
13525+static const uint32_t crc_32_tab[] = {
13526+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
13527+ 0xe963a535, 0x9e6495a3,
13528+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
13529+ 0xe7b82d07, 0x90bf1d91,
13530+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
13531+ 0xf4d4b551, 0x83d385c7,
13532+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
13533+ 0xfa0f3d63, 0x8d080df5,
13534+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
13535+ 0xd20d85fd, 0xa50ab56b,
13536+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
13537+ 0xdcd60dcf, 0xabd13d59,
13538+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
13539+ 0xcfba9599, 0xb8bda50f,
13540+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
13541+ 0xc1611dab, 0xb6662d3d,
13542+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
13543+ 0x9fbfe4a5, 0xe8b8d433,
13544+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
13545+ 0x91646c97, 0xe6635c01,
13546+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
13547+ 0x8208f4c1, 0xf50fc457,
13548+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
13549+ 0x8cd37cf3, 0xfbd44c65,
13550+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
13551+ 0xa4d1c46d, 0xd3d6f4fb,
13552+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
13553+ 0xaa0a4c5f, 0xdd0d7cc9,
13554+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
13555+ 0xb966d409, 0xce61e49f,
13556+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
13557+ 0xb7bd5c3b, 0xc0ba6cad,
13558+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
13559+ 0x04db2615, 0x73dc1683,
13560+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
13561+ 0x0a00ae27, 0x7d079eb1,
13562+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
13563+ 0x196c3671, 0x6e6b06e7,
13564+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
13565+ 0x17b7be43, 0x60b08ed5,
13566+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
13567+ 0x3fb506dd, 0x48b2364b,
13568+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
13569+ 0x316e8eef, 0x4669be79,
13570+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
13571+ 0x220216b9, 0x5505262f,
13572+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
13573+ 0x2cd99e8b, 0x5bdeae1d,
13574+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
13575+ 0x72076785, 0x05005713,
13576+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
13577+ 0x7cdcefb7, 0x0bdbdf21,
13578+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
13579+ 0x6fb077e1, 0x18b74777,
13580+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
13581+ 0x616bffd3, 0x166ccf45,
13582+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
13583+ 0x4969474d, 0x3e6e77db,
13584+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
13585+ 0x47b2cf7f, 0x30b5ffe9,
13586+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
13587+ 0x54de5729, 0x23d967bf,
13588+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
13589+ 0x5a05df1b, 0x2d02ef8d
13590+};
13591+
13592+/**
13593+ * gdlm_hash - hash an array of data
13594+ * @data: the data to be hashed
13595+ * @len: the length of data to be hashed
13596+ *
13597+ * Copied from GFS.
13598+ *
13599+ * Take some data and convert it to a 32-bit hash.
13600+ *
13601+ * The hash function is a 32-bit CRC of the data. The algorithm uses
13602+ * the crc_32_tab table above.
13603+ *
13604+ * This may not be the fastest hash function, but it does a fair bit better
13605+ * at providing uniform results than the others I've looked at. That's
13606+ * really important for efficient directories.
13607+ *
13608+ * Returns: the hash
13609+ */
13610+
13611+uint32_t gdlm_hash(const char *data, int len)
13612+{
13613+ uint32_t hash = 0xFFFFFFFF;
13614+
13615+ for (; len--; data++)
13616+ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
13617+
13618+ hash = ~hash;
13619+
13620+ return hash;
13621+}
13622+
13623+uint32_t gdlm_next_power2(uint32_t val)
13624+{
13625+ uint32_t x;
13626+
13627+ for (x = 1; x < val; x <<= 1) ;
13628+
13629+ return x;
13630+}
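
A short sketch (not part of the patch) of how gdlm_hash() and gdlm_next_power2() combine when sizing and indexing the resource hash table; the requested size of 200 and the local names are assumptions, the real sizing lives in the lockspace setup code.

	/* Sketch only: derive a power-of-two table size and mask, then bucket a
	 * resource name the same way find_or_create_rsb() does. */
	static uint32_t example_bucket(const char *name, int namelen)
	{
		uint32_t tablesize = gdlm_next_power2(200);	/* rounds up to 256 */
		uint32_t hashmask = tablesize - 1;		/* valid since tablesize is 2^n */
		uint32_t hash = gdlm_hash(name, namelen);	/* 32-bit CRC of the name */

		return hash & hashmask;				/* bucket index */
	}
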
13631+
13632+void print_lkb(gd_lkb_t *lkb)
13633+{
13634+ printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d "
13635+ "nodeid=%u lqstate=%x lqflags=%x\n",
13636+ lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status,
13637+ lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid,
13638+ lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags);
13639+}
13640diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
13641--- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13642+++ linux-patched/cluster/dlm/util.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13643@@ -0,0 +1,22 @@
13644+/******************************************************************************
13645+*******************************************************************************
13646+**
13647+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13648+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13649+**
13650+** This copyrighted material is made available to anyone wishing to use,
13651+** modify, copy, or redistribute it subject to the terms and conditions
13652+** of the GNU General Public License v.2.
13653+**
13654+*******************************************************************************
13655+******************************************************************************/
13656+
13657+#ifndef __UTIL_DOT_H__
13658+#define __UTIL_DOT_H__
13659+
13660+uint32_t gdlm_hash(const char *data, int len);
13661+uint32_t gdlm_next_power2(uint32_t val);
13662+
13663+void print_lkb(gd_lkb_t *lkb);
13664+
13665+#endif
13666diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
13667--- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13668+++ linux-patched/include/cluster/dlm.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 13669@@ -0,0 +1,404 @@
13670+/******************************************************************************
13671+*******************************************************************************
13672+**
13673+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13674+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13675+**
13676+** This copyrighted material is made available to anyone wishing to use,
13677+** modify, copy, or redistribute it subject to the terms and conditions
13678+** of the GNU General Public License v.2.
13679+**
13680+*******************************************************************************
13681+******************************************************************************/
13682+
13683+#ifndef __DLM_DOT_H__
13684+#define __DLM_DOT_H__
13685+
13686+/*
13687+ * Interface to DLM - routines and structures to use DLM lockspaces.
13688+ */
13689+
13690+/*
13691+ * Lock Modes
13692+ */
13693+
13694+#define DLM_LOCK_IV (-1) /* invalid */
13695+#define DLM_LOCK_NL (0) /* null */
13696+#define DLM_LOCK_CR (1) /* concurrent read */
13697+#define DLM_LOCK_CW (2) /* concurrent write */
13698+#define DLM_LOCK_PR (3) /* protected read */
13699+#define DLM_LOCK_PW (4) /* protected write */
13700+#define DLM_LOCK_EX (5) /* exclusive */
13701+
13702+/*
13703+ * Maximum size in bytes of a dlm_lock name
13704+ */
13705+
13706+#define DLM_RESNAME_MAXLEN (64)
13707+
13708+/*
13709+ * Size in bytes of Lock Value Block
13710+ */
13711+
13712+#define DLM_LVB_LEN (32)
13713+
13714+/*
13715+ * Flags to dlm_new_lockspace
13716+ *
13717+ * DLM_LSF_NOTIMERS
13718+ *
13719+ * Do not subject locks in this lockspace to time-outs.
13720+ *
13721+ */
13722+
13723+#define DLM_LSF_NOTIMERS (1)
13724+
13725+/*
13726+ * Flags to dlm_lock
13727+ *
13728+ * DLM_LKF_NOQUEUE
13729+ *
13730+ * Do not queue the lock request on the wait queue if it cannot be granted
13731+ * immediately. If the lock cannot be granted because of this flag, DLM will
13732+ * either return -EAGAIN from the dlm_lock call or will return 0 from
13733+ * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
13734+ *
13735+ * DLM_LKF_CONVERT
13736+ *
13737+ * Indicates a lock conversion request. For conversions the name and namelen
13738+ * are ignored and the lock ID in the LKSB is used to identify the lock.
13739+ *
13740+ * DLM_LKF_VALBLK
13741+ *
13742+ * Requests DLM to return the current contents of the lock value block in the
13743+ * lock status block. When this flag is set in a lock conversion from PW or EX
13744+ * modes, DLM assigns the value specified in the lock status block to the lock
13745+ * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
13746+ * containing application-specific information.
13747+ *
13748+ * DLM_LKF_QUECVT
13749+ *
13750+ * Force a conversion lock request to the back of the convert queue. All other
13751+ * conversion requests ahead of it must be granted before it can be granted.
13752+ * This enforces a FIFO ordering on the convert queue. When this flag is set,
13753+ * indefinite postponement is averted. This flag is allowed only when
13754+ * converting a lock to a more restrictive mode.
13755+ *
13756+ * DLM_LKF_CANCEL
13757+ *
13758+ * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
13759+ * previously granted mode.
13760+ *
13761+ * DLM_LKF_IVVALBLK
13762+ *
13763+ * Invalidate/clear the lock value block.
13764+ *
13765+ * DLM_LKF_CONVDEADLK
13766+ *
13767+ * The granted mode of a lock being converted (from a non-NL mode) can be
13768+ * changed to NL in the process of acquiring the requested mode to avoid
13769+ * conversion deadlock.
13770+ *
13771+ * DLM_LKF_PERSISTENT
13772+ *
13773+ * Only relevant to locks originating in userspace. Signals to the ioctl.c code
13774+ * that this lock should not be unlocked when the process exits.
13775+ *
 13776+ * DLM_LKF_NODLCKWT
13777+ *
13778+ * This lock is not to be checked for conversion deadlocks.
13779+ *
13780+ * DLM_LKF_NODLCKBLK
13781+ *
13782+ * not yet implemented
13783+ *
13784+ * DLM_LKF_EXPEDITE
13785+ *
13786+ * If this lock conversion cannot be granted immediately it is to go to the
13787+ * head of the conversion queue regardless of its requested lock mode.
13788+ *
13789+ * DLM_LKF_NOQUEUEBAST
13790+ *
 13791+ * Send blocking ASTs before returning -EAGAIN to the caller. It is only
 13792+ * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
 13793+ * NOQUEUE requests otherwise.
13794+ *
13795+ */
13796+
13797+#define DLM_LKF_NOQUEUE (0x00000001)
13798+#define DLM_LKF_CANCEL (0x00000002)
13799+#define DLM_LKF_CONVERT (0x00000004)
13800+#define DLM_LKF_VALBLK (0x00000008)
13801+#define DLM_LKF_QUECVT (0x00000010)
13802+#define DLM_LKF_IVVALBLK (0x00000020)
13803+#define DLM_LKF_CONVDEADLK (0x00000040)
13804+#define DLM_LKF_PERSISTENT (0x00000080)
13805+#define DLM_LKF_NODLCKWT (0x00000100)
13806+#define DLM_LKF_NODLCKBLK (0x00000200)
13807+#define DLM_LKF_EXPEDITE (0x00000400)
13808+#define DLM_LKF_NOQUEUEBAST (0x00000800)
13809+
13810+/*
 13811+ * Some return codes that are not in errno.h
13812+ */
13813+
13814+#define DLM_ECANCEL (0x10001)
13815+#define DLM_EUNLOCK (0x10002)
13816+
13817+typedef void dlm_lockspace_t;
13818+
13819+/*
13820+ * Lock range structure
13821+ */
13822+
13823+struct dlm_range {
13824+ uint64_t ra_start;
13825+ uint64_t ra_end;
13826+};
13827+
13828+/*
13829+ * Lock status block
13830+ *
13831+ * Use this structure to specify the contents of the lock value block. For a
13832+ * conversion request, this structure is used to specify the lock ID of the
13833+ * lock. DLM writes the status of the lock request and the lock ID assigned
13834+ * to the request in the lock status block.
13835+ *
13836+ * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
13837+ * It is available when dlm_lock returns.
13838+ *
13839+ * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
13840+ * shown for the DLM_LKF_VALBLK flag.
13841+ *
13842+ * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
13843+ * it was first demoted to NL to avoid conversion deadlock.
13844+ *
13845+ * sb_status: the returned status of the lock request set prior to AST
13846+ * execution. Possible return values:
13847+ *
13848+ * 0 if lock request was successful
13849+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
13850+ * -ENOMEM if there is no memory to process request
13851+ * -EINVAL if there are invalid parameters
13852+ * -DLM_EUNLOCK if unlock request was successful
13853+ * -DLM_ECANCEL ?
13854+ */
13855+
13856+#define DLM_SBF_DEMOTED (0x01)
13857+
13858+struct dlm_lksb {
13859+ int sb_status;
13860+ uint32_t sb_lkid;
13861+ char sb_flags;
13862+ char * sb_lvbptr;
13863+};
13864+
13865+/*
13866+ * These defines are the bits that make up the
13867+ * query code.
13868+ */
13869+
13870+/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
 13871+ * dlm.h. Ignored for DLM_QUERY_LOCKS_ALL */
13872+#define DLM_LOCK_THIS 0x0007
13873+#define DLM_QUERY_MODE_MASK 0x0007
13874+
13875+/* Bits 3, 4, 5 bitmap of queue(s) to query */
13876+#define DLM_QUERY_QUEUE_WAIT 0x0008
13877+#define DLM_QUERY_QUEUE_CONVERT 0x0010
13878+#define DLM_QUERY_QUEUE_GRANT 0x0020
13879+#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
13880+#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
13881+
13882+/* Bit 6, Return only the information that can be established without a network
13883+ * round-trip. The caller must be aware of the implications of this. Useful for
13884+ * just getting the master node id or resource name. */
13885+#define DLM_QUERY_LOCAL 0x0040
13886+
13887+/* Bits 8 up, query type */
13888+#define DLM_QUERY_LOCKS_HIGHER 0x0100
13889+#define DLM_QUERY_LOCKS_LOWER 0x0200
13890+#define DLM_QUERY_LOCKS_EQUAL 0x0300
13891+#define DLM_QUERY_LOCKS_BLOCKING 0x0400
13892+#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
13893+#define DLM_QUERY_LOCKS_ALL 0x0600
13894+#define DLM_QUERY_MASK 0x0F00
13895+
13896+/* GRMODE is the default for mode comparisons,
13897+ RQMODE might also be handy */
13898+#define DLM_QUERY_GRMODE 0x0000
13899+#define DLM_QUERY_RQMODE 0x1000
13900+
13901+/* Structures passed into and out of the query */
13902+
13903+struct dlm_lockinfo {
13904+ int lki_lkid; /* Lock ID on originating node */
13905+ int lki_mstlkid; /* Lock ID on master node */
13906+ int lki_parent;
13907+ int lki_node; /* Originating node (not master) */
13908+ uint8_t lki_state; /* Queue the lock is on */
13909+ uint8_t lki_grmode; /* Granted mode */
13910+ uint8_t lki_rqmode; /* Requested mode */
13911+ struct dlm_range lki_grrange; /* Granted range, if applicable */
13912+ struct dlm_range lki_rqrange; /* Requested range, if applicable */
13913+};
13914+
13915+struct dlm_resinfo {
13916+ int rsi_length;
13917+ int rsi_grantcount; /* No. of nodes on grant queue */
13918+ int rsi_convcount; /* No. of nodes on convert queue */
13919+ int rsi_waitcount; /* No. of nodes on wait queue */
13920+ int rsi_masternode; /* Master for this resource */
13921+ char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
 13922+	char rsi_valblk[DLM_LVB_LEN];	/* Master's LVB contents,
 13923+					   if applicable */
13924+};
13925+
13926+struct dlm_queryinfo {
13927+ struct dlm_resinfo *gqi_resinfo;
13928+ struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
13929+ * of structs */
13930+ int gqi_locksize; /* input */
13931+ int gqi_lockcount; /* output */
13932+};
13933+
13934+#ifdef __KERNEL__
13935+/*
13936+ * dlm_init
13937+ *
13938+ * Starts and initializes DLM threads and structures. Creation of the first
13939+ * lockspace will call this if it has not been called already.
13940+ *
13941+ * Returns: 0 if successful, -EXXX on error
13942+ */
13943+
13944+int dlm_init(void);
13945+
13946+/*
13947+ * dlm_release
13948+ *
13949+ * Stops DLM threads.
13950+ *
13951+ * Returns: 0 if successful, -EXXX on error
13952+ */
13953+
13954+int dlm_release(void);
13955+
13956+/*
13957+ * dlm_new_lockspace
13958+ *
13959+ * Starts a lockspace with the given name. If the named lockspace exists in
13960+ * the cluster, the calling node joins it.
13961+ */
13962+
13963+int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
13964+ int flags);
13965+
13966+/*
13967+ * dlm_release_lockspace
13968+ *
13969+ * Stop a lockspace.
13970+ */
13971+
13972+int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
13973+
13974+/*
13975+ * dlm_lock
13976+ *
 13977+ * Make an asynchronous request to acquire or convert a lock on a named
13978+ * resource.
13979+ *
13980+ * lockspace: context for the request
13981+ * mode: the requested mode of the lock (DLM_LOCK_)
13982+ * lksb: lock status block for input and async return values
13983+ * flags: input flags (DLM_LKF_)
13984+ * name: name of the resource to lock, can be binary
 13985+ * namelen: the length in bytes of the resource name (up to DLM_RESNAME_MAXLEN)
13986+ * parent: the lock ID of a parent lock or 0 if none
13987+ * lockast: function DLM executes when it completes processing the request
13988+ * astarg: argument passed to lockast and bast functions
13989+ * bast: function DLM executes when this lock later blocks another request
13990+ *
13991+ * Returns:
13992+ * 0 if request is successfully queued for processing
13993+ * -EINVAL if any input parameters are invalid
13994+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
13995+ * -ENOMEM if there is no memory to process request
13996+ * -ENOTCONN if there is a communication error
13997+ *
13998+ * If the call to dlm_lock returns an error then the operation has failed and
13999+ * the AST routine will not be called. If dlm_lock returns 0 it is still
14000+ * possible that the lock operation will fail. The AST routine will be called
14001+ * when the locking is complete and the status is returned in the lksb.
14002+ *
 14003+ * If AST routines or an astarg are passed to a conversion operation, they
 14004+ * overwrite the values that were passed to the previous dlm_lock call for
 14005+ * that lock.
14006+ *
14007+ * AST routines should not block (at least not for long), but may make
14008+ * any locking calls they please.
14009+ */
14010+
14011+int dlm_lock(dlm_lockspace_t *lockspace,
14012+ uint32_t mode,
14013+ struct dlm_lksb *lksb,
14014+ uint32_t flags,
14015+ void *name,
14016+ unsigned int namelen,
14017+ uint32_t parent,
14018+ void (*lockast) (void *astarg),
14019+ void *astarg,
14020+ void (*bast) (void *astarg, int mode),
14021+ struct dlm_range *range);
14022+
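A hedged kernel-side sketch (not part of the patch) of the calling convention documented above; the completion-based wait, the resource name and the helper names are assumptions for illustration only.

	/* Sketch only: acquire an EX lock and wait for the AST.  Assumes
	 * <linux/completion.h>; the structure and names are illustrative. */
	struct example_lock {
		struct dlm_lksb lksb;
		struct completion done;
	};

	static void example_ast(void *astarg)
	{
		struct example_lock *el = astarg;
		/* final status is in el->lksb.sb_status, lock ID in sb_lkid */
		complete(&el->done);
	}

	static int example_acquire(dlm_lockspace_t *ls, struct example_lock *el)
	{
		int error;

		init_completion(&el->done);
		error = dlm_lock(ls, DLM_LOCK_EX, &el->lksb, DLM_LKF_NOQUEUE,
				 "example-resource", 16, 0, example_ast, el,
				 NULL, NULL);
		if (error)
			return error;		/* request failed; AST will not be called */

		wait_for_completion(&el->done);
		return el->lksb.sb_status;	/* 0, or -EAGAIN with NOQUEUE */
	}
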
14023+/*
14024+ * dlm_unlock
14025+ *
14026+ * Asynchronously release a lock on a resource. The AST routine is called
14027+ * when the resource is successfully unlocked.
14028+ *
14029+ * lockspace: context for the request
14030+ * lkid: the lock ID as returned in the lksb
14031+ * flags: input flags (DLM_LKF_)
14032+ * lksb: if NULL the lksb parameter passed to last lock request is used
14033+ * astarg: if NULL, astarg in last lock request is used
14034+ *
14035+ * Returns:
14036+ * 0 if request is successfully queued for processing
14037+ * -EINVAL if any input parameters are invalid
14038+ * -ENOTEMPTY if the lock still has sublocks
14039+ * -EBUSY if the lock is waiting for a remote lock operation
14040+ * -ENOTCONN if there is a communication error
14041+ */
14042+
14043+extern int dlm_unlock(dlm_lockspace_t *lockspace,
14044+ uint32_t lkid,
14045+ uint32_t flags,
14046+ struct dlm_lksb *lksb,
14047+ void *astarg);
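
The matching release, again as a sketch under the same illustrative assumptions as the dlm_lock example above.

	/* Sketch only: release the lock acquired in the previous sketch.  The
	 * same AST fires when the unlock completes; per the status codes above,
	 * sb_status is then expected to be -DLM_EUNLOCK. */
	static int example_release(dlm_lockspace_t *ls, struct example_lock *el)
	{
		int error;

		init_completion(&el->done);
		error = dlm_unlock(ls, el->lksb.sb_lkid, 0, &el->lksb, el);
		if (error)
			return error;

		wait_for_completion(&el->done);
		return el->lksb.sb_status;
	}
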
14048+
14049+/* Query interface
14050+ *
14051+ * Query the other holders of a resource, given a known lock ID
14052+ *
14053+ * lockspace: context for the request
14054+ * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
14055+ * on the resource. sb_status will contain the status
14056+ * of the request on completion.
14057+ * query: query bitmap see DLM_QUERY_* above
14058+ * qinfo: pointer to dlm_queryinfo structure
14059+ * ast_routine: AST routine to call on completion
 14060+ * astarg: argument to AST routine. It is "traditional"
14061+ * to put the qinfo pointer into lksb->sb_lvbptr
14062+ * and pass the lksb in here.
14063+ */
14064+extern int dlm_query(dlm_lockspace_t *lockspace,
14065+ struct dlm_lksb *lksb,
14066+ int query,
14067+ struct dlm_queryinfo *qinfo,
14068+ void (ast_routine(void *)),
14069+ void *astarg);
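
A sketch (not part of the patch) of building a query request; the array capacity of 16 and the wrapper function are illustrative assumptions, and the qinfo pointer is stashed in sb_lvbptr as the comment above suggests.

	/* Sketch only: ask for every granted lock on the resource behind a
	 * known lock ID in lksb->sb_lkid. */
	static int example_query(dlm_lockspace_t *ls, struct dlm_lksb *lksb,
				 struct dlm_resinfo *resinfo,
				 struct dlm_lockinfo *lockinfo,	/* array of 16 */
				 struct dlm_queryinfo *qinfo,
				 void (*query_ast)(void *))
	{
		int query = DLM_QUERY_LOCKS_ALL | DLM_QUERY_QUEUE_GRANTED;

		qinfo->gqi_resinfo = resinfo;
		qinfo->gqi_lockinfo = lockinfo;
		qinfo->gqi_locksize = 16;		/* input: capacity of lockinfo[] */
		qinfo->gqi_lockcount = 0;		/* output: filled in on completion */

		lksb->sb_lvbptr = (char *) qinfo;	/* the "traditional" place for it */
		return dlm_query(ls, lksb, query, qinfo, query_ast, lksb);
	}
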
14070+
14071+#endif /* __KERNEL__ */
14072+
14073+#endif /* __DLM_DOT_H__ */
14074diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
14075--- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14076+++ linux-patched/include/cluster/dlm_device.h 2004-06-29 20:01:20.000000000 +0800
4bf12011 14077@@ -0,0 +1,63 @@
14078+/******************************************************************************
14079+*******************************************************************************
14080+**
14081+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14082+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14083+**
14084+** This copyrighted material is made available to anyone wishing to use,
14085+** modify, copy, or redistribute it subject to the terms and conditions
14086+** of the GNU General Public License v.2.
14087+**
14088+*******************************************************************************
14089+******************************************************************************/
14090+
14091+/* This is the device interface for dlm, most users will use a library
14092+ * interface.
14093+ */
14094+
14095+/* Version of the device interface */
14096+#define DLM_DEVICE_VERSION_MAJOR 2
14097+#define DLM_DEVICE_VERSION_MINOR 0
14098+#define DLM_DEVICE_VERSION_PATCH 0
14099+
14100+/* struct passed to the lock write */
14101+struct dlm_lock_params {
14102+ uint32_t version[3];
14103+ uint8_t cmd;
14104+ uint8_t mode;
14105+ uint16_t flags;
14106+ uint32_t lkid;
14107+ uint32_t parent;
14108+ struct dlm_range range;
14109+ uint8_t namelen;
14110+ void *astparam;
14111+ void *astaddr;
14112+ void *bastaddr;
14113+ struct dlm_lksb *lksb;
14114+ char name[1];
14115+};
14116+
14117+
14118+/* struct read from the "device" fd,
14119+ consists mainly of userspace pointers for the library to use */
14120+struct dlm_lock_result {
14121+ uint8_t cmd;
14122+ void *astparam;
14123+ void (*astaddr)(void *astparam);
14124+ struct dlm_lksb *user_lksb;
14125+ struct dlm_lksb lksb; /* But this has real data in it */
14126+ uint8_t bast_mode; /* Not yet used */
14127+};
14128+
14129+/* commands passed to the device */
14130+#define DLM_USER_LOCK 1
14131+#define DLM_USER_UNLOCK 2
14132+#define DLM_USER_QUERY 3
14133+
14134+/* Arbitrary length restriction */
14135+#define MAX_LS_NAME_LEN 64
14136+
14137+/* ioctls on the device */
14138+#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
14139+#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
14140+#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
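
For completeness, a hedged userspace sketch of the ioctl side of this interface; the device node path and the meaning of the ioctl's return value are determined by device.c elsewhere in the patch, so both are left as assumptions here.

	/* Sketch only: create a lockspace through the device interface.  The
	 * device path is a parameter because its name is defined by device.c,
	 * not by this header; most programs would use a library wrapper. */
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <cluster/dlm.h>
	#include <cluster/dlm_device.h>

	static int example_create_lockspace(const char *devpath, const char *lsname)
	{
		int fd, error;

		fd = open(devpath, O_RDWR);
		if (fd < 0)
			return -1;

		/* DLM_CREATE_LOCKSPACE takes the lockspace name (MAX_LS_NAME_LEN);
		 * what it returns on success is defined by device.c, not here */
		error = ioctl(fd, DLM_CREATE_LOCKSPACE, lsname);

		close(fd);
		return error;
	}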