# Add DLM to the build system
diff -urN -p linux-2.6.8.1/cluster/Kconfig linux/cluster/Kconfig
--- linux-2.6.8.1/cluster/Kconfig	2004-08-24 13:23:09.000000000 +0800
+++ linux/cluster/Kconfig	2004-08-24 13:23:32.000000000 +0800
@@ -10,4 +10,22 @@ config CLUSTER
 	  needed by all the other components. It provides membership services
 	  for those other subsystems.
 
+config CLUSTER_DLM
+	tristate "Distributed Lock Manager"
+	depends on CLUSTER
+	---help---
+	  A fully distributed lock manager, providing cluster-wide locking services
+	  and protected lock namespaces for kernel and userland applications.
+
+config CLUSTER_DLM_PROCLOCKS
+	boolean "/proc/locks support for DLM"
+	depends on CLUSTER_DLM
+	depends on PROC_FS
+	---help---
+	  If this option is enabled, a file will appear at /proc/cluster/dlm_locks.
+	  Write the name of a lockspace known to the DLM into this "file", then
+	  read out a list of all the resources and locks in that lockspace that
+	  are known to the local node. Note that because the DLM is distributed,
+	  this may not be the full lock picture.
+
 endmenu
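# Usage note (not part of the original patch): a minimal userspace sketch of
# the /proc/cluster/dlm_locks protocol described in the help text above --
# write the name of a lockspace into the file, then read back the lock dump.
# The lockspace name "default" is only an example.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/proc/cluster/dlm_locks", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Select the lockspace to dump (the name is an assumption) */
	if (write(fd, "default", strlen("default")) < 0) {
		perror("write");
		return 1;
	}
	/* Read out the resources and locks known to the local node */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}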
diff -urN -p linux-2.6.8.1/cluster/Makefile linux/cluster/Makefile
--- linux-2.6.8.1/cluster/Makefile	2004-08-24 13:23:09.000000000 +0800
+++ linux/cluster/Makefile	2004-08-24 13:23:32.000000000 +0800
@@ -1,3 +1,4 @@
 obj-y := nocluster.o
 
 obj-$(CONFIG_CLUSTER) += cman/
+obj-$(CONFIG_CLUSTER_DLM) += dlm/
diff -urN -p linux-2.6.8.1/cluster/dlm/Makefile linux/cluster/dlm/Makefile
--- linux-2.6.8.1/cluster/dlm/Makefile	1970-01-01 07:30:00.000000000 +0730
+++ linux/cluster/dlm/Makefile	2004-08-24 13:23:32.000000000 +0800
@@ -0,0 +1,23 @@
+dlm-objs := ast.o \
+	config.o \
+	device.o \
+	dir.o \
+	lkb.o \
+	locking.o \
+	lockqueue.o \
+	lockspace.o \
+	lowcomms.o \
+	main.o \
+	memory.o \
+	midcomms.o \
+	nodes.o \
+	proc.o \
+	queries.o \
+	rebuild.o \
+	reccomms.o \
+	recover.o \
+	recoverd.o \
+	rsb.o \
+	util.o \
+
+obj-$(CONFIG_CLUSTER_DLM) += dlm.o
diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
--- linux-orig/cluster/dlm/ast.c	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/dlm/ast.c	2004-11-03 11:31:56.000000000 +0800
@@ -0,0 +1,618 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * This delivers ASTs and checks for dead remote requests and deadlocks.
+ */
+
+#include <linux/timer.h>
+
+#include "dlm_internal.h"
+#include "rsb.h"
+#include "lockqueue.h"
+#include "dir.h"
+#include "locking.h"
+#include "lkb.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "ast.h"
+#include "nodes.h"
+#include "config.h"
+#include "util.h"
+
+/* Wake up flags for astd */
+#define WAKE_ASTS 1
+#define WAKE_TIMER 2
+
+static struct list_head ast_queue;
+static struct semaphore ast_queue_lock;
+static wait_queue_head_t astd_waitchan;
+struct task_struct *astd_task;
+static unsigned long astd_wakeflags;
+
+static struct list_head _deadlockqueue;
+static struct semaphore _deadlockqueue_lock;
+static struct list_head _lockqueue;
+static struct semaphore _lockqueue_lock;
+static struct timer_list _lockqueue_timer;
+
+void add_to_lockqueue(struct dlm_lkb *lkb)
+{
+	/* Time stamp the entry so we know if it's been waiting too long */
+	lkb->lkb_lockqueue_time = jiffies;
+
+	down(&_lockqueue_lock);
+	list_add(&lkb->lkb_lockqueue, &_lockqueue);
+	up(&_lockqueue_lock);
+}
+
+void remove_from_lockqueue(struct dlm_lkb *lkb)
+{
+	down(&_lockqueue_lock);
+	list_del(&lkb->lkb_lockqueue);
+	up(&_lockqueue_lock);
+
+#ifdef CONFIG_DLM_STATS
+	dlm_stats.lockqueue_time[lkb->lkb_lockqueue_state] += (jiffies - lkb->lkb_lockqueue_time);
+	dlm_stats.lockqueue_locks[lkb->lkb_lockqueue_state]++;
+#endif
+	lkb->lkb_lockqueue_state = 0;
+}
+
+void add_to_deadlockqueue(struct dlm_lkb *lkb)
+{
+	if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
+		return;
+	lkb->lkb_duetime = jiffies;
+	down(&_deadlockqueue_lock);
+	list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
+	up(&_deadlockqueue_lock);
+}
+
+void remove_from_deadlockqueue(struct dlm_lkb *lkb)
+{
+	if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
+		return;
+
+	down(&_deadlockqueue_lock);
+	list_del(&lkb->lkb_deadlockq);
+	up(&_deadlockqueue_lock);
+
+	/* Invalidate the due time */
+	memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
+}
+
+/*
+ * Queue an AST for delivery. This only deals with kernel ASTs;
+ * the usermode API piggybacks on top of it.
+ *
+ * This can be called in either the user or DLM context.
+ * ASTs are queued EVEN IF we are already running in dlm_astd
+ * context, as we don't know what other locks are held (eg we could
+ * be being called from a lock operation that was called from
+ * another AST!)
+ * If the AST is to be queued remotely then a message is sent to
+ * the target system via midcomms.
+ */
+
+void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode)
+{
+	struct dlm_request req;
+
+	if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
+		/*
+		 * Send a message to have an ast queued remotely. Note: we do
+		 * not send remote completion asts, they are handled as part of
+		 * remote lock granting.
+		 */
+		if (flags & AST_BAST) {
+			req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
+			req.rr_header.rh_length = sizeof(req);
+			req.rr_header.rh_flags = 0;
+			req.rr_header.rh_lkid = lkb->lkb_id;
+			req.rr_header.rh_lockspace =
+				lkb->lkb_resource->res_ls->ls_global_id;
+			req.rr_status = lkb->lkb_retstatus;
+			req.rr_remlkid = lkb->lkb_remid;
+			req.rr_rqmode = rqmode;
+
+			midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
+					lkb->lkb_resource->res_ls->ls_allocation);
+		} else if (lkb->lkb_retstatus == -EDEADLOCK) {
+			/*
+			 * We only queue remote Completion ASTs here for error
+			 * completions that happen out of band.
+			 * DEADLOCK is one such.
+			 */
+			req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
+			req.rr_header.rh_length = sizeof(req);
+			req.rr_header.rh_flags = 0;
+			req.rr_header.rh_lkid = lkb->lkb_id;
+			req.rr_header.rh_lockspace =
+				lkb->lkb_resource->res_ls->ls_global_id;
+			req.rr_status = lkb->lkb_retstatus;
+			req.rr_remlkid = lkb->lkb_remid;
+			req.rr_rqmode = rqmode;
+
+			midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
+					lkb->lkb_resource->res_ls->ls_allocation);
+		}
+	} else {
+		/*
+		 * Prepare info that will be returned in ast/bast.
+		 */
+
+		if (flags & AST_BAST) {
+			lkb->lkb_bastmode = rqmode;
+		} else {
+			lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
+			if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
+				lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
+			else
+				lkb->lkb_lksb->sb_flags = 0;
+		}
+
+		down(&ast_queue_lock);
+		if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
+			list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+		lkb->lkb_astflags |= flags;
+		up(&ast_queue_lock);
+
+		/* It is the responsibility of the caller to call wake_astd()
+		 * after it has finished other locking operations that request
+		 * the ASTs to be delivered after */
+	}
+}
+
+/*
+ * Process any LKBs on the AST queue.
+ */
+
+static void process_asts(void)
+{
+	struct dlm_ls *ls;
+	struct dlm_rsb *rsb;
+	struct dlm_lkb *lkb;
+	void (*cast) (long param);
+	void (*bast) (long param, int mode);
+	long astparam;
+	uint16_t flags;
+
+	for (;;) {
+		down(&ast_queue_lock);
+		if (list_empty(&ast_queue)) {
+			up(&ast_queue_lock);
+			break;
+		}
+
+		lkb = list_entry(ast_queue.next, struct dlm_lkb, lkb_astqueue);
+		list_del(&lkb->lkb_astqueue);
+		flags = lkb->lkb_astflags;
+		lkb->lkb_astflags = 0;
+		up(&ast_queue_lock);
+
+		cast = lkb->lkb_astaddr;
+		bast = lkb->lkb_bastaddr;
+		astparam = lkb->lkb_astparam;
+		rsb = lkb->lkb_resource;
+		ls = rsb->res_ls;
+
+		if (flags & AST_COMP) {
+			if (flags & AST_DEL) {
+				DLM_ASSERT(lkb->lkb_astflags == 0,);
+
+				/* FIXME: we don't want to block asts for other
+				   lockspaces while one is being recovered */
+
+				down_read(&ls->ls_in_recovery);
+				release_lkb(ls, lkb);
+				release_rsb(rsb);
+				up_read(&ls->ls_in_recovery);
+			}
+
+			if (cast) {
+#ifdef CONFIG_DLM_STATS
+				dlm_stats.cast++;
+#endif
+				cast(astparam);
+			}
+		}
+
+		if (flags & AST_BAST && !(flags & AST_DEL)) {
+			int bmode = lkb->lkb_bastmode;
+
+			/* gr or rq mode of the lock may have changed since the
+			   ast was queued, making the delivery unnecessary */
+
+			if (!bast || dlm_modes_compat(lkb->lkb_grmode, bmode))
+				continue;
+
+			if (lkb->lkb_rqmode == DLM_LOCK_IV ||
+			    !dlm_modes_compat(lkb->lkb_rqmode, bmode)) {
+				bast(astparam, bmode);
+#ifdef CONFIG_DLM_STATS
+				dlm_stats.bast++;
+#endif
+			}
+		}
+
+		schedule();
+	}
+}
+
+void lockqueue_lkb_mark(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb, *safe;
+	int count = 0;
+
+	log_all(ls, "mark waiting requests");
+
+	down(&_lockqueue_lock);
+
+	list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
+
+		if (lkb->lkb_resource->res_ls != ls)
+			continue;
+
+		log_debug(ls, "mark %x lq %d nodeid %d", lkb->lkb_id,
+			  lkb->lkb_lockqueue_state, lkb->lkb_nodeid);
+
+		/*
+		 * These lkb's are new and the master is being looked up. Mark
+		 * the lkb request to be resent. Even if the destination node
+		 * for the request is still living and has our request, it will
+		 * purge all resdir requests in purge_requestqueue. If there's
+		 * a reply to the LOOKUP request in our requestqueue (the reply
+		 * arrived after ls_stop), it is invalid and will be discarded
+		 * in purge_requestqueue, too.
+		 */
+
+		if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
+			DLM_ASSERT(lkb->lkb_nodeid == -1,
+				   print_lkb(lkb);
+				   print_rsb(lkb->lkb_resource););
+
+			lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
+			count++;
+			continue;
+		}
+
+		/*
+		 * We're waiting for an unlock reply and the master node from
+		 * whom we're expecting the reply has failed. If there's a
+		 * reply in the requestqueue, do nothing and process it later in
+		 * process_requestqueue. If there's no reply, don't rebuild
+		 * the lkb on a new master, but just assume we've gotten an
+		 * unlock completion reply from the prev master (this also
+		 * means not resending the unlock request). If the unlock is
+		 * for the last lkb on the rsb, the rsb has nodeid of -1 and
+		 * the rsb won't be rebuilt on the new master either.
+		 *
+		 * If we're waiting for an unlock reply and the master node is
+		 * still alive, we should either have a reply in the
+		 * requestqueue from the master already, or we should get one
+		 * from the master once recovery is complete. There is no
+		 * rebuilding of the rsb/lkb in this case and no resending of
+		 * the request.
+		 */
+
+		if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) {
+			if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
+				if (reply_in_requestqueue(ls, lkb->lkb_id)) {
+					lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
+					log_debug(ls, "mark %x unlock have rep",
+						  lkb->lkb_id);
+				} else {
+					/* assume we got reply fr old master */
+					lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
+					lkb->lkb_flags |= GDLM_LKFLG_UNLOCKDONE;
+					log_debug(ls, "mark %x unlock no rep",
+						  lkb->lkb_id);
+				}
+			}
+			count++;
+			continue;
+		}
+
+		/*
+		 * These lkb's have an outstanding request to a bygone node.
+		 * The request will be redirected to the new master node in
+		 * resend_cluster_requests(). Don't mark the request for
+		 * resending if there's a reply for it saved in the
+		 * requestqueue.
+		 */
+
+		if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
+		    !reply_in_requestqueue(ls, lkb->lkb_id)) {
+
+			lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
+
+			/*
+			 * Don't rebuild this lkb on a new rsb in
+			 * rebuild_rsbs_send().
+			 */
+
+			if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) {
+				DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING,
+					   print_lkb(lkb);
+					   print_rsb(lkb->lkb_resource););
+				lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
+			}
+
+			/*
+			 * This flag indicates to the new master that his lkb
+			 * is in the midst of a convert request and should be
+			 * placed on the granted queue rather than the convert
+			 * queue. We will resend this convert request to the
+			 * new master.
+			 */
+
+			else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) {
+				DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,
+					   print_lkb(lkb);
+					   print_rsb(lkb->lkb_resource););
+				lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
+			}
+
+			count++;
+		}
+	}
+	up(&_lockqueue_lock);
+
+	log_all(ls, "marked %d requests", count);
+}
+
+int resend_cluster_requests(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb, *safe;
+	struct dlm_rsb *r;
+	int error = 0, state, count = 0;
+
+	log_all(ls, "resend marked requests");
+
+	down(&_lockqueue_lock);
+
+	list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
+
+		if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
+			log_debug(ls, "resend_cluster_requests: aborted");
+			error = -EINTR;
+			break;
+		}
+
+		r = lkb->lkb_resource;
+
+		if (r->res_ls != ls)
+			continue;
+
+		log_debug(ls, "resend %x lq %d flg %x node %d/%d \"%s\"",
+			  lkb->lkb_id, lkb->lkb_lockqueue_state, lkb->lkb_flags,
+			  lkb->lkb_nodeid, r->res_nodeid, r->res_name);
+
+		if (lkb->lkb_flags & GDLM_LKFLG_UNLOCKDONE) {
+			log_debug(ls, "unlock done %x", lkb->lkb_id);
+			list_del(&lkb->lkb_lockqueue);
+			res_lkb_dequeue(lkb);
+			lkb->lkb_retstatus = -DLM_EUNLOCK;
+			queue_ast(lkb, AST_COMP | AST_DEL, 0);
+			count++;
+			continue;
+		}
+
+		/*
+		 * Resend/process the lockqueue lkb's (in-progress requests)
+		 * that were flagged at the start of recovery in
+		 * lockqueue_lkb_mark().
+		 */
+
+		if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
+			lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
+			lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
+			lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
+
+			if (lkb->lkb_nodeid == -1) {
+				/*
+				 * Send lookup to new resdir node.
+				 */
+				lkb->lkb_lockqueue_time = jiffies;
+				send_cluster_request(lkb,
+						     lkb->lkb_lockqueue_state);
+			}
+
+			else if (lkb->lkb_nodeid != 0) {
+				/*
+				 * There's a new RSB master (that's not us.)
+				 */
+				lkb->lkb_lockqueue_time = jiffies;
+				send_cluster_request(lkb,
+						     lkb->lkb_lockqueue_state);
+			}
+
+			else {
+				/*
+				 * We are the new RSB master for this lkb
+				 * request.
+				 */
+				state = lkb->lkb_lockqueue_state;
+				lkb->lkb_lockqueue_state = 0;
+				/* list_del equals remove_from_lockqueue() */
+				list_del(&lkb->lkb_lockqueue);
+				process_remastered_lkb(ls, lkb, state);
+			}
+
+			count++;
+		}
+	}
+	up(&_lockqueue_lock);
+
+	log_all(ls, "resent %d requests", count);
+	return error;
+}
+
+/*
+ * Process any LKBs on the Lock queue: this just looks at the
+ * entries to see if they have been on the queue too long, and
+ * fails the requests if so.
+ */
+
+static void process_lockqueue(void)
+{
+	struct dlm_lkb *lkb, *safe;
+	struct dlm_ls *ls;
+	int count = 0;
+
+	down(&_lockqueue_lock);
+
+	list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
+		ls = lkb->lkb_resource->res_ls;
+
+		if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
+			continue;
+
+		/* Don't time out locks that are in transition */
+		if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
+			continue;
+
+		if (check_timeout(lkb->lkb_lockqueue_time,
+				  dlm_config.lock_timeout)) {
+			count++;
+			list_del(&lkb->lkb_lockqueue);
+			up(&_lockqueue_lock);
+			cancel_lockop(lkb, -ETIMEDOUT);
+			down(&_lockqueue_lock);
+		}
+	}
+	up(&_lockqueue_lock);
+
+	if (count)
+		wake_astd();
+
+	mod_timer(&_lockqueue_timer,
+		  jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
+}
+
+/* Look for deadlocks */
+static void process_deadlockqueue(void)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	down(&_deadlockqueue_lock);
+
+	list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
+		struct dlm_lkb *kill_lkb;
+
+		/* Only look at "due" locks */
+		if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
+			break;
+
+		/* Don't look at locks that are in transition */
+		if (!test_bit(LSFL_LS_RUN,
+			      &lkb->lkb_resource->res_ls->ls_flags))
+			continue;
+
+		up(&_deadlockqueue_lock);
+
+		/* Lock has hit due time, check for conversion deadlock */
+		kill_lkb = conversion_deadlock_check(lkb);
+		if (kill_lkb)
+			cancel_conversion(kill_lkb, -EDEADLOCK);
+
+		down(&_deadlockqueue_lock);
+	}
+	up(&_deadlockqueue_lock);
+}
+
+static __inline__ int no_asts(void)
+{
+	int ret;
+
+	down(&ast_queue_lock);
+	ret = list_empty(&ast_queue);
+	up(&ast_queue_lock);
+	return ret;
+}
+
+static void lockqueue_timer_fn(unsigned long arg)
+{
+	set_bit(WAKE_TIMER, &astd_wakeflags);
+	wake_up(&astd_waitchan);
+}
+
+/*
+ * DLM daemon which delivers asts.
+ */
+
+static int dlm_astd(void *data)
+{
+	/*
+	 * Set a timer to check the lockqueue for dead locks (and deadlocks).
+	 */
+	INIT_LIST_HEAD(&_lockqueue);
+	init_MUTEX(&_lockqueue_lock);
+	INIT_LIST_HEAD(&_deadlockqueue);
+	init_MUTEX(&_deadlockqueue_lock);
+	init_timer(&_lockqueue_timer);
+	_lockqueue_timer.function = lockqueue_timer_fn;
+	_lockqueue_timer.data = 0;
+	mod_timer(&_lockqueue_timer,
+		  jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
+
+	while (!kthread_should_stop()) {
+		wchan_cond_sleep_intr(astd_waitchan, !test_bit(WAKE_ASTS, &astd_wakeflags));
+
+		if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+			process_asts();
+
+		if (test_and_clear_bit(WAKE_TIMER, &astd_wakeflags)) {
+			process_lockqueue();
+			if (dlm_config.deadlocktime)
+				process_deadlockqueue();
+		}
+	}
+
+	if (timer_pending(&_lockqueue_timer))
+		del_timer(&_lockqueue_timer);
+
+	return 0;
+}
+
+void wake_astd(void)
+{
+	if (!no_asts()) {
+		set_bit(WAKE_ASTS, &astd_wakeflags);
+		wake_up(&astd_waitchan);
+	}
+}
+
+int astd_start(void)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	INIT_LIST_HEAD(&ast_queue);
+	init_MUTEX(&ast_queue_lock);
+	init_waitqueue_head(&astd_waitchan);
+
+	p = kthread_run(dlm_astd, NULL, 0, "dlm_astd");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+		astd_task = p;
+	return error;
+}
+
+void astd_stop(void)
+{
+	kthread_stop(astd_task);
+	wake_up(&astd_waitchan);
+}
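# Annotation (not part of the original patch): per the comments in
# queue_ast() above, a kernel-side caller queues the AST during its locking
# operation and only kicks dlm_astd once its own locking work is done. A
# rough sketch; the wrapper function and the zero status are hypothetical.

static void example_complete_lock(struct dlm_lkb *lkb)
{
	/* Queue a completion AST for this lkb... */
	lkb->lkb_retstatus = 0;
	queue_ast(lkb, AST_COMP, 0);

	/* ...and only wake the AST daemon after we have finished our
	   other locking operations, since the user's callback may
	   itself issue lock requests. */
	wake_astd();
}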
diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
--- linux-orig/cluster/dlm/ast.h	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/dlm/ast.h	2004-11-03 11:31:56.000000000 +0800
@@ -0,0 +1,28 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __AST_DOT_H__
+#define __AST_DOT_H__
+
+void lockqueue_lkb_mark(struct dlm_ls *ls);
+int resend_cluster_requests(struct dlm_ls *ls);
+void add_to_lockqueue(struct dlm_lkb *lkb);
+void remove_from_lockqueue(struct dlm_lkb *lkb);
+void add_to_deadlockqueue(struct dlm_lkb *lkb);
+void remove_from_deadlockqueue(struct dlm_lkb *lkb);
+void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode);
+void wake_astd(void);
+int astd_start(void);
+void astd_stop(void);
+
+#endif /* __AST_DOT_H__ */
diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
--- linux-orig/cluster/dlm/config.c	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/dlm/config.c	2004-11-03 11:31:56.000000000 +0800
@@ -0,0 +1,137 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT 21064
+#define DEFAULT_LOCK_TIMEOUT 30
+#define DEFAULT_BUFFER_SIZE 4096
+#define DEFAULT_RSBTBL_SIZE 256
+#define DEFAULT_LKBTBL_SIZE 1024
+#define DEFAULT_DIRTBL_SIZE 512
+#define DEFAULT_CONN_INCREMENT 32
+#define DEFAULT_DEADLOCKTIME 10
+#define DEFAULT_RECOVER_TIMER 5
+
+struct config_info dlm_config = {
+	.tcp_port = DEFAULT_TCP_PORT,
+	.lock_timeout = DEFAULT_LOCK_TIMEOUT,
+	.buffer_size = DEFAULT_BUFFER_SIZE,
+	.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+	.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+	.dirtbl_size = DEFAULT_DIRTBL_SIZE,
+	.conn_increment = DEFAULT_CONN_INCREMENT,
+	.deadlocktime = DEFAULT_DEADLOCKTIME,
+	.recover_timer = DEFAULT_RECOVER_TIMER
+};
+
+
+static struct config_proc_info {
+	char *name;
+	int *value;
+} config_proc[] = {
+	{
+		.name = "tcp_port",
+		.value = &dlm_config.tcp_port,
+	},
+	{
+		.name = "lock_timeout",
+		.value = &dlm_config.lock_timeout,
+	},
+	{
+		.name = "buffer_size",
+		.value = &dlm_config.buffer_size,
+	},
+	{
+		.name = "rsbtbl_size",
+		.value = &dlm_config.rsbtbl_size,
+	},
+	{
+		.name = "lkbtbl_size",
+		.value = &dlm_config.lkbtbl_size,
+	},
+	{
+		.name = "dirtbl_size",
+		.value = &dlm_config.dirtbl_size,
+	},
+	{
+		.name = "conn_increment",
+		.value = &dlm_config.conn_increment,
+	},
+	{
+		.name = "deadlocktime",
+		.value = &dlm_config.deadlocktime,
+	},
+	{
+		.name = "recover_timer",
+		.value = &dlm_config.recover_timer,
+	}
+};
+static struct proc_dir_entry *dlm_dir;
+
+static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
+				int *eof, void *data)
+{
+	struct config_proc_info *cinfo = data;
+	return snprintf(page, count, "%d\n", *cinfo->value);
+}
+
+static int dlm_config_write_proc(struct file *file, const char *buffer,
+				 unsigned long count, void *data)
+{
+	struct config_proc_info *cinfo = data;
+	int value;
+	char *end;
+
+	value = simple_strtoul(buffer, &end, 10);
+	if (*end)
+		*cinfo->value = value;
+	return count;
+}
+
+int dlm_config_init(void)
+{
+	int i;
+	struct proc_dir_entry *pde;
+
+	dlm_dir = proc_mkdir("cluster/config/dlm", 0);
+	if (!dlm_dir)
+		return -1;
+
+	dlm_dir->owner = THIS_MODULE;
+
+	for (i = 0; i < sizeof(config_proc) / sizeof(struct config_proc_info); i++) {
+		pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
+		if (pde) {
+			pde->data = &config_proc[i];
+			pde->write_proc = dlm_config_write_proc;
+			pde->read_proc = dlm_config_read_proc;
+		}
+	}
+	return 0;
+}
+
+void dlm_config_exit(void)
+{
+	int i;
+
+	for (i = 0; i < sizeof(config_proc) / sizeof(struct config_proc_info); i++)
+		remove_proc_entry(config_proc[i].name, dlm_dir);
+	remove_proc_entry("cluster/config/dlm", NULL);
+}
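# Annotation (not part of the original patch): dlm_config_init() above
# exposes each tunable as a file under /proc/cluster/config/dlm/. A minimal
# userspace sketch of inspecting and raising one of them; the value 60 is
# only an example.

#include <stdio.h>

int main(void)
{
	int timeout;
	FILE *f = fopen("/proc/cluster/config/dlm/lock_timeout", "r+");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &timeout) == 1)
		printf("lock_timeout was %d\n", timeout);
	rewind(f);
	fprintf(f, "60\n");	/* raise the lockqueue timeout */
	fclose(f);
	return 0;
}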
diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
--- linux-orig/cluster/dlm/config.h	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/dlm/config.h	2004-11-03 11:31:56.000000000 +0800
@@ -0,0 +1,33 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+struct config_info {
+	int tcp_port;
+	int lock_timeout;
+	int buffer_size;
+	int rsbtbl_size;
+	int lkbtbl_size;
+	int dirtbl_size;
+	int conn_increment;
+	int deadlocktime;
+	int recover_timer;
+};
+
+extern struct config_info dlm_config;
+extern int dlm_config_init(void);
+extern void dlm_config_exit(void);
+
+#endif /* __CONFIG_DOT_H__ */
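# Annotation (not part of the original patch): device.c below is the
# userland interface, normally driven via libdlm rather than directly. A
# rough sketch of the consumer side of dlm_read()/dlm_ioctl(): FIONREAD
# reports how many ASTs are queued (messages, not bytes) and each read()
# returns exactly one struct dlm_lock_result. How "fd" was obtained
# (DLM_CREATE_LOCKSPACE on the control device, then opening the resulting
# misc device) is elided, and the header providing struct dlm_lock_result
# is an assumption.

#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dlm_device.h>	/* struct dlm_lock_result; header name assumed */

void drain_asts(int fd)	/* hypothetical helper */
{
	struct dlm_lock_result result;
	int pending = 0;

	/* Number of queued ASTs */
	if (ioctl(fd, FIONREAD, &pending) < 0)
		return;

	while (pending-- > 0) {
		if (read(fd, &result, sizeof(result)) != sizeof(result))
			break;
		/* dispatch via result.astaddr / result.astparam here */
	}
}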
895diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
896--- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 897+++ linux-patched/cluster/dlm/device.c 2004-11-03 11:31:56.000000000 +0800
c783755a 898@@ -0,0 +1,1212 @@
c1c6733f
AM
899+/******************************************************************************
900+*******************************************************************************
901+**
902+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
903+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
904+**
905+** This copyrighted material is made available to anyone wishing to use,
906+** modify, copy, or redistribute it subject to the terms and conditions
907+** of the GNU General Public License v.2.
908+**
909+*******************************************************************************
910+******************************************************************************/
911+
912+/*
913+ * device.c
914+ *
915+ * This is the userland interface to the DLM.
916+ *
917+ * The locking is done via a misc char device (find the
918+ * registered minor number in /proc/misc).
919+ *
920+ * User code should not use this interface directly but
921+ * call the library routines in libdlm.a instead.
922+ *
923+ */
924+
925+#include <linux/miscdevice.h>
926+#include <linux/init.h>
927+#include <linux/wait.h>
928+#include <linux/module.h>
929+#include <linux/file.h>
930+#include <linux/fs.h>
931+#include <linux/poll.h>
932+#include <linux/signal.h>
933+#include <linux/spinlock.h>
934+#include <asm/ioctls.h>
935+
936+#include "dlm_internal.h"
937+#include "device.h"
938+
b7b72b66 939+extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int);
c1c6733f
AM
940+static struct file_operations _dlm_fops;
941+static const char *name_prefix="dlm";
942+static struct list_head user_ls_list;
b7b72b66 943+static struct semaphore user_ls_lock;
c1c6733f
AM
944+
945+/* Flags in li_flags */
946+#define LI_FLAG_COMPLETE 1
947+#define LI_FLAG_FIRSTLOCK 2
948+
c783755a
AM
949+#define LOCKINFO_MAGIC 0x53595324
950+
c1c6733f 951+struct lock_info {
c783755a 952+ uint32_t li_magic;
c1c6733f
AM
953+ uint8_t li_cmd;
954+ struct dlm_lksb li_lksb;
955+ wait_queue_head_t li_waitq;
956+ unsigned long li_flags;
b7b72b66
AM
957+ void __user *li_castparam;
958+ void __user *li_castaddr;
959+ void __user *li_bastparam;
c1c6733f 960+ void __user *li_bastaddr;
b7b72b66
AM
961+ void __user *li_pend_bastparam;
962+ void __user *li_pend_bastaddr;
963+ void __user *li_user_lvbptr;
964+ struct list_head li_ownerqueue;
c1c6733f
AM
965+ struct file_info *li_file;
966+ struct dlm_lksb __user *li_user_lksb;
967+ struct semaphore li_firstlock;
968+ struct dlm_queryinfo *li_queryinfo;
969+ struct dlm_queryinfo __user *li_user_queryinfo;
970+};
971+
972+/* A queued AST no less */
973+struct ast_info {
974+ struct dlm_lock_result result;
975+ struct dlm_queryinfo *queryinfo;
976+ struct dlm_queryinfo __user *user_queryinfo;
977+ struct list_head list;
b7b72b66
AM
978+ void __user *user_lvbptr;
979+ uint32_t ast_reason; /* AST_COMP or AST_BAST from dlm_internal.h */
c1c6733f
AM
980+};
981+
982+/* One of these per userland lockspace */
983+struct user_ls {
984+ void *ls_lockspace;
985+ atomic_t ls_refcnt;
986+ long ls_flags; /* bit 1 means LS has been deleted */
987+
988+ /* Passed into misc_register() */
989+ struct miscdevice ls_miscinfo;
990+ struct list_head ls_list;
991+};
992+
993+/* misc_device info for the control device */
994+static struct miscdevice ctl_device;
995+
996+/*
997+ * Stuff we hang off the file struct.
998+ * The first two are to cope with unlocking all the
999+ * locks help by a process when it dies.
1000+ */
1001+struct file_info {
1002+ struct list_head fi_lkb_list; /* List of active lkbs */
1003+ spinlock_t fi_lkb_lock;
1004+ struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
1005+ spinlock_t fi_ast_lock;
1006+ wait_queue_head_t fi_wait;
1007+ struct user_ls *fi_ls;
1008+ atomic_t fi_refcnt; /* Number of users */
1009+ unsigned long fi_flags; /* Bit 1 means the device is open */
1010+};
1011+
1012+
1013+/* get and put ops for file_info.
1014+ Actually I don't really like "get" and "put", but everyone
1015+ else seems to use them and I can't think of anything
1016+ nicer at the moment */
1017+static void get_file_info(struct file_info *f)
1018+{
1019+ atomic_inc(&f->fi_refcnt);
1020+}
1021+
1022+static void put_file_info(struct file_info *f)
1023+{
1024+ if (atomic_dec_and_test(&f->fi_refcnt))
1025+ kfree(f);
1026+}
1027+
b7b72b66
AM
1028+static void release_lockinfo(struct lock_info *li)
1029+{
1030+ put_file_info(li->li_file);
1031+ if (li->li_lksb.sb_lvbptr && li->li_cmd != DLM_USER_QUERY)
1032+ kfree(li->li_lksb.sb_lvbptr);
1033+ kfree(li);
1034+}
1035+
1036+static struct user_ls *__find_lockspace(int minor)
c1c6733f
AM
1037+{
1038+ struct user_ls *lsinfo;
1039+
1040+ list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
1041+
1042+ if (lsinfo->ls_miscinfo.minor == minor)
1043+ return lsinfo;
1044+ }
1045+ return NULL;
1046+}
1047+
b7b72b66
AM
1048+/* Find a lockspace struct given the device minor number */
1049+static struct user_ls *find_lockspace(int minor)
1050+{
1051+ struct user_ls *lsinfo;
1052+
1053+ down(&user_ls_lock);
1054+ lsinfo = __find_lockspace(minor);
1055+ up(&user_ls_lock);
1056+
1057+ return lsinfo;
1058+}
1059+
c1c6733f
AM
1060+static void add_lockspace_to_list(struct user_ls *lsinfo)
1061+{
b7b72b66 1062+ down(&user_ls_lock);
c1c6733f 1063+ list_add(&lsinfo->ls_list, &user_ls_list);
b7b72b66 1064+ up(&user_ls_lock);
c1c6733f
AM
1065+}
1066+
1067+/* Register a lockspace with the DLM and create a misc
1068+ device for userland to access it */
1069+static int register_lockspace(char *name, struct user_ls **ls)
1070+{
1071+ struct user_ls *newls;
1072+ int status;
1073+ int namelen;
1074+
1075+ namelen = strlen(name)+strlen(name_prefix)+2;
1076+
1077+ newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
1078+ if (!newls)
1079+ return -ENOMEM;
1080+ memset(newls, 0, sizeof(struct user_ls));
1081+
1082+ newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
1083+ if (!newls->ls_miscinfo.name) {
1084+ kfree(newls);
1085+ return -ENOMEM;
1086+ }
c783755a
AM
1087+ status = dlm_new_lockspace(name, strlen(name),
1088+ &newls->ls_lockspace, 0);
c1c6733f
AM
1089+
1090+ if (status != 0) {
1091+ kfree(newls->ls_miscinfo.name);
1092+ kfree(newls);
1093+ return status;
1094+ }
1095+
c783755a
AM
1096+ snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
1097+
c1c6733f
AM
1098+ newls->ls_miscinfo.fops = &_dlm_fops;
1099+ newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1100+
1101+ status = misc_register(&newls->ls_miscinfo);
1102+ if (status) {
1103+ log_print("failed to register misc device for %s", name);
1104+ dlm_release_lockspace(newls->ls_lockspace, 0);
1105+ kfree(newls->ls_miscinfo.name);
1106+ kfree(newls);
1107+ return status;
1108+ }
1109+
1110+
1111+ add_lockspace_to_list(newls);
1112+ *ls = newls;
1113+ return 0;
1114+}
1115+
b7b72b66 1116+/* Called with the user_ls_lock semaphore held */
c1c6733f
AM
1117+static int unregister_lockspace(struct user_ls *lsinfo, int force)
1118+{
1119+ int status;
1120+
1121+ status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1122+ if (status)
1123+ return status;
1124+
1125+ status = misc_deregister(&lsinfo->ls_miscinfo);
1126+ if (status)
1127+ return status;
1128+
1129+ list_del(&lsinfo->ls_list);
b7b72b66
AM
1130+ set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1131+ lsinfo->ls_lockspace = NULL;
1132+ if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
1133+ kfree(lsinfo->ls_miscinfo.name);
1134+ kfree(lsinfo);
1135+ }
c1c6733f
AM
1136+
1137+ return 0;
1138+}
1139+
1140+/* Add it to userland's AST queue */
b7b72b66 1141+static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam, uint32_t reason)
c1c6733f
AM
1142+{
1143+ struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1144+ if (!ast)
1145+ return;
1146+
b7b72b66 1147+ ast->result.astparam = astparam;
c1c6733f
AM
1148+ ast->result.astaddr = astaddr;
1149+ ast->result.user_lksb = li->li_user_lksb;
1150+ ast->result.cmd = li->li_cmd;
b7b72b66
AM
1151+ ast->user_lvbptr = li->li_user_lvbptr;
1152+ ast->ast_reason = reason;
c1c6733f
AM
1153+ memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1154+
1155+ /* These two will both be NULL for anything other than queries */
1156+ ast->queryinfo = li->li_queryinfo;
1157+ ast->user_queryinfo = li->li_user_queryinfo;
1158+
1159+ spin_lock(&li->li_file->fi_ast_lock);
1160+ list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1161+ spin_unlock(&li->li_file->fi_ast_lock);
1162+ wake_up_interruptible(&li->li_file->fi_wait);
1163+}
1164+
1165+static void bast_routine(void *param, int mode)
1166+{
1167+ struct lock_info *li = param;
1168+
b7b72b66
AM
1169+ if (li && li->li_bastaddr) {
1170+ add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, AST_BAST);
c1c6733f
AM
1171+ }
1172+}
1173+
1174+/*
1175+ * This is the kernel's AST routine.
1176+ * All lock, unlock & query operations complete here.
1177+ * The only syncronous ops are those done during device close.
1178+ */
1179+static void ast_routine(void *param)
1180+{
1181+ struct lock_info *li = param;
1182+
1183+ /* Param may be NULL if a persistent lock is unlocked by someone else */
b7b72b66 1184+ if (!li)
c1c6733f
AM
1185+ return;
1186+
b7b72b66
AM
1187+ /* If this is a succesful conversion then activate the blocking ast
1188+ * args from the conversion request */
1189+ if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1190+ li->li_lksb.sb_status == 0) {
1191+
1192+ li->li_bastparam = li->li_pend_bastparam;
1193+ li->li_bastaddr = li->li_pend_bastaddr;
1194+ li->li_pend_bastaddr = NULL;
1195+ }
1196+
c1c6733f 1197+ /* If it's an async request then post data to the user's AST queue. */
b7b72b66 1198+ if (li->li_castaddr) {
c1c6733f
AM
1199+
1200+ /* Only queue AST if the device is still open */
1201+ if (test_bit(1, &li->li_file->fi_flags))
b7b72b66 1202+ add_to_astqueue(li, li->li_castaddr, li->li_castparam, AST_COMP);
c1c6733f
AM
1203+
1204+ /* If it's a new lock operation that failed, then
1205+ * remove it from the owner queue and free the
1206+ * lock_info. The DLM will not free the LKB until this
1207+ * AST has completed.
1208+ */
1209+ if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1210+ li->li_lksb.sb_status != 0) {
b7b72b66 1211+ struct dlm_lkb *lkb;
c1c6733f
AM
1212+
1213+ /* Wait till dlm_lock() has finished */
1214+ down(&li->li_firstlock);
b7b72b66
AM
1215+ up(&li->li_firstlock);
1216+
1217+ /* If the LKB has been freed then we need to tidy up too */
c1c6733f 1218+ lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
b7b72b66 1219+ if (!lkb) {
c1c6733f 1220+ spin_lock(&li->li_file->fi_lkb_lock);
b7b72b66 1221+ list_del(&li->li_ownerqueue);
c1c6733f 1222+ spin_unlock(&li->li_file->fi_lkb_lock);
b7b72b66
AM
1223+
1224+ release_lockinfo(li);
c1c6733f 1225+ }
c1c6733f
AM
1226+ return;
1227+ }
1228+ /* Free unlocks & queries */
1229+ if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1230+ li->li_cmd == DLM_USER_QUERY) {
b7b72b66 1231+ release_lockinfo(li);
c1c6733f
AM
1232+ }
1233+ }
1234+ else {
b7b72b66 1235+ /* Synchronous request, just wake up the caller */
c1c6733f
AM
1236+ set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1237+ wake_up_interruptible(&li->li_waitq);
1238+ }
1239+}
1240+
1241+/*
1242+ * Wait for the lock op to complete and return the status.
1243+ */
1244+static int wait_for_ast(struct lock_info *li)
1245+{
1246+ /* Wait for the AST routine to complete */
1247+ set_task_state(current, TASK_INTERRUPTIBLE);
1248+ while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1249+ schedule();
1250+
1251+ set_task_state(current, TASK_RUNNING);
1252+
1253+ return li->li_lksb.sb_status;
1254+}
1255+
1256+
1257+/* Open on control device */
1258+static int dlm_ctl_open(struct inode *inode, struct file *file)
1259+{
1260+ return 0;
1261+}
1262+
1263+/* Close on control device */
1264+static int dlm_ctl_close(struct inode *inode, struct file *file)
1265+{
1266+ return 0;
1267+}
1268+
1269+/* Open on lockspace device */
1270+static int dlm_open(struct inode *inode, struct file *file)
1271+{
1272+ struct file_info *f;
1273+ struct user_ls *lsinfo;
1274+
1275+ lsinfo = find_lockspace(iminor(inode));
1276+ if (!lsinfo)
1277+ return -ENOENT;
1278+
1279+ f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1280+ if (!f)
1281+ return -ENOMEM;
1282+
1283+ atomic_inc(&lsinfo->ls_refcnt);
1284+ INIT_LIST_HEAD(&f->fi_lkb_list);
1285+ INIT_LIST_HEAD(&f->fi_ast_list);
1286+ spin_lock_init(&f->fi_ast_lock);
1287+ spin_lock_init(&f->fi_lkb_lock);
1288+ init_waitqueue_head(&f->fi_wait);
1289+ f->fi_ls = lsinfo;
1290+ atomic_set(&f->fi_refcnt, 1);
1291+ set_bit(1, &f->fi_flags);
1292+
1293+ file->private_data = f;
1294+
1295+ return 0;
1296+}
1297+
1298+/* Check the user's version matches ours */
1299+static int check_version(struct dlm_lock_params *params)
1300+{
1301+ if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1302+ (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1303+ params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1304+
1305+ log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1306+ params->version[0],
1307+ params->version[1],
1308+ params->version[2],
1309+ DLM_DEVICE_VERSION_MAJOR,
1310+ DLM_DEVICE_VERSION_MINOR,
1311+ DLM_DEVICE_VERSION_PATCH);
1312+ return -EINVAL;
1313+ }
1314+ return 0;
1315+}
1316+
1317+/* Close on lockspace device */
1318+static int dlm_close(struct inode *inode, struct file *file)
1319+{
1320+ struct file_info *f = file->private_data;
1321+ struct lock_info li;
b7b72b66 1322+ struct lock_info *old_li, *safe;
c1c6733f
AM
1323+ sigset_t tmpsig;
1324+ sigset_t allsigs;
c1c6733f
AM
1325+ struct user_ls *lsinfo;
1326+ DECLARE_WAITQUEUE(wq, current);
1327+
1328+ lsinfo = find_lockspace(iminor(inode));
1329+ if (!lsinfo)
1330+ return -ENOENT;
1331+
1332+ /* Mark this closed so that ASTs will not be delivered any more */
1333+ clear_bit(1, &f->fi_flags);
1334+
1335+ /* Block signals while we are doing this */
1336+ sigfillset(&allsigs);
1337+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1338+
1339+ /* We use our own lock_info struct here, so that any
1340+ * outstanding "real" ASTs will be delivered with the
1341+ * corresponding "real" params, thus freeing the lock_info
1342+ * that belongs the lock. This catches the corner case where
1343+ * a lock is BUSY when we try to unlock it here
1344+ */
1345+ memset(&li, 0, sizeof(li));
1346+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1347+ init_waitqueue_head(&li.li_waitq);
1348+ add_wait_queue(&li.li_waitq, &wq);
1349+
1350+ /*
1351+ * Free any outstanding locks, they are on the
1352+ * list in LIFO order so there should be no problems
1353+ * about unlocking parents before children.
1354+ * Although we don't remove the lkbs from the list here
1355+ * (what would be the point?), foreach_safe is needed
1356+ * because the lkbs are freed during dlm_unlock operations
1357+ */
b7b72b66 1358+ list_for_each_entry_safe(old_li, safe, &f->fi_lkb_list, li_ownerqueue) {
c1c6733f
AM
1359+ int status;
1360+ int lock_status;
1361+ int flags = 0;
b7b72b66 1362+ struct dlm_lkb *lkb;
c1c6733f 1363+
b7b72b66 1364+ lkb = dlm_get_lkb(f->fi_ls->ls_lockspace, old_li->li_lksb.sb_lkid);
c1c6733f
AM
1365+
1366+ /* Don't unlock persistent locks */
1367+ if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
b7b72b66 1368+ list_del(&old_li->li_ownerqueue);
c1c6733f 1369+
c783755a
AM
1370+ /* Update master copy */
1371+ if (lkb->lkb_resource->res_nodeid) {
1372+ li.li_lksb.sb_lkid = lkb->lkb_id;
1373+ status = dlm_lock(f->fi_ls->ls_lockspace,
1374+ lkb->lkb_grmode, &li.li_lksb,
1375+ DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
1376+ NULL, 0, 0, ast_routine, &li,
1377+ NULL, NULL);
1378+ if (status == 0)
1379+ wait_for_ast(&li);
1380+ }
1381+ lkb->lkb_flags |= GDLM_LKFLG_ORPHAN;
1382+
c1c6733f
AM
1383+ /* But tidy our references in it */
1384+ kfree(old_li);
1385+ lkb->lkb_astparam = (long)NULL;
1386+ put_file_info(f);
c783755a 1387+
c1c6733f
AM
1388+ continue;
1389+ }
1390+
1391+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1392+
1393+ /* If it's not granted then cancel the request.
1394+ * If the lock was WAITING then it will be dropped,
1395+ * if it was converting then it will be reverted to GRANTED,
1396+ * then we will unlock it.
1397+ */
1398+ lock_status = lkb->lkb_status;
1399+
1400+ if (lock_status != GDLM_LKSTS_GRANTED)
1401+ flags = DLM_LKF_CANCEL;
1402+
c783755a
AM
1403+ if (lkb->lkb_grmode >= DLM_LOCK_PW)
1404+ flags |= DLM_LKF_IVVALBLK;
1405+
c1c6733f
AM
1406+ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1407+
1408+ /* Must wait for it to complete as the next lock could be its
1409+ * parent */
1410+ if (status == 0)
1411+ wait_for_ast(&li);
1412+
1413+ /* If it was waiting for a conversion, it will
1414+ now be granted so we can unlock it properly */
1415+ if (lock_status == GDLM_LKSTS_CONVERT) {
c783755a 1416+ flags &= ~DLM_LKF_CANCEL;
c1c6733f 1417+ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
c783755a 1418+ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
c1c6733f
AM
1419+
1420+ if (status == 0)
1421+ wait_for_ast(&li);
1422+ }
1423+ /* Unlock suceeded, free the lock_info struct. */
1424+ if (status == 0) {
1425+ kfree(old_li);
1426+ put_file_info(f);
1427+ }
1428+ }
1429+
1430+ remove_wait_queue(&li.li_waitq, &wq);
1431+
1432+ /* If this is the last reference, and the lockspace has been deleted
b7b72b66 1433+ then free the struct */
c1c6733f 1434+ if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
b7b72b66 1435+ kfree(lsinfo->ls_miscinfo.name);
c1c6733f
AM
1436+ kfree(lsinfo);
1437+ }
1438+
1439+ /* Restore signals */
1440+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1441+ recalc_sigpending();
1442+
1443+ return 0;
1444+}
1445+
1446+/*
1447+ * ioctls to create/remove lockspaces, and check how many
1448+ * outstanding ASTs there are against a particular LS.
1449+ */
1450+static int dlm_ioctl(struct inode *inode, struct file *file,
1451+ uint command, ulong u)
1452+{
1453+ struct file_info *fi = file->private_data;
1454+ int status = -EINVAL;
1455+ int count;
1456+ struct list_head *tmp_list;
1457+
1458+ switch (command) {
1459+
1460+ /* Are there any ASTs for us to read?
1461+ * Warning, this returns the number of messages (ASTs)
1462+ * in the queue, NOT the number of bytes to read
1463+ */
1464+ case FIONREAD:
1465+ count = 0;
1466+ spin_lock(&fi->fi_ast_lock);
1467+ list_for_each(tmp_list, &fi->fi_ast_list)
1468+ count++;
1469+ spin_unlock(&fi->fi_ast_lock);
1470+ status = put_user(count, (int *)u);
1471+ break;
1472+
1473+ default:
1474+ return -ENOTTY;
1475+ }
1476+
1477+ return status;
1478+}
1479+
1480+/*
1481+ * ioctls to create/remove lockspaces.
1482+ */
1483+static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1484+ uint command, ulong u)
1485+{
1486+ int status = -EINVAL;
1487+ char ls_name[MAX_LS_NAME_LEN];
1488+ struct user_ls *lsinfo;
1489+ int force = 0;
1490+
1491+ switch (command) {
1492+ case DLM_CREATE_LOCKSPACE:
1493+ if (!capable(CAP_SYS_ADMIN))
1494+ return -EPERM;
1495+
1496+ if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1497+ return -EFAULT;
1498+ status = register_lockspace(ls_name, &lsinfo);
1499+
1500+ /* If it succeeded then return the minor number */
1501+ if (status == 0)
1502+ status = lsinfo->ls_miscinfo.minor;
1503+ break;
1504+
1505+ case DLM_FORCE_RELEASE_LOCKSPACE:
1506+ force = 2;
1507+
1508+ case DLM_RELEASE_LOCKSPACE:
1509+ if (!capable(CAP_SYS_ADMIN))
1510+ return -EPERM;
1511+
b7b72b66
AM
1512+ down(&user_ls_lock);
1513+ lsinfo = __find_lockspace(u);
1514+ if (!lsinfo) {
1515+ up(&user_ls_lock);
c1c6733f 1516+ return -EINVAL;
b7b72b66
AM
1517+ }
1518+
c1c6733f 1519+ status = unregister_lockspace(lsinfo, force);
b7b72b66 1520+ up(&user_ls_lock);
c1c6733f
AM
1521+ break;
1522+
1523+ default:
1524+ return -ENOTTY;
1525+ }
1526+
1527+ return status;
1528+}
1529+
1530+/* Deal with the messy stuff of copying a web of structs
1531+ from kernel space to userspace */
1532+static int copy_query_result(struct ast_info *ast)
1533+{
1534+ int status = -EFAULT;
1535+ struct dlm_queryinfo qi;
1536+
1537+ /* Get the pointers to userspace structs */
1538+ if (copy_from_user(&qi, ast->user_queryinfo,
1539+ sizeof(struct dlm_queryinfo)))
1540+ goto copy_out;
1541+
c1c6733f
AM
1542+ if (put_user(ast->queryinfo->gqi_lockcount,
1543+ &ast->user_queryinfo->gqi_lockcount))
1544+ goto copy_out;
1545+
1546+ if (qi.gqi_resinfo) {
1547+ if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1548+ sizeof(struct dlm_resinfo)))
1549+ goto copy_out;
1550+ }
1551+
1552+ if (qi.gqi_lockinfo) {
1553+ if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1554+ sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1555+ goto copy_out;
1556+ }
1557+
1558+ status = 0;
1559+
1560+ if (ast->queryinfo->gqi_lockinfo)
1561+ kfree(ast->queryinfo->gqi_lockinfo);
1562+
1563+ if (ast->queryinfo->gqi_resinfo)
1564+ kfree(ast->queryinfo->gqi_resinfo);
1565+
1566+ kfree(ast->queryinfo);
1567+
1568+ copy_out:
1569+ return status;
1570+}
1571+
1572+/* Read call, might block if no ASTs are waiting.
1573+ * It will only ever return one message at a time, regardless
1574+ * of how many are pending.
1575+ */
1576+static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1577+{
1578+ struct file_info *fi = file->private_data;
1579+ struct ast_info *ast;
1580+ int ret;
1581+ DECLARE_WAITQUEUE(wait, current);
1582+
1583+ if (count < sizeof(struct dlm_lock_result))
1584+ return -EINVAL;
1585+
1586+ spin_lock(&fi->fi_ast_lock);
1587+ if (list_empty(&fi->fi_ast_list)) {
1588+
1589+ /* No waiting ASTs.
1590+ * Return EOF if the lockspace been deleted.
1591+ */
1592+ if (test_bit(1, &fi->fi_ls->ls_flags))
1593+ return 0;
1594+
1595+ if (file->f_flags & O_NONBLOCK) {
1596+ spin_unlock(&fi->fi_ast_lock);
1597+ return -EAGAIN;
1598+ }
1599+
1600+ add_wait_queue(&fi->fi_wait, &wait);
1601+
1602+ repeat:
1603+ set_current_state(TASK_INTERRUPTIBLE);
1604+ if (list_empty(&fi->fi_ast_list) &&
1605+ !signal_pending(current)) {
1606+
1607+ spin_unlock(&fi->fi_ast_lock);
1608+ schedule();
1609+ spin_lock(&fi->fi_ast_lock);
1610+ goto repeat;
1611+ }
1612+
1613+ current->state = TASK_RUNNING;
1614+ remove_wait_queue(&fi->fi_wait, &wait);
1615+
1616+ if (signal_pending(current)) {
1617+ spin_unlock(&fi->fi_ast_lock);
1618+ return -ERESTARTSYS;
1619+ }
1620+ }
1621+
1622+ ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1623+ list_del(&ast->list);
1624+ spin_unlock(&fi->fi_ast_lock);
1625+
1626+ ret = sizeof(struct dlm_lock_result);
1627+ if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1628+ ret = -EFAULT;
1629+
b7b72b66
AM
1630+ if (ast->ast_reason == AST_COMP &&
1631+ ast->result.cmd == DLM_USER_LOCK && ast->user_lvbptr) {
1632+ if (copy_to_user(ast->user_lvbptr, ast->result.lksb.sb_lvbptr, DLM_LVB_LEN))
1633+ ret = -EFAULT;
1634+ }
1635+
c1c6733f
AM
1636+ /* If it was a query then copy the result block back here */
1637+ if (ast->queryinfo) {
1638+ int status = copy_query_result(ast);
1639+ if (status)
1640+ ret = status;
1641+ }
1642+
1643+ kfree(ast);
1644+ return ret;
1645+}
1646+
1647+static unsigned int dlm_poll(struct file *file, poll_table *wait)
1648+{
1649+ struct file_info *fi = file->private_data;
1650+
1651+ poll_wait(file, &fi->fi_wait, wait);
1652+
1653+ spin_lock(&fi->fi_ast_lock);
1654+ if (!list_empty(&fi->fi_ast_list)) {
1655+ spin_unlock(&fi->fi_ast_lock);
1656+ return POLLIN | POLLRDNORM;
1657+ }
1658+
1659+ spin_unlock(&fi->fi_ast_lock);
1660+ return 0;
1661+}
1662+
1663+static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1664+{
1665+ struct lock_info *li;
1666+ int status;
1667+
b7b72b66
AM
1668+ if (!kparams->castaddr)
1669+ return -EINVAL;
1670+
1671+ if (!kparams->lksb)
1672+ return -EINVAL;
1673+
c1c6733f
AM
1674+ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1675+ if (!li)
1676+ return -ENOMEM;
1677+
1678+ get_file_info(fi);
1679+ li->li_user_lksb = kparams->lksb;
b7b72b66 1680+ li->li_bastparam = kparams->bastparam;
c1c6733f 1681+ li->li_bastaddr = kparams->bastaddr;
b7b72b66
AM
1682+ li->li_castparam = kparams->castparam;
1683+ li->li_castaddr = kparams->castaddr;
c1c6733f
AM
1684+ li->li_file = fi;
1685+ li->li_flags = 0;
1686+ li->li_cmd = kparams->cmd;
1687+ clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1688+
1689+ if (copy_from_user(&li->li_lksb, kparams->lksb,
1690+ sizeof(struct dlm_lksb))) {
1691+ kfree(li);
1692+ return -EFAULT;
1693+ }
1694+ li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1695+
1696+ /* Allocate query structs */
1697+ status = -ENOMEM;
1698+ li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1699+ if (!li->li_queryinfo)
1700+ goto out1;
1701+
1702+ /* Mainly to get gqi_lock buffer size */
1703+ if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1704+ sizeof(struct dlm_queryinfo))) {
1705+ status = -EFAULT;
1706+ goto out1;
1707+ }
1708+
1709+ /* Overwrite userspace pointers we just copied with kernel space ones */
1710+ if (li->li_queryinfo->gqi_resinfo) {
1711+ li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1712+ if (!li->li_queryinfo->gqi_resinfo)
1713+ goto out1;
1714+ }
1715+ if (li->li_queryinfo->gqi_lockinfo) {
1716+ li->li_queryinfo->gqi_lockinfo =
1717+ kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1718+ GFP_KERNEL);
1719+ if (!li->li_queryinfo->gqi_lockinfo)
1720+ goto out2;
1721+ }
1722+
1723+ li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1724+
1725+ return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1726+ kparams->flags, /* query */
1727+ li->li_queryinfo,
1728+ ast_routine, li);
1729+
1730+ out2:
1731+ kfree(li->li_queryinfo);
1732+
1733+ out1:
1734+ kfree(li);
1735+ return status;
1736+}
1737+
c783755a
AM
1738+static struct lock_info *allocate_lockinfo(struct file_info *fi, struct dlm_lock_params *kparams)
1739+{
1740+ struct lock_info *li;
1741+
1742+ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1743+ if (li) {
1744+ li->li_magic = LOCKINFO_MAGIC;
1745+ li->li_file = fi;
1746+ li->li_cmd = kparams->cmd;
1747+ li->li_queryinfo = NULL;
1748+ li->li_flags = 0;
1749+ li->li_pend_bastparam = NULL;
1750+ li->li_pend_bastaddr = NULL;
1751+ li->li_lksb.sb_lvbptr = NULL;
1752+ li->li_bastaddr = kparams->bastaddr;
1753+ li->li_bastparam = kparams->bastparam;
1754+
1755+ get_file_info(fi);
1756+ }
1757+ return li;
1758+}
1759+
c1c6733f
AM
1760+static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1761+ const char *buffer)
1762+{
1763+ struct lock_info *li;
1764+ int status;
1765+ char name[DLM_RESNAME_MAXLEN];
b7b72b66 1766+ void *lvbptr;
c1c6733f
AM
1767+
1768+ /*
1769+ * Validate things that we need to have correct.
1770+ */
b7b72b66 1771+ if (!kparams->castaddr)
c1c6733f
AM
1772+ return -EINVAL;
1773+
1774+ if (!kparams->lksb)
1775+ return -EINVAL;
1776+
b7b72b66 1777+ if (!access_ok(VERIFY_WRITE, kparams->lksb, sizeof(struct dlm_lksb)))
c1c6733f 1778+ return -EFAULT;
c1c6733f 1779+
c783755a
AM
1780+ /* Persistent child locks are not available yet */
1781+ if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
1782+ return -EINVAL;
1783+
1784+ /* For conversions, the lock will already have a lock_info
c1c6733f
AM
1785+ block squirelled away in astparam */
1786+ if (kparams->flags & DLM_LKF_CONVERT) {
b7b72b66 1787+ struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
c1c6733f
AM
1788+ if (!lkb) {
1789+ return -EINVAL;
1790+ }
c1c6733f 1791+
b7b72b66 1792+ li = (struct lock_info *)lkb->lkb_astparam;
c783755a
AM
1793+
1794+ /* li may be NULL if the lock was PERSISTENT and the process went
1795+ away, so we need to allocate a new one */
1796+ if (!li) {
1797+ li = allocate_lockinfo(fi, kparams);
1798+ if (li) {
1799+ spin_lock(&fi->fi_lkb_lock);
1800+ list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1801+ spin_unlock(&fi->fi_lkb_lock);
1802+ }
1803+ else {
1804+ return -ENOMEM;
1805+ }
1806+ }
1807+
1808+ if (li->li_magic != LOCKINFO_MAGIC)
1809+ return -EINVAL;
1810+
1811+ /* For conversions don't overwrite the current blocking AST
1812+ info so that:
1813+ a) if a blocking AST fires before the conversion is queued
1814+ it runs the current handler
1815+ b) if the conversion is cancelled, the original blocking AST
1816+ declaration is active
1817+ The pend_ info is made active when the conversion
1818+ completes.
1819+ */
1820+ li->li_pend_bastaddr = kparams->bastaddr;
1821+ li->li_pend_bastparam = kparams->bastparam;
1822+ }
1823+ else {
1824+ li = allocate_lockinfo(fi, kparams);
1825+ if (!li)
1826+ return -ENOMEM;
1827+
1828+ /* Get the lock name */
1829+ if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1830+ kparams->namelen)) {
1831+ return -EFAULT;
1832+ }
1833+
1834+ /* semaphore to allow us to complete our work before
1835+ the AST routine runs. In fact we only need (and use) this
1836+ when the initial lock fails */
1837+ init_MUTEX_LOCKED(&li->li_firstlock);
1838+ set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1839+ }
1840+
1841+ li->li_user_lksb = kparams->lksb;
1842+ li->li_castaddr = kparams->castaddr;
1843+ li->li_castparam = kparams->castparam;
1844+
1845+ /* Copy the user's LKSB into kernel space,
1846+ needed for conversions & value block operations.
1847+ Save our kernel-space lvbptr first */
1848+ lvbptr = li->li_lksb.sb_lvbptr;
1849+ if (copy_from_user(&li->li_lksb, kparams->lksb, sizeof(struct dlm_lksb))) {
1850+ status = -EFAULT;
1851+ goto out_err;
1852+ }
1853+ /* Store new userland LVBptr and restore kernel one */
1854+ li->li_user_lvbptr = li->li_lksb.sb_lvbptr;
1855+ li->li_lksb.sb_lvbptr = lvbptr;
1856+
1857+ /* Copy in the value block */
1858+ if (kparams->flags & DLM_LKF_VALBLK) {
1859+ if (!li->li_lksb.sb_lvbptr) {
1860+ li->li_lksb.sb_lvbptr = kmalloc(DLM_LVB_LEN, GFP_KERNEL);
1861+ if (!li->li_lksb.sb_lvbptr) {
1862+ status = -ENOMEM;
1863+ goto out_err;
1864+ }
1865+ }
1866+
1867+ if (copy_from_user(li->li_lksb.sb_lvbptr, kparams->lksb->sb_lvbptr,
1868+ DLM_LVB_LEN)) {
1869+ status = -EFAULT;
1870+ goto out_err;
1871+ }
1872+ }
1873+ else {
1874+ li->li_user_lvbptr = NULL;
1875+ }
1876+
1877+ /* Lock it ... */
1878+ status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1879+ kparams->flags, name, kparams->namelen,
1880+ kparams->parent,
1881+ ast_routine,
1882+ li,
1883+ (li->li_pend_bastaddr || li->li_bastaddr) ?
1884+ bast_routine : NULL,
1885+ kparams->range.ra_end ? &kparams->range : NULL);
1886+
1887+ /* If it succeeded (this far) with a new lock then keep track of
1888+ it on the file's lkb list */
1889+ if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1890+
1891+ spin_lock(&fi->fi_lkb_lock);
1892+ list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1893+ spin_unlock(&fi->fi_lkb_lock);
1894+
1895+ up(&li->li_firstlock);
1896+
1897+ /* Copy the lkid back to userspace in case they want to cancel.
1898+ This address has already been tested so /should/ be OK, if not:
1899+ tough - we've taken the lock! */
1900+ copy_to_user(&kparams->lksb->sb_lkid,
1901+ &li->li_lksb.sb_lkid,
1902+ sizeof(li->li_lksb.sb_lkid));
1903+ }
1904+
1905+ return status;
1906+
1907+ out_err:
1908+ if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
1909+
1910+ release_lockinfo(li);
1911+ }
1912+ return status;
1913+
1914+}
1915+
1916+static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1917+{
1918+ struct lock_info *li;
1919+ struct dlm_lkb *lkb;
1920+ int status;
1921+ int convert_cancel = 0;
1922+
1923+ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1924+ if (!lkb) {
1925+ return -EINVAL;
1926+ }
1927+
1928+ /* Cancelling a conversion doesn't remove the lock...*/
1929+ if (kparams->flags & DLM_LKF_CANCEL &&
1930+ lkb->lkb_status == GDLM_LKSTS_CONVERT) {
1931+ convert_cancel = 1;
1932+ }
1933+
1934+ li = (struct lock_info *)lkb->lkb_astparam;
1935+ if (!li) {
1936+ li = allocate_lockinfo(fi, kparams);
1937+ if (!li)
1938+ return -ENOMEM;
1939+ spin_lock(&fi->fi_lkb_lock);
1940+ list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1941+ spin_unlock(&fi->fi_lkb_lock);
1942+ }
1943+
1944+ if (li->li_magic != LOCKINFO_MAGIC)
1945+ return -EINVAL;
1946+
1947+ li->li_user_lksb = kparams->lksb;
1948+ li->li_castparam = kparams->castparam;
1949+ li->li_cmd = kparams->cmd;
1950+
1951+ /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
1952+ the existing li_castaddr as that's the completion routine for
1953+ unlocks. dlm_unlock_wait() specifies a new AST routine to be
1954+ executed when the unlock completes. */
1955+ if (kparams->castaddr)
1956+ li->li_castaddr = kparams->castaddr;
1957+
1958+ /* Have to do it here because the lkb may not exist after
1959+ * dlm_unlock() */
1960+ if (!convert_cancel) {
1961+ spin_lock(&fi->fi_lkb_lock);
1962+ list_del(&li->li_ownerqueue);
1963+ spin_unlock(&fi->fi_lkb_lock);
1964+ }
1965+
1966+ /* Use existing lksb & astparams */
1967+ status = dlm_unlock(fi->fi_ls->ls_lockspace,
1968+ kparams->lkid,
1969+ kparams->flags, &li->li_lksb, li);
1970+ if (status && !convert_cancel) {
1971+ /* It failed, put it back on the list */
1972+ spin_lock(&fi->fi_lkb_lock);
1973+ list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1974+ spin_unlock(&fi->fi_lkb_lock);
1975+ }
1976+
1977+ return status;
1978+}
1979+
1980+/* Write call, submit a locking request */
1981+static ssize_t dlm_write(struct file *file, const char __user *buffer,
1982+ size_t count, loff_t *ppos)
1983+{
1984+ struct file_info *fi = file->private_data;
1985+ struct dlm_lock_params kparams;
1986+ sigset_t tmpsig;
1987+ sigset_t allsigs;
1988+ int status;
1989+
1990+ if (count < sizeof(kparams)-1) /* -1 because lock name is optional */
1991+ return -EINVAL;
1992+
1993+ /* Has the lockspace been deleted? */
1994+ if (test_bit(1, &fi->fi_ls->ls_flags))
1995+ return -ENOENT;
1996+
1997+ /* Get the command info */
1998+ if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1999+ return -EFAULT;
2000+
2001+ if (check_version(&kparams))
2002+ return -EINVAL;
2003+
2004+ /* Block signals while we are doing this */
2005+ sigfillset(&allsigs);
2006+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
2007+
2008+ switch (kparams.cmd)
2009+ {
2010+ case DLM_USER_LOCK:
2011+ status = do_user_lock(fi, &kparams, buffer);
2012+ break;
2013+
2014+ case DLM_USER_UNLOCK:
2015+ status = do_user_unlock(fi, &kparams);
2016+ break;
2017+
2018+ case DLM_USER_QUERY:
2019+ status = do_user_query(fi, &kparams);
2020+ break;
2021+
2022+ default:
2023+ status = -EINVAL;
2024+ break;
2025+ }
2026+ /* Restore signals */
2027+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
2028+ recalc_sigpending();
2029+
2030+ if (status == 0)
2031+ return count;
2032+ else
2033+ return status;
2034+}
2035+
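A minimal userspace sketch (not part of the patch) of driving the dlm_write() interface above. The dlm_lock_params field names are taken from their use in this file; the device path is made up, and the version fields expected by check_version() are omitted and would also need filling in:

/* Hypothetical example: submit a DLM_USER_LOCK request by writing a
 * struct dlm_lock_params plus the lock name to a lockspace device. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <cluster/dlm_device.h>

int request_lock(int mode, const char *name, struct dlm_lksb *lksb,
                 void *castaddr, void *castparam)
{
	char buf[sizeof(struct dlm_lock_params) + DLM_RESNAME_MAXLEN];
	struct dlm_lock_params *params = (struct dlm_lock_params *) buf;
	int fd = open("/dev/dlm_mylockspace", O_RDWR);	/* path is an assumption */

	if (fd < 0)
		return -1;

	memset(buf, 0, sizeof(buf));
	params->cmd = DLM_USER_LOCK;
	params->mode = mode;
	params->lksb = lksb;
	params->castaddr = castaddr;	/* do_user_lock() rejects a 0 castaddr */
	params->castparam = castparam;
	params->namelen = strlen(name);
	memcpy(params->name, name, params->namelen);

	/* dlm_write() requires at least sizeof(*params) - 1 bytes, since
	 * the trailing name[1] member is optional; add the name bytes */
	if (write(fd, buf, sizeof(*params) - 1 + params->namelen) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}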
2036+/* Called when the cluster is shut down uncleanly and all lockspaces
2037+ have been summarily removed */
2038+void dlm_device_free_devices(void)
2039+{
2040+ struct user_ls *tmp;
2041+ struct user_ls *lsinfo;
2042+
2043+ down(&user_ls_lock);
2044+ list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
2045+ misc_deregister(&lsinfo->ls_miscinfo);
2046+
2047+ /* Tidy up, but don't delete the lsinfo struct until
2048+ all the users have closed their devices */
2049+ list_del(&lsinfo->ls_list);
2050+ set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
2051+ lsinfo->ls_lockspace = NULL;
2052+ }
2053+ up(&user_ls_lock);
2054+}
2055+
2056+static struct file_operations _dlm_fops = {
2057+ .open = dlm_open,
2058+ .release = dlm_close,
2059+ .ioctl = dlm_ioctl,
2060+ .read = dlm_read,
2061+ .write = dlm_write,
2062+ .poll = dlm_poll,
2063+ .owner = THIS_MODULE,
2064+};
2065+
2066+static struct file_operations _dlm_ctl_fops = {
2067+ .open = dlm_ctl_open,
2068+ .release = dlm_ctl_close,
2069+ .ioctl = dlm_ctl_ioctl,
2070+ .owner = THIS_MODULE,
2071+};
2072+
2073+/*
2074+ * Create control device
2075+ */
2076+int dlm_device_init(void)
2077+{
2078+ int r;
2079+
2080+ INIT_LIST_HEAD(&user_ls_list);
2081+ init_MUTEX(&user_ls_lock);
2082+
2083+ ctl_device.name = "dlm-control";
2084+ ctl_device.fops = &_dlm_ctl_fops;
2085+ ctl_device.minor = MISC_DYNAMIC_MINOR;
2086+
2087+ r = misc_register(&ctl_device);
2088+ if (r) {
2089+ log_print("misc_register failed for DLM control device");
2090+ return r;
2091+ }
2092+
2093+ return 0;
2094+}
2095+
2096+void dlm_device_exit(void)
2097+{
2098+ misc_deregister(&ctl_device);
2099+}
2100+
2101+/*
2102+ * Overrides for Emacs so that we follow Linus's tabbing style.
2103+ * Emacs will notice this stuff at the end of the file and automatically
2104+ * adjust the settings for this buffer only. This must remain at the end
2105+ * of the file.
2106+ * ---------------------------------------------------------------------------
2107+ * Local variables:
2108+ * c-file-style: "linux"
2109+ * End:
2110+ */
2111diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
2112--- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
2113+++ linux-patched/cluster/dlm/device.h 2004-11-03 11:31:56.000000000 +0800
2114@@ -0,0 +1,19 @@
2115+/******************************************************************************
2116+*******************************************************************************
2117+**
2118+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2119+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2120+**
2121+** This copyrighted material is made available to anyone wishing to use,
2122+** modify, copy, or redistribute it subject to the terms and conditions
2123+** of the GNU General Public License v.2.
2124+**
2125+*******************************************************************************
2126+******************************************************************************/
2127+
2128+#ifndef __DEVICE_DOT_H__
2129+#define __DEVICE_DOT_H__
2130+
2131+extern void dlm_device_free_devices(void);
2132+
2133+#endif /* __DEVICE_DOT_H__ */
2134diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
2135--- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
2136+++ linux-patched/cluster/dlm/dir.c 2004-11-03 11:31:56.000000000 +0800
2137@@ -0,0 +1,471 @@
2138+/******************************************************************************
2139+*******************************************************************************
2140+**
2141+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2142+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2143+**
2144+** This copyrighted material is made available to anyone wishing to use,
2145+** modify, copy, or redistribute it subject to the terms and conditions
2146+** of the GNU General Public License v.2.
2147+**
2148+*******************************************************************************
2149+******************************************************************************/
2150+
2151+#include "dlm_internal.h"
2152+#include "nodes.h"
2153+#include "lockspace.h"
2154+#include "lowcomms.h"
2155+#include "reccomms.h"
2156+#include "rsb.h"
2157+#include "config.h"
2158+#include "memory.h"
2159+#include "recover.h"
2160+#include "util.h"
2161+
2162+struct resmov {
2163+ uint32_t rm_nodeid;
2164+ uint16_t rm_length;
2165+ uint16_t rm_pad;
2166+};
2167+
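These records travel as a self-delimiting stream: each struct resmov (sent in network byte order) is followed by rm_length bytes of resource name, and a record with rm_length == 0 is a sentinel whose rm_nodeid distinguishes end-of-block (zero) from end-of-list (non-zero). A sketch of a reader for that layout, mirroring the parsing loop in dlm_dir_rebuild_local() below:

/* Sketch only: count the names in one received block of resmov
 * records; *end_of_list is set when the final sentinel is seen. */
static int walk_resmov_block(char *buf, int *end_of_list)
{
	struct resmov mov;
	char *b = buf;
	int count = 0;

	*end_of_list = 0;
	for (;;) {
		resmov_in(&mov, b);	/* network to host byte order */
		b += sizeof(struct resmov);

		if (!mov.rm_length) {
			/* length 0, nodeid set: the whole list is done;
			 * length 0, nodeid 0: just this block is done */
			if (mov.rm_nodeid)
				*end_of_list = 1;
			break;
		}
		b += mov.rm_length;	/* skip the resource name */
		count++;
	}
	return count;
}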
2168+void print_name(char *b, int len)
2169+{
2170+ int i;
2171+ for (i = 0; i < len; i++)
2172+ printk("%c", b[i]);
2173+ printk("\n");
2174+}
2175+
2176+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
2177+{
2178+ spin_lock(&ls->ls_recover_list_lock);
2179+ list_add(&de->list, &ls->ls_recover_list);
2180+ spin_unlock(&ls->ls_recover_list_lock);
2181+}
2182+
2183+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
2184+{
2185+ int found = FALSE;
2186+ struct dlm_direntry *de;
2187+
2188+ spin_lock(&ls->ls_recover_list_lock);
2189+ list_for_each_entry(de, &ls->ls_recover_list, list) {
2190+ if (de->length == len) {
2191+ list_del(&de->list);
2192+ de->master_nodeid = 0;
2193+ memset(de->name, 0, len);
2194+ found = TRUE;
2195+ break;
2196+ }
2197+ }
2198+ spin_unlock(&ls->ls_recover_list_lock);
2199+
2200+ if (!found)
2201+ de = allocate_direntry(ls, len);
2202+ return de;
2203+}
2204+
2205+void clear_free_de(struct dlm_ls *ls)
2206+{
2207+ struct dlm_direntry *de;
2208+
2209+ spin_lock(&ls->ls_recover_list_lock);
2210+ while (!list_empty(&ls->ls_recover_list)) {
2211+ de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
2212+ list);
2213+ list_del(&de->list);
2214+ free_direntry(de);
2215+ }
2216+ spin_unlock(&ls->ls_recover_list_lock);
2217+}
2218+
2219+/*
2220+ * We use the upper 16 bits of the hash value to select the directory node.
2221+ * Low bits are used for distribution of rsb's among hash buckets on each node.
2222+ *
2223+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
2224+ * num_nodes to the hash value. This value in the desired range is used as an
2225+ * offset into the sorted list of nodeid's to give the particular nodeid of the
2226+ * directory node.
2227+ */
2228+
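A worked example of the scheme above, with made-up numbers: three nodes whose sorted nodeids are {2, 5, 9}, and a name that hashes to 0x12345678.

/* Sketch: (0x12345678 >> 16) == 0x1234 == 4660, and 4660 % 3 == 1,
 * so the second entry of the sorted array, nodeid 5, is the directory
 * node. Every node computes the same answer because every node sorts
 * the same membership list. */
uint32_t example_directory_nodeid(void)
{
	uint32_t node_array[] = { 2, 5, 9 };	/* like ls_node_array */
	uint32_t hash = 0x12345678;		/* dlm_hash(name, length) */

	return node_array[(hash >> 16) % 3];	/* == 5 */
}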
2229+uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length)
2230+{
2231+ struct list_head *tmp;
2232+ struct dlm_csb *csb = NULL;
2233+ uint32_t hash, node, n = 0, nodeid;
2234+
2235+ if (ls->ls_num_nodes == 1) {
2236+ nodeid = our_nodeid();
2237+ goto out;
2238+ }
2239+
2240+ hash = dlm_hash(name, length);
2241+ node = (hash >> 16) % ls->ls_num_nodes;
2242+
2243+ if (ls->ls_node_array) {
2244+ nodeid = ls->ls_node_array[node];
2245+ goto out;
2246+ }
2247+
2248+ list_for_each(tmp, &ls->ls_nodes) {
2249+ if (n++ != node)
2250+ continue;
2251+ csb = list_entry(tmp, struct dlm_csb, list);
2252+ break;
2253+ }
2254+
2255+ DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u\n",
2256+ ls->ls_num_nodes, n, node););
2257+ nodeid = csb->node->nodeid;
2258+ out:
2259+ return nodeid;
2260+}
2261+
2262+uint32_t get_directory_nodeid(struct dlm_rsb *rsb)
2263+{
2264+ return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
2265+ rsb->res_length);
2266+}
2267+
2268+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
2269+{
2270+ uint32_t val;
2271+
2272+ val = dlm_hash(name, len);
2273+ val &= (ls->ls_dirtbl_size - 1);
2274+
2275+ return val;
2276+}
2277+
2278+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
2279+{
2280+ uint32_t bucket;
2281+
2282+ bucket = dir_hash(ls, de->name, de->length);
2283+ list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2284+}
2285+
2286+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
2287+ int namelen, uint32_t bucket)
2288+{
2289+ struct dlm_direntry *de;
2290+
2291+ list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
2292+ if (de->length == namelen && !memcmp(name, de->name, namelen))
2293+ goto out;
2294+ }
2295+ de = NULL;
2296+ out:
2297+ return de;
2298+}
2299+
2300+void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen)
2301+{
2302+ struct dlm_direntry *de;
2303+ uint32_t bucket;
2304+
2305+ bucket = dir_hash(ls, name, namelen);
2306+
2307+ write_lock(&ls->ls_dirtbl[bucket].lock);
2308+
2309+ de = search_bucket(ls, name, namelen, bucket);
2310+
2311+ if (!de) {
2312+ log_all(ls, "remove fr %u none", nodeid);
2313+ print_name(name, namelen);
2314+ goto out;
2315+ }
2316+
2317+ if (de->master_nodeid != nodeid) {
2318+ log_all(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
2319+ print_name(name, namelen);
2320+ goto out;
2321+ }
2322+
2323+ list_del(&de->list);
2324+ free_direntry(de);
2325+ out:
2326+ write_unlock(&ls->ls_dirtbl[bucket].lock);
2327+}
2328+
2329+void dlm_dir_clear(struct dlm_ls *ls)
2330+{
2331+ struct list_head *head;
2332+ struct dlm_direntry *de;
2333+ int i;
2334+
2335+ for (i = 0; i < ls->ls_dirtbl_size; i++) {
2336+ write_lock(&ls->ls_dirtbl[i].lock);
2337+ head = &ls->ls_dirtbl[i].list;
2338+ while (!list_empty(head)) {
2339+ de = list_entry(head->next, struct dlm_direntry, list);
2340+ list_del(&de->list);
2341+ put_free_de(ls, de);
2342+ }
2343+ write_unlock(&ls->ls_dirtbl[i].lock);
2344+ }
2345+}
2346+
2347+static void resmov_in(struct resmov *rm, char *buf)
2348+{
2349+ struct resmov tmp;
2350+
2351+ memcpy(&tmp, buf, sizeof(struct resmov));
2352+
2353+ rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2354+ rm->rm_length = be16_to_cpu(tmp.rm_length);
2355+}
2356+
2357+int dlm_dir_rebuild_local(struct dlm_ls *ls)
2358+{
2359+ struct dlm_csb *csb;
2360+ struct dlm_direntry *de;
2361+ struct dlm_rcom *rc;
2362+ struct resmov mov, last_mov;
2363+ char *b, *last_name;
2364+ int error = -ENOMEM, count = 0;
2365+
2366+ log_all(ls, "rebuild resource directory");
2367+
2368+ dlm_dir_clear(ls);
2369+
2370+ rc = allocate_rcom_buffer(ls);
2371+ if (!rc)
2372+ goto out;
2373+
2374+ last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2375+ if (!last_name)
2376+ goto free_rc;
2377+
2378+ list_for_each_entry(csb, &ls->ls_nodes, list) {
2379+ last_mov.rm_length = 0;
2380+ for (;;) {
2381+ error = dlm_recovery_stopped(ls);
2382+ if (error)
2383+ goto free_last;
2384+
2385+ memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2386+ rc->rc_datalen = last_mov.rm_length;
2387+
2388+ error = rcom_send_message(ls, csb->node->nodeid,
2389+ RECCOMM_RECOVERNAMES, rc, 1);
2390+ if (error)
2391+ goto free_last;
2392+
2393+ schedule();
2394+
2395+ /*
2396+ * pick each res out of buffer
2397+ */
2398+
2399+ b = rc->rc_buf;
2400+
2401+ for (;;) {
2402+ resmov_in(&mov, b);
2403+ b += sizeof(struct resmov);
2404+
2405+ /* Length of 0 with a non-zero nodeid marks the
2406+ * end of the list */
2407+ if (!mov.rm_length && mov.rm_nodeid)
2408+ goto done;
2409+
2410+ /* This is just the end of the block */
2411+ if (!mov.rm_length)
2412+ break;
2413+
2414+ DLM_ASSERT(mov.rm_nodeid == csb->node->nodeid,);
2415+
2416+ error = -ENOMEM;
2417+ de = get_free_de(ls, mov.rm_length);
2418+ if (!de)
2419+ goto free_last;
2420+
2421+ de->master_nodeid = mov.rm_nodeid;
2422+ de->length = mov.rm_length;
2423+ memcpy(de->name, b, mov.rm_length);
2424+ b += mov.rm_length;
2425+
2426+ add_entry_to_hash(ls, de);
2427+ count++;
2428+
2429+ last_mov = mov;
2430+ memset(last_name, 0, DLM_RESNAME_MAXLEN);
2431+ memcpy(last_name, de->name, de->length);
2432+ }
2433+ }
2434+ done:
2435+ ;
2436+ }
2437+
2438+ set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2439+ error = 0;
2440+
2441+ log_all(ls, "rebuilt %d resources", count);
2442+
2443+ free_last:
2444+ kfree(last_name);
2445+
2446+ free_rc:
2447+ free_rcom_buffer(rc);
2448+
2449+ out:
2450+ clear_free_de(ls);
2451+ return error;
2452+}
2453+
2454+/*
2455+ * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as
2456+ * many resource names as can fit in the buffer.
2457+ */
2458+
2459+int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2460+ char *outbuf, int outlen, uint32_t nodeid)
2461+{
2462+ struct list_head *list;
2463+ struct dlm_rsb *start_rsb = NULL, *rsb;
2464+ int offset = 0, start_namelen, error;
2465+ char *start_name;
2466+ struct resmov tmp;
2467+ uint32_t dir_nodeid;
2468+
2469+ /*
2470+ * Find the rsb where we left off (or start again)
2471+ */
2472+
2473+ start_namelen = inlen;
2474+ start_name = inbuf;
2475+
2476+ if (start_namelen > 1) {
2477+ error = find_rsb(ls, NULL, start_name, start_namelen, 0,
2478+ &start_rsb);
2479+ DLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2480+ release_rsb(start_rsb);
2481+ }
2482+
2483+ /*
2484+ * Send rsb names for rsb's we're master of and whose directory node
2485+ * matches the requesting node.
2486+ */
2487+
2488+ down_read(&ls->ls_root_lock);
2489+ if (start_rsb)
2490+ list = start_rsb->res_rootlist.next;
2491+ else
2492+ list = ls->ls_rootres.next;
2493+
2494+ for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2495+ rsb = list_entry(list, struct dlm_rsb, res_rootlist);
2496+ if (rsb->res_nodeid)
2497+ continue;
2498+
2499+ dir_nodeid = get_directory_nodeid(rsb);
2500+ if (dir_nodeid != nodeid)
2501+ continue;
2502+
2503+ if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) {
2504+ /* Write end-of-block record */
2505+ memset(&tmp, 0, sizeof(struct resmov));
2506+ memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2507+ offset += sizeof(struct resmov);
c1c6733f
AM
2508+ goto out;
2509+ }
2510+
2511+ memset(&tmp, 0, sizeof(struct resmov));
2512+ tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2513+ tmp.rm_length = cpu_to_be16(rsb->res_length);
2514+
2515+ memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2516+ offset += sizeof(struct resmov);
2517+
2518+ memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2519+ offset += rsb->res_length;
2520+ }
2521+
2522+ /*
2523+ * If we've reached the end of the list (and there's room) write a
2524+ * terminating record.
2525+ */
2526+
2527+ if ((list == &ls->ls_rootres) &&
2528+ (offset + sizeof(struct resmov) <= outlen)) {
2529+
2530+ memset(&tmp, 0, sizeof(struct resmov));
2531+ /* This only needs to be non-zero */
2532+ tmp.rm_nodeid = cpu_to_be32(1);
2533+ /* and this must be zero */
2534+ tmp.rm_length = 0;
2535+ memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2536+ offset += sizeof(struct resmov);
2537+ }
2538+
2539+ out:
2540+ up_read(&ls->ls_root_lock);
2541+ return offset;
2542+}
2543+
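A note on get_entry() below: the first search runs under the bucket's write lock; if the name is absent, the lock is dropped so the new entry can be allocated without holding it, and the second search_bucket() call after relocking catches a racing insert of the same name. If the race is lost, the freshly allocated entry is freed and the winner's master_nodeid is returned instead.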
2544+static int get_entry(struct dlm_ls *ls, uint32_t nodeid, char *name,
2545+ int namelen, uint32_t *r_nodeid)
2546+{
2547+ struct dlm_direntry *de, *tmp;
2548+ uint32_t bucket;
2549+
2550+ bucket = dir_hash(ls, name, namelen);
2551+
2552+ write_lock(&ls->ls_dirtbl[bucket].lock);
2553+ de = search_bucket(ls, name, namelen, bucket);
2554+ if (de) {
2555+ *r_nodeid = de->master_nodeid;
2556+ write_unlock(&ls->ls_dirtbl[bucket].lock);
2557+ if (*r_nodeid == nodeid)
2558+ return -EEXIST;
2559+ return 0;
2560+ }
2561+
2562+ write_unlock(&ls->ls_dirtbl[bucket].lock);
2563+
2564+ de = allocate_direntry(ls, namelen);
2565+ if (!de)
2566+ return -ENOMEM;
2567+
2568+ de->master_nodeid = nodeid;
2569+ de->length = namelen;
2570+ memcpy(de->name, name, namelen);
2571+
2572+ write_lock(&ls->ls_dirtbl[bucket].lock);
2573+ tmp = search_bucket(ls, name, namelen, bucket);
2574+ if (tmp) {
2575+ free_direntry(de);
2576+ de = tmp;
2577+ } else {
2578+ list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2579+ }
2580+ *r_nodeid = de->master_nodeid;
2581+ write_unlock(&ls->ls_dirtbl[bucket].lock);
2582+ return 0;
2583+}
2584+
2585+int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2586+ uint32_t *r_nodeid)
2587+{
2588+ return get_entry(ls, nodeid, name, namelen, r_nodeid);
2589+}
2590+
2591+/*
2592+ * The node with lowest id queries all nodes to determine when all are done.
2593+ * All other nodes query the low nodeid for this.
2594+ */
2595+
2596+int dlm_dir_rebuild_wait(struct dlm_ls *ls)
2597+{
2598+ int error;
2599+
2600+ if (ls->ls_low_nodeid == our_nodeid()) {
2601+ error = dlm_wait_status_all(ls, RESDIR_VALID);
2602+ if (!error)
2603+ set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2604+ } else
2605+ error = dlm_wait_status_low(ls, RESDIR_ALL_VALID);
2606+
2607+ return error;
2608+}
2609diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2610--- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
2611+++ linux-patched/cluster/dlm/dir.h 2004-11-03 11:31:56.000000000 +0800
2612@@ -0,0 +1,33 @@
2613+/******************************************************************************
2614+*******************************************************************************
2615+**
2616+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2617+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2618+**
2619+** This copyrighted material is made available to anyone wishing to use,
2620+** modify, copy, or redistribute it subject to the terms and conditions
2621+** of the GNU General Public License v.2.
2622+**
2623+*******************************************************************************
2624+******************************************************************************/
2625+
2626+#ifndef __DIR_DOT_H__
2627+#define __DIR_DOT_H__
2628+
2629+void print_name(char *b, int len);
2630+uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length);
2631+uint32_t get_directory_nodeid(struct dlm_rsb *rsb);
2632+
2633+int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2634+ uint32_t *r_nodeid);
2635+void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name,
2636+ int namelen);
2637+int dlm_dir_rebuild_local(struct dlm_ls *ls);
2638+int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2639+ char *outbuf, int outlen, uint32_t nodeid);
2640+int dlm_dir_rebuild_wait(struct dlm_ls * ls);
2641+void dlm_dir_clear(struct dlm_ls *ls);
2642+void dlm_dir_dump(struct dlm_ls *ls);
2643+void clear_free_de(struct dlm_ls *ls);
2644+
2645+#endif /* __DIR_DOT_H__ */
2646diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2647--- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
2648+++ linux-patched/cluster/dlm/dlm_internal.h 2004-11-03 11:31:56.000000000 +0800
2649@@ -0,0 +1,612 @@
2650+/******************************************************************************
2651+*******************************************************************************
2652+**
2653+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2654+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2655+**
2656+** This copyrighted material is made available to anyone wishing to use,
2657+** modify, copy, or redistribute it subject to the terms and conditions
2658+** of the GNU General Public License v.2.
2659+**
2660+*******************************************************************************
2661+******************************************************************************/
2662+
2663+#ifndef __DLM_INTERNAL_DOT_H__
2664+#define __DLM_INTERNAL_DOT_H__
2665+
2666+/*
2667+ * This is the main header file to be included in each DLM source file.
2668+ */
2669+
2670+#define DLM_RELEASE_NAME "<CVS>"
2671+
2672+#include <linux/slab.h>
2673+#include <linux/sched.h>
2674+#include <asm/semaphore.h>
2675+#include <linux/types.h>
2676+#include <linux/spinlock.h>
2677+#include <linux/vmalloc.h>
2678+#include <asm/uaccess.h>
2679+#include <linux/list.h>
2680+#include <linux/errno.h>
2681+#include <linux/random.h>
2682+#include <linux/delay.h>
2683+#include <linux/interrupt.h>
2684+#include <linux/kthread.h>
2685+
2686+#include <cluster/dlm.h>
2687+#include <cluster/dlm_device.h>
2688+#include <cluster/service.h>
2689+
2690+#ifndef TRUE
2691+#define TRUE (1)
2692+#endif
2693+
2694+#ifndef FALSE
2695+#define FALSE (0)
2696+#endif
2697+
2698+#if (BITS_PER_LONG == 64)
2699+#define PRIu64 "lu"
2700+#define PRId64 "ld"
2701+#define PRIo64 "lo"
2702+#define PRIx64 "lx"
2703+#define PRIX64 "lX"
2704+#define SCNu64 "lu"
2705+#define SCNd64 "ld"
2706+#define SCNo64 "lo"
2707+#define SCNx64 "lx"
2708+#define SCNX64 "lX"
2709+#else
2710+#define PRIu64 "Lu"
2711+#define PRId64 "Ld"
2712+#define PRIo64 "Lo"
2713+#define PRIx64 "Lx"
2714+#define PRIX64 "LX"
2715+#define SCNu64 "Lu"
2716+#define SCNd64 "Ld"
2717+#define SCNo64 "Lo"
2718+#define SCNx64 "Lx"
2719+#define SCNX64 "LX"
2720+#endif
2721+
2722+#define wchan_cond_sleep_intr(chan, sleep_cond) \
2723+do \
2724+{ \
2725+ DECLARE_WAITQUEUE(__wait_chan, current); \
2726+ current->state = TASK_INTERRUPTIBLE; \
2727+ add_wait_queue(&chan, &__wait_chan); \
2728+ if ((sleep_cond)) \
2729+ schedule(); \
2730+ remove_wait_queue(&chan, &__wait_chan); \
2731+ current->state = TASK_RUNNING; \
2732+} \
2733+while (0)
2734+
2735+static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2736+{
2737+ return time_after(jiffies, stamp + seconds * HZ);
2738+}
2739+
2740+
2741+#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2742+
2743+#define log_all(ls, fmt, args...) \
2744+ do { \
2745+ printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2746+ dlm_debug_log(ls, fmt, ##args); \
2747+ } while (0)
2748+
2749+#define log_error log_all
2750+
2751+#if defined(DLM_DEBUG2)
2752+int nibbler_printf(const char *fmt, ...);
2753+#define log_debug2(fmt, args...) nibbler_printf(fmt"\n", ##args)
2754+#else
2755+#define log_debug2(fmt, args...)
2756+#endif
2757+
2758+#define DLM_DEBUG
2759+#if defined(DLM_DEBUG)
2760+#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2761+#else
2762+#define log_debug(ls, fmt, args...)
2763+#endif
2764+
2765+#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2766+#undef log_debug
2767+#define log_debug log_all
2768+#endif
2769+
2770+
2771+#define DLM_ASSERT(x, do) \
2772+{ \
2773+ if (!(x)) \
2774+ { \
2775+ dlm_locks_dump(); \
2776+ dlm_debug_dump(); \
2777+ printk("\nDLM: Assertion failed on line %d of file %s\n" \
2778+ "DLM: assertion: \"%s\"\n" \
2779+ "DLM: time = %lu\n", \
2780+ __LINE__, __FILE__, #x, jiffies); \
2781+ {do} \
2782+ printk("\n"); \
2783+ BUG(); \
2784+ panic("DLM: Record message above and reboot.\n"); \
2785+ } \
2786+}
2787+
2788+
2789+struct dlm_ls;
2790+struct dlm_lkb;
2791+struct dlm_rsb;
2792+struct dlm_csb;
2793+struct dlm_node;
2794+struct dlm_lkbtable;
2795+struct dlm_rsbtable;
2796+struct dlm_dirtable;
2797+struct dlm_direntry;
2798+struct dlm_recover;
2799+struct dlm_header;
2800+struct dlm_request;
2801+struct dlm_reply;
2802+struct dlm_rcom;
2803+struct dlm_query_request;
2804+struct dlm_query_reply;
2805+
2806+
2807+struct dlm_direntry {
2808+ struct list_head list;
2809+ uint32_t master_nodeid;
2810+ uint16_t length;
2811+ char name[1];
2812+};
2813+
2814+struct dlm_dirtable {
2815+ struct list_head list;
2816+ rwlock_t lock;
2817+};
2818+
2819+struct dlm_rsbtable {
2820+ struct list_head list;
2821+ rwlock_t lock;
c1c6733f
AM
2822+};
2823+
b7b72b66
AM
2824+struct dlm_lkbtable {
2825+ struct list_head list;
2826+ rwlock_t lock;
2827+ uint16_t counter;
2828+};
2829+
2830+/*
2831+ * Cluster node (per node in cluster)
2832+ */
2833+
2834+struct dlm_node {
2835+ struct list_head list;
2836+ uint32_t nodeid;
2837+ atomic_t refcount; /* num csb's referencing */
2838+};
2839+
2840+/*
2841+ * Cluster System Block (per node in a ls)
2842+ */
2843+
2844+struct dlm_csb {
2845+ struct list_head list; /* per-lockspace node list */
2846+ struct dlm_node * node; /* global node structure */
2847+ int gone_event; /* event id when node removed */
2848+};
2849+
2850+/*
2851+ * Used to save and manage recovery state for a lockspace.
2852+ */
2853+
2854+struct dlm_recover {
2855+ struct list_head list;
2856+ uint32_t * nodeids;
2857+ int node_count;
2858+ int event_id;
2859+};
2860+
2861+/*
2862+ * Elements in the range array
2863+ */
2864+
2865+#define GR_RANGE_START (0)
2866+#define GR_RANGE_END (1)
2867+#define RQ_RANGE_START (2)
2868+#define RQ_RANGE_END (3)
2869+
2870+/*
2871+ * Lockspace structure
2872+ */
2873+
2874+#define LSFL_WORK (0)
2875+#define LSFL_LS_RUN (1)
2876+#define LSFL_LS_STOP (2)
2877+#define LSFL_LS_START (3)
2878+#define LSFL_LS_FINISH (4)
2879+#define LSFL_RECCOMM_WAIT (5)
2880+#define LSFL_RECCOMM_READY (6)
2881+#define LSFL_NOTIMERS (7)
2882+#define LSFL_FINISH_RECOVERY (8)
2883+#define LSFL_RESDIR_VALID (9)
2884+#define LSFL_ALL_RESDIR_VALID (10)
2885+#define LSFL_NODES_VALID (11)
2886+#define LSFL_ALL_NODES_VALID (12)
2887+#define LSFL_REQUEST_WARN (13)
2888+#define LSFL_RECOVERD_EXIT (14)
2889+
2890+#define LSST_NONE (0)
2891+#define LSST_INIT (1)
2892+#define LSST_INIT_DONE (2)
2893+#define LSST_CLEAR (3)
2894+#define LSST_WAIT_START (4)
2895+#define LSST_RECONFIG_DONE (5)
2896+
2897+struct dlm_ls {
2898+ struct list_head ls_list; /* list of lockspaces */
2899+ uint32_t ls_local_id; /* local unique lockspace ID */
2900+ uint32_t ls_global_id; /* global unique lockspace ID */
2901+ int ls_allocation; /* Memory allocation policy */
2902+ int ls_count; /* reference count */
2903+ unsigned long ls_flags; /* LSFL_ */
2904+
2905+ struct dlm_rsbtable * ls_rsbtbl;
2906+ uint32_t ls_rsbtbl_size;
2907+
2908+ struct dlm_lkbtable * ls_lkbtbl;
2909+ uint32_t ls_lkbtbl_size;
2910+
2911+ struct dlm_dirtable * ls_dirtbl;
2912+ uint32_t ls_dirtbl_size;
2913+
2914+ struct list_head ls_nodes; /* current nodes in ls */
2915+ struct list_head ls_nodes_gone; /* dead node list, recovery */
2916+ uint32_t ls_num_nodes; /* number of nodes in ls */
2917+ uint32_t ls_low_nodeid;
2918+ uint32_t * ls_node_array;
2919+
2920+ struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2921+ parent lock racing with a
2922+ new child lock */
2923+
2924+ struct list_head ls_deadlockq; /* List of locks in conversion
2925+ ordered by duetime. for
2926+ deadlock detection */
2927+
2928+ /* recovery related */
2929+
2930+ struct task_struct * ls_recoverd_task;
2931+ struct semaphore ls_recoverd_lock;
2932+ struct list_head ls_recover; /* dlm_recover structs */
2933+ spinlock_t ls_recover_lock;
2934+ int ls_last_stop;
2935+ int ls_last_start;
2936+ int ls_last_finish;
2937+ int ls_state; /* recovery states */
2938+
2939+ struct rw_semaphore ls_in_recovery; /* block local requests */
2940+ struct list_head ls_requestqueue;/* queue remote requests */
2941+ struct semaphore ls_requestqueue_lock;
2942+
2943+ struct dlm_rcom * ls_rcom; /* recovery comms */
2944+ uint32_t ls_rcom_msgid;
2945+ struct semaphore ls_rcom_lock;
2946+
2947+ struct list_head ls_recover_list;
2948+ spinlock_t ls_recover_list_lock;
2949+ int ls_recover_list_count;
2950+ wait_queue_head_t ls_wait_general;
2951+
2952+ struct list_head ls_rootres; /* root resources */
2953+ struct rw_semaphore ls_root_lock; /* protect rootres list */
2954+
2955+ struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2956+ we're deserialising */
2957+ int ls_namelen;
2958+ char ls_name[1];
2959+};
2960+
2961+/*
2962+ * Resource block
2963+ */
2964+
2965+#define RESFL_NEW_MASTER (0)
2966+#define RESFL_RECOVER_LIST (1)
2967+#define RESFL_MASTER (2)
2968+
2969+struct dlm_rsb {
2970+ struct list_head res_hashchain;
2971+ uint32_t res_bucket;
2972+
2973+ struct dlm_ls * res_ls; /* The owning lockspace */
2974+
2975+ struct list_head res_rootlist; /* List of root rsb's */
2976+
2977+ struct list_head res_subreslist; /* List of all sub-resources
2978+ for this root rsb */
2979+
2980+ uint8_t res_depth; /* Depth in resource tree */
2981+ unsigned long res_flags; /* Flags, RESFL_ */
2982+
2983+ struct list_head res_grantqueue;
2984+ struct list_head res_convertqueue;
2985+ struct list_head res_waitqueue;
2986+
2987+ uint32_t res_nodeid; /* nodeid of master node */
2988+
2989+ struct dlm_rsb * res_root; /* root rsb if a subresource */
2990+ struct dlm_rsb * res_parent; /* parent rsb (if any) */
2991+
2992+ atomic_t res_ref; /* Number of lkb's */
2993+ uint16_t res_remasterid; /* ID used during remaster */
2994+
2995+ struct list_head res_recover_list; /* General list for use
2996+ during recovery */
2997+ int res_recover_msgid;
2998+ int res_newlkid_expect;
2999+
3000+ struct rw_semaphore res_lock;
3001+
3002+ char * res_lvbptr; /* Lock value block */
3003+
3004+ uint8_t res_length;
3005+ char res_name[1]; /* <res_length> bytes */
3006+};
3007+
3008+/*
3009+ * Lock block. To avoid confusion, where flags mirror the public flags, they
3010+ * should have the same value.
3011+ *
3012+ * In general, DLM_LKF flags from dlm.h apply only to lkb_lockqueue_flags
3013+ * and GDLM_LKFLG flags from dlm_internal.h apply only to lkb_flags.
3014+ * The rr_flags field in the request struct is a copy of lkb_lockqueue_flags.
3015+ * There is one dangerous exception: GDLM_LKFLG_RANGE is set in rr_flags
3016+ * when sending a remote range lock request. This value is then copied into
3017+ * the remote lkb_lockqueue_flags field. This means GDLM_LKFLG_RANGE must
3018+ * not have the same value as any external DLM_LKF flag.
3019+ */
3020+
3021+#define GDLM_LKSTS_NEW (0)
3022+#define GDLM_LKSTS_WAITING (1)
3023+#define GDLM_LKSTS_GRANTED (2)
3024+#define GDLM_LKSTS_CONVERT (3)
3025+
3026+/* mirror external flags */
3027+#define GDLM_LKFLG_VALBLK (0x00000008)
3028+#define GDLM_LKFLG_PERSISTENT (0x00000080)
3029+#define GDLM_LKFLG_NODLCKWT (0x00000100)
3030+#define GDLM_LKFLG_EXPEDITE (0x00000400)
3031+#define GDLM_LKFLG_ORPHAN (0x00004000)
3032+/* external flags now go up to: (0x00004000) : DLM_LKF_ORPHAN */
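Because the mirrored values above must stay in step with the public DLM_LKF_* constants from cluster/dlm.h, a compile-time cross-check is cheap insurance. A sketch, assuming BUILD_BUG_ON from linux/kernel.h and the public flag names; it is not part of the patch:

/* Sketch: fail the build if the mirrored flag values ever drift from
 * the public DLM_LKF_* definitions. */
static inline void check_mirrored_lkflags(void)
{
	BUILD_BUG_ON(GDLM_LKFLG_VALBLK != DLM_LKF_VALBLK);
	BUILD_BUG_ON(GDLM_LKFLG_PERSISTENT != DLM_LKF_PERSISTENT);
	BUILD_BUG_ON(GDLM_LKFLG_NODLCKWT != DLM_LKF_NODLCKWT);
	BUILD_BUG_ON(GDLM_LKFLG_EXPEDITE != DLM_LKF_EXPEDITE);
	BUILD_BUG_ON(GDLM_LKFLG_ORPHAN != DLM_LKF_ORPHAN);
}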
3033+
3034+/* internal-only flags */
3035+#define GDLM_LKFLG_RANGE (0x00010000)
3036+#define GDLM_LKFLG_MSTCPY (0x00020000)
3037+#define GDLM_LKFLG_DELETED (0x00040000)
3038+#define GDLM_LKFLG_LQCONVERT (0x00080000)
3039+#define GDLM_LKFLG_LQRESEND (0x00100000)
3040+#define GDLM_LKFLG_DEMOTED (0x00200000)
3041+#define GDLM_LKFLG_RESENT (0x00400000)
3042+#define GDLM_LKFLG_NOREBUILD (0x00800000)
3043+#define GDLM_LKFLG_UNLOCKDONE (0x01000000)
3044+
3045+#define AST_COMP (1)
3046+#define AST_BAST (2)
3047+#define AST_DEL (4)
3048+
3049+struct dlm_lkb {
3050+ uint32_t lkb_flags;
3051+ uint16_t lkb_status; /* grant, wait, convert */
3052+ int8_t lkb_rqmode; /* requested lock mode */
3053+ int8_t lkb_grmode; /* granted lock mode */
3054+ uint32_t lkb_retstatus; /* status to return in lksb */
3055+ uint32_t lkb_id; /* our lock ID */
3056+ struct dlm_lksb * lkb_lksb; /* status block of caller */
3057+ struct list_head lkb_idtbl_list; /* lockidtbl */
3058+ struct list_head lkb_statequeue; /* rsb's g/c/w queue */
3059+ struct dlm_rsb * lkb_resource;
3060+ struct dlm_lkb * lkb_parent; /* parent lock if any */
3061+ atomic_t lkb_childcnt; /* number of children */
3062+
3063+ struct list_head lkb_lockqueue; /* queue of locks waiting
3064+ for remote reply */
3065+ int lkb_lockqueue_state; /* reason on lockqueue */
3066+ uint32_t lkb_lockqueue_flags; /* as passed into
3067+ lock/unlock */
3068+ int lkb_ownpid; /* pid of lock owner */
3069+ unsigned long lkb_lockqueue_time; /* time lkb went on the
3070+ lockqueue */
3071+ unsigned long lkb_duetime; /* for deadlock detection */
3072+
3073+ uint32_t lkb_remid; /* id on remote partner */
3074+ uint32_t lkb_nodeid; /* id of remote partner */
3075+ void * lkb_astaddr;
3076+ void * lkb_bastaddr;
3077+ long lkb_astparam;
3078+ struct list_head lkb_astqueue; /* locks with asts to deliver */
3079+ uint16_t lkb_astflags; /* COMP, BAST, DEL */
3080+ uint8_t lkb_bastmode; /* requested mode */
3081+ uint8_t lkb_highbast; /* highest mode bast sent for */
3082+
3083+ struct dlm_request * lkb_request;
3084+
3085+ struct list_head lkb_deadlockq; /* ls_deadlockq list */
3086+
3087+ char * lkb_lvbptr; /* points to lksb lvb on local
3088+ lock, allocated lvb on
3089+ on remote lock */
3090+ uint64_t * lkb_range; /* Points to an array of 64 bit
3091+ numbers that represent the
3092+ requested and granted ranges
3093+ of the lock. NULL implies
3094+ 0-ffffffffffffffff */
3095+};
3096+
3097+/*
3098+ * Header part of the mid-level comms system. All packets start with
3099+ * this header so we can identify them. The comms packet can
3100+ * contain many of these structs, but they are split into individual
3101+ * work units before being passed to the lockqueue routines.
3102+ * Below this are the structs that this is a header for.
3103+ */
3104+
3105+struct dlm_header {
3106+ uint8_t rh_cmd; /* What we are */
3107+ uint8_t rh_flags; /* maybe just a pad */
3108+ uint16_t rh_length; /* Length of struct (so we can
3109+ send many in 1 message) */
3110+ uint32_t rh_lkid; /* Lock ID tag: ie the local
3111+ (requesting) lock ID */
3112+ uint32_t rh_lockspace; /* Lockspace ID */
3113+} __attribute__((packed));
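A sketch of how a receiver can split one comms packet into those individual work units using rh_length; it assumes a contiguous buffer whose fields are already in host byte order, and the callback is hypothetical:

static void for_each_work_unit(char *buf, int len,
			       void (*handle)(struct dlm_header *))
{
	char *p = buf;

	/* each header's rh_length covers the header plus its payload */
	while (p + sizeof(struct dlm_header) <= buf + len) {
		struct dlm_header *hd = (struct dlm_header *) p;

		if (!hd->rh_length || p + hd->rh_length > buf + len)
			break;	/* truncated or malformed packet */
		handle(hd);
		p += hd->rh_length;
	}
}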
3114+
3115+/*
3116+ * This is the struct used in a remote lock/unlock/convert request
3117+ * The mid-level comms API should turn this into native byte order.
3118+ * Most "normal" lock operations will use these two structs for
3119+ * communications. Recovery operations use their own structs
3120+ * but still with the gd_req_header on the front.
3121+ */
3122+
3123+struct dlm_request {
3124+ struct dlm_header rr_header;
3125+ uint32_t rr_remlkid; /* Remote lock ID */
3126+ uint32_t rr_remparid; /* Parent's remote lock ID */
3127+ uint32_t rr_flags; /* Flags from lock/convert req*/
3128+ uint64_t rr_range_start; /* Yes, these are in the right
3129+ place... */
3130+ uint64_t rr_range_end;
3131+ uint32_t rr_status; /* Status to return if this is
3132+ an AST request */
3133+ uint32_t rr_pid; /* Owner PID of lock */
3134+ uint8_t rr_rqmode; /* Requested lock mode */
3135+ uint8_t rr_asts; /* Whether the LKB has ASTs */
3136+ char rr_lvb[DLM_LVB_LEN];
3137+ char rr_name[1]; /* As long as needs be. Only
3138+ used for directory lookups.
3139+ The length of this can be
3140+ worked out from the packet
3141+ length */
3142+} __attribute__((packed));
3143+
3144+/*
3145+ * This is the struct returned by a remote lock/unlock/convert request
3146+ * The mid-level comms API should turn this into native byte order.
3147+ */
3148+
3149+struct dlm_reply {
3150+ struct dlm_header rl_header;
3151+ uint32_t rl_lockstate; /* Whether request was
3152+ queued/granted/waiting */
3153+ uint32_t rl_nodeid; /* nodeid of lock master */
3154+ uint32_t rl_status; /* Status to return to caller */
3155+ uint32_t rl_lkid; /* Remote lkid */
3156+ char rl_lvb[DLM_LVB_LEN];
3157+} __attribute__((packed));
3158+
3159+/*
3160+ * Recovery comms message
3161+ */
3162+
3163+struct dlm_rcom {
3164+ struct dlm_header rc_header; /* 32 byte aligned */
3165+ uint32_t rc_msgid;
3166+ uint16_t rc_datalen;
3167+ uint8_t rc_expanded;
3168+ uint8_t rc_subcmd; /* secondary command */
3169+ char rc_buf[1]; /* first byte of data goes here
3170+ and extends beyond here for
3171+ another datalen - 1 bytes.
3172+ rh_length is set to sizeof
3173+ dlm_rcom + datalen - 1 */
3174+} __attribute__((packed));
3175+
3176+
3177+/* A remote query: GDLM_REMCMD_QUERY */
3178+
3179+struct dlm_query_request {
3180+ struct dlm_header rq_header;
3181+ uint32_t rq_mstlkid; /* LockID on master node */
3182+ uint32_t rq_query; /* query from the user */
3183+ uint32_t rq_maxlocks; /* max number of locks we can
3184+ cope with */
3185+} __attribute__((packed));
3186+
3187+/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
3188+/* There may be subsequent blocks of
3189+ lock info in GDLM_REMCMD_QUERYCONT messages which just have
3190+ a normal header. The last of these will have rh_flags set to
3191+ GDLM_REMFLAG_ENDQUERY
3192+ */
3193+
3194+struct dlm_query_reply {
3195+ struct dlm_header rq_header;
3196+ uint32_t rq_numlocks; /* Number of locks in reply */
3197+ uint32_t rq_startlock; /* Which lock this block starts
3198+ at (for multi-block replies) */
3199+ uint32_t rq_status;
3200+
3201+ /* Resource information */
3202+ uint32_t rq_grantcount; /* No. of nodes on grantqueue */
3203+ uint32_t rq_convcount; /* No. of nodes on convertq */
3204+ uint32_t rq_waitcount; /* No. of nodes on waitqueue */
3205+ char rq_valblk[DLM_LVB_LEN]; /* Master's LVB
3206+ contents, if
3207+ applicable */
3208+} __attribute__((packed));
3209+
3210+/*
3211+ * Lockqueue wait lock states
3212+ */
3213+
3214+#define GDLM_LQSTATE_WAIT_RSB 1
3215+#define GDLM_LQSTATE_WAIT_CONVERT 2
3216+#define GDLM_LQSTATE_WAIT_CONDGRANT 3
3217+#define GDLM_LQSTATE_WAIT_UNLOCK 4
c1c6733f
AM
3218+
3219+/* Commands sent across the comms link */
b7b72b66
AM
3220+#define GDLM_REMCMD_LOOKUP 1
3221+#define GDLM_REMCMD_LOCKREQUEST 2
3222+#define GDLM_REMCMD_UNLOCKREQUEST 3
3223+#define GDLM_REMCMD_CONVREQUEST 4
3224+#define GDLM_REMCMD_LOCKREPLY 5
3225+#define GDLM_REMCMD_LOCKGRANT 6
3226+#define GDLM_REMCMD_SENDBAST 7
3227+#define GDLM_REMCMD_SENDCAST 8
3228+#define GDLM_REMCMD_REM_RESDATA 9
3229+#define GDLM_REMCMD_RECOVERMESSAGE 20
3230+#define GDLM_REMCMD_RECOVERREPLY 21
3231+#define GDLM_REMCMD_QUERY 30
3232+#define GDLM_REMCMD_QUERYREPLY 31
3233+
3234+/* Set in rh_flags when this is the last block of
3235+ query information. Note this could also be the first
3236+ block */
3237+#define GDLM_REMFLAG_ENDQUERY 1
3238+
3239+#ifdef CONFIG_DLM_STATS
3240+struct dlm_statinfo
3241+{
3242+ unsigned int cast;
3243+ unsigned int bast;
3244+ unsigned int lockops;
3245+ unsigned int unlockops;
3246+ unsigned int convertops;
3247+ unsigned long lockqueue_time[5];
3248+ unsigned long lockqueue_locks[5];
3249+};
3250+extern struct dlm_statinfo dlm_stats;
3251+#endif
3252+
3253+#ifndef BUG_ON
3254+#define BUG_ON(x)
3255+#endif
3256+
3257+void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...);
3258+void dlm_debug_dump(void);
3259+void dlm_locks_dump(void);
3260+
3261+#endif /* __DLM_INTERNAL_DOT_H__ */
3262diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
3263--- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
3264+++ linux-patched/cluster/dlm/lkb.c 2004-11-03 11:31:56.000000000 +0800
3265@@ -0,0 +1,183 @@
3266+/******************************************************************************
3267+*******************************************************************************
3268+**
3269+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3270+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3271+**
3272+** This copyrighted material is made available to anyone wishing to use,
3273+** modify, copy, or redistribute it subject to the terms and conditions
3274+** of the GNU General Public License v.2.
3275+**
3276+*******************************************************************************
3277+******************************************************************************/
3278+
3279+/*
3280+ * lkb.c
3281+ *
3282+ * Allocate and free locks on the lock ID table.
3283+ *
3284+ * This is slightly naff but I don't really like the
3285+ * VMS lockidtbl stuff as it uses a realloced array
3286+ * to hold the locks in. I think this is slightly better
3287+ * in some ways.
3288+ *
3289+ * Any better suggestions gratefully received. Patrick
3290+ *
3291+ */
3292+
3293+#include "dlm_internal.h"
3294+#include "lockqueue.h"
3295+#include "lkb.h"
3296+#include "config.h"
3297+#include "rsb.h"
3298+#include "memory.h"
3299+#include "lockspace.h"
3300+#include "util.h"
3301+
3302+/*
3303+ * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3304+ */
3305+
3306+static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3307+{
3308+ uint16_t bucket = lkid & 0xFFFF;
3309+ struct dlm_lkb *lkb;
c1c6733f 3310+
3311+ if (bucket >= ls->ls_lkbtbl_size)
3312+ goto out;
3313+
3314+ list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){
3315+ if (lkb->lkb_id == lkid)
3316+ return lkb;
3317+ }
3318+ out:
3319+ return NULL;
3320+}
3321+
3322+/*
3323+ * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3324+ * random number between 0 and lockidtbl_size-1. This random number specifies
3325+ * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3326+ * assigned per-bucket id.
3327+ *
3328+ * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3329+ * against the lkid of all lkb's in the bucket to avoid duplication.
3330+ *
3331+ */
3332+
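In other words, a sketch using the same masks as the code below:

/* The two halves of a 32-bit lkid. */
static inline uint16_t lkid_bucket(uint32_t lkid)
{
	return lkid & 0xFFFF;	/* random lkbtbl bucket index */
}

static inline uint16_t lkid_seq(uint32_t lkid)
{
	return lkid >> 16;	/* per-bucket sequence counter */
}

/* create_lkb() composes them as lkid = bucket | (counter++ << 16),
 * and __find_lock_by_id() recovers the bucket with lkid & 0xFFFF. */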
3333+struct dlm_lkb *create_lkb(struct dlm_ls *ls)
3334+{
3335+ struct dlm_lkb *lkb;
3336+ uint32_t lkid;
3337+ uint16_t bucket;
3338+
3339+ lkb = allocate_lkb(ls);
3340+ if (!lkb)
3341+ goto out;
3342+
3343+ retry:
3344+ get_random_bytes(&bucket, sizeof(bucket));
3345+ bucket &= (ls->ls_lkbtbl_size - 1);
c1c6733f 3346+
3347+ write_lock(&ls->ls_lkbtbl[bucket].lock);
3348+
3349+ lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
3350+
3351+ if (__find_lock_by_id(ls, lkid)) {
3352+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
3353+ goto retry;
3354+ }
3355+
3356+ lkb->lkb_id = lkid;
3357+ list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
3358+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
3359+ out:
3360+ return lkb;
3361+}
3362+
3363+/*
3364+ * Free LKB and remove it from the lockidtbl.
3365+ * NB - this always frees the lkb whereas release_rsb doesn't free an
3366+ * rsb unless its reference count is zero.
3367+ */
3368+
3369+void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
3370+{
3371+ uint16_t bucket = lkb->lkb_id & 0xFFFF;
3372+
3373+ if (lkb->lkb_status) {
3374+ log_error(ls, "release lkb with status %u", lkb->lkb_status);
3375+ print_lkb(lkb);
3376+ return;
3377+ }
3378+
3379+ if (lkb->lkb_parent)
3380+ atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3381+
3382+ write_lock(&ls->ls_lkbtbl[bucket].lock);
3383+ list_del(&lkb->lkb_idtbl_list);
3384+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
3385+
3386+ /* if this is not a master copy then lvbptr points into the user's
3387+ * lksb, so don't free it */
3388+ if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3389+ free_lvb(lkb->lkb_lvbptr);
3390+
3391+ if (lkb->lkb_range)
3392+ free_range(lkb->lkb_range);
3393+
3394+ free_lkb(lkb);
3395+}
3396+
3397+struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3398+{
3399+ struct dlm_lkb *lkb;
3400+ uint16_t bucket = lkid & 0xFFFF;
c1c6733f 3401+
3402+ read_lock(&ls->ls_lkbtbl[bucket].lock);
3403+ lkb = __find_lock_by_id(ls, lkid);
3404+ read_unlock(&ls->ls_lkbtbl[bucket].lock);
3405+
3406+ return lkb;
3407+}
3408+
3409+struct dlm_lkb *dlm_get_lkb(void *lockspace, uint32_t lkid)
3410+{
3411+ struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
3412+ struct dlm_lkb *lkb = find_lock_by_id(ls, lkid);
3413+ put_lockspace(ls);
3414+ return lkb;
3415+}
3416+
3417+/*
3418+ * Initialise the range parts of an LKB.
3419+ */
3420+
3421+int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end)
3422+{
3423+ int ret = -ENOMEM;
3424+
3425+ /*
3426+ * if this wasn't already a range lock, make it one
3427+ */
3428+ if (!lkb->lkb_range) {
3429+ lkb->lkb_range = allocate_range(lspace);
3430+ if (!lkb->lkb_range)
3431+ goto out;
3432+
3433+ /*
3434+ * This is needed for conversions that contain ranges where the
3435+ * original lock didn't but it's harmless for new locks too.
3436+ */
3437+ lkb->lkb_range[GR_RANGE_START] = 0LL;
3438+ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3439+ }
3440+
3441+ lkb->lkb_range[RQ_RANGE_START] = start;
3442+ lkb->lkb_range[RQ_RANGE_END] = end;
3443+
3444+ ret = 0;
3445+
3446+ out:
3447+ return ret;
3448+}
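For example, a caller asking for a lock over bytes 0..4095 of a resource would make a call like this (hypothetical; the GR_ elements keep their whole-range defaults until the request is granted):

int example_range_request(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	/* fills RQ_RANGE_START/RQ_RANGE_END, returns -ENOMEM on failure */
	return lkb_set_range(ls, lkb, 0ULL, 4095ULL);
}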
3449diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3450--- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
3451+++ linux-patched/cluster/dlm/lkb.h 2004-11-03 11:31:56.000000000 +0800
3452@@ -0,0 +1,23 @@
3453+/******************************************************************************
3454+*******************************************************************************
3455+**
3456+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3457+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3458+**
3459+** This copyrighted material is made available to anyone wishing to use,
3460+** modify, copy, or redistribute it subject to the terms and conditions
3461+** of the GNU General Public License v.2.
3462+**
3463+*******************************************************************************
3464+******************************************************************************/
3465+
3466+#ifndef __LKB_DOT_H__
3467+#define __LKB_DOT_H__
3468+
3469+struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid);
3470+struct dlm_lkb *create_lkb(struct dlm_ls *ls);
3471+void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb);
3472+struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid);
3473+int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end);
3474+
3475+#endif /* __LKB_DOT_H__ */
3476diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3477--- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
3478+++ linux-patched/cluster/dlm/locking.c 2004-11-03 11:31:56.000000000 +0800
3479@@ -0,0 +1,1378 @@
3480+/******************************************************************************
3481+*******************************************************************************
3482+**
3483+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3484+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3485+**
3486+** This copyrighted material is made available to anyone wishing to use,
3487+** modify, copy, or redistribute it subject to the terms and conditions
3488+** of the GNU General Public License v.2.
3489+**
3490+*******************************************************************************
3491+******************************************************************************/
3492+
3493+/*
3494+ * locking.c
3495+ *
3496+ * This is where the main work of the DLM goes on
3497+ *
3498+ */
3499+
3500+#include "dlm_internal.h"
3501+#include "lockqueue.h"
3502+#include "locking.h"
3503+#include "lockspace.h"
3504+#include "lkb.h"
3505+#include "nodes.h"
3506+#include "dir.h"
3507+#include "ast.h"
3508+#include "memory.h"
3509+#include "rsb.h"
3510+#include "util.h"
3511+#include "lowcomms.h"
3512+
3513+extern struct list_head lslist;
3514+
3515+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3516+
3517+/*
3518+ * Lock compatibility matrix - thanks Steve
3519+ * UN = Unlocked state. Not really a state, used as a flag
3520+ * PD = Padding. Used to make the matrix a nice power of two in size
3521+ * Other states are the same as the VMS DLM.
3522+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3523+ */
3524+
3525+#define modes_compat(gr, rq) \
3526+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3527+
3528+const int __dlm_compat_matrix[8][8] = {
3529+ /* UN NL CR CW PR PW EX PD */
3530+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3531+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3532+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3533+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3534+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3535+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3536+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3537+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3538+};
3539+
3540+/*
3541+ * Compatibility matrix for conversions with QUECVT set.
3542+ * Granted mode is the row; requested mode is the column.
3543+ * Usage: matrix[grmode+1][rqmode+1]
3544+ */
3545+
3546+const int __quecvt_compat_matrix[8][8] = {
3547+ /* UN NL CR CW PR PW EX PD */
3548+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3549+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3550+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3551+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3552+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3553+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3554+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3555+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3556+};
3557+
3558+/*
3559+ * This defines the direction of transfer of LVB data.
3560+ * Granted mode is the row; requested mode is the column.
3561+ * Usage: matrix[grmode+1][rqmode+1]
3562+ * 1 = LVB is returned to the caller
3563+ * 0 = LVB is written to the resource
3564+ * -1 = nothing happens to the LVB
3565+ */
3566+
3567+const int __lvb_operations[8][8] = {
3568+ /* UN NL CR CW PR PW EX PD*/
3569+ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3570+ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3571+ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3572+ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3573+ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3574+ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3575+ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3576+ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3577+};
3578+
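Read together with grant_lock() later in this file, the table's three values translate to the following sketch (hypothetical helper; grant_lock() itself only performs the two copy directions for locks carrying an LVB):

static void example_lvb_direction(struct dlm_lkb *lkb, struct dlm_rsb *rsb)
{
    int op = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];

    if (op == 1)        /* e.g. NL -> EX: read the current value */
        memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
    else if (op == 0)   /* e.g. EX -> NL: publish the new value */
        memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
    /* op == -1: the LVB is left untouched */
}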
3579+static void grant_lock(struct dlm_lkb *lkb, int send_remote);
3580+static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3581+static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3582+static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
3583+ uint32_t flags, void *ast, void *astarg, void *bast,
3584+ struct dlm_range *range);
3585+static int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb,
3586+ uint32_t flags, char *name, int namelen);
3587+
3588+
3589+inline int dlm_modes_compat(int mode1, int mode2)
3590+{
3591+ return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
3592+}
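For example, against the matrix above (a sketch; the DLM_LOCK_* mode constants come from dlm.h):

static int example_compat(void)
{
    /* PR coexists with PR but not with EX (see the PR row above) */
    return dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) &&
           !dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_EX);    /* 1 */
}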
3593+
3594+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
3595+{
3596+ struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
3597+
3598+ if (lkb->lkb_id == first->lkb_id)
3599+ return 1;
3600+
3601+ return 0;
3602+}
3603+
3604+/*
3605+ * Return 1 if the locks' ranges overlap
3606+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3607+ */
3608+
3609+static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
3610+{
3611+ if (!lkb1->lkb_range || !lkb2->lkb_range)
3612+ return 1;
3613+
3614+ if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3615+ lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3616+ return 0;
3617+
3618+ return 1;
3619+}
3620+
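The test reduces to a single interval comparison; a standalone sketch with concrete numbers:

static int example_overlap(uint64_t rq_start, uint64_t rq_end,
                           uint64_t gr_start, uint64_t gr_end)
{
    /* (0,100) vs (50,200) -> 1 (overlap); (0,100) vs (101,200) -> 0 */
    return !(rq_end < gr_start || rq_start > gr_end);
}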
3621+/*
3622+ * "A conversion deadlock arises with a pair of lock requests in the converting
3623+ * queue for one resource. The granted mode of each lock blocks the requested
3624+ * mode of the other lock."
3625+ */
3626+
3627+static struct dlm_lkb *conversion_deadlock_detect(struct dlm_rsb *rsb,
3628+ struct dlm_lkb *lkb)
3629+{
3630+ struct dlm_lkb *this;
3631+
3632+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3633+ if (this == lkb)
3634+ continue;
3635+
3636+ if (!ranges_overlap(lkb, this))
3637+ continue;
3638+
3639+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3640+ return this;
3641+ }
3642+
3643+ return NULL;
3644+}
3645+
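The canonical instance of the rule quoted above, reduced to the mode matrix (a sketch):

static int example_conversion_deadlock(void)
{
    /* two locks on one resource, both granted PR and both converting
     * to EX: each one's granted PR blocks the other's requested EX */
    return !dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_EX);    /* 1: deadlock */
}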
3646+/*
3647+ * Check if the given lkb conflicts with another lkb on the queue.
3648+ */
3649+
3650+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
3651+{
3652+ struct dlm_lkb *this;
3653+
3654+ list_for_each_entry(this, head, lkb_statequeue) {
3655+ if (this == lkb)
3656+ continue;
3657+ if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3658+ return TRUE;
3659+ }
3660+ return FALSE;
3661+}
3662+
3663+/*
3664+ * Return 1 if the lock can be granted, 0 otherwise.
3665+ * Also detect and resolve conversion deadlocks.
3666+ *
3667+ * lkb is the lock to be granted
3668+ *
3669+ * now is 1 if the function is being called in the context of the
3670+ * immediate request; it is 0 if called later, after the lock has been
3671+ * queued.
3672+ *
3673+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
3674+ */
3675+
3676+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
3677+{
3678+ int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
3679+
3680+ /*
3681+ * 6-10: Version 5.4 introduced an option to address the phenomenon of
3682+ * a new request for a NL mode lock being blocked.
3683+ *
3684+ * 6-11: If the optional EXPEDITE flag is used with the new NL mode
3685+ * request, then it would be granted. In essence, the use of this flag
3686+ * tells the Lock Manager to expedite this request by not considering
3687+ * what may be in the CONVERTING or WAITING queues... As of this
3688+ * writing, the EXPEDITE flag can be used only with new requests for NL
3689+ * mode locks. This flag is not valid for conversion requests.
3690+ *
3691+ * A shortcut. Earlier checks return an error if EXPEDITE is used in a
3692+ * conversion or used with a non-NL requested mode. We also know an
3693+ * EXPEDITE request is always granted immediately, so now must always
3694+ * be 1. The full condition to grant an expedite request: (now &&
3695+ * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
3696+ * therefore be shortened to just checking the flag.
3697+ */
3698+
3699+ if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
3700+ return TRUE;
3701+
3702+ /*
3703+ * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
3704+ * added to the remaining conditions.
3705+ */
3706+
3707+ if (queue_conflict(&r->res_grantqueue, lkb))
3708+ goto out;
3709+
3710+ /*
3711+ * 6-3: By default, a conversion request is immediately granted if the
3712+ * requested mode is compatible with the modes of all other granted
3713+ * locks
3714+ */
3715+
3716+ if (queue_conflict(&r->res_convertqueue, lkb))
3717+ goto out;
3718+
3719+ /*
3720+ * 6-5: But the default algorithm for deciding whether to grant or
3721+ * queue conversion requests does not by itself guarantee that such
3722+ * requests are serviced on a "first come first serve" basis. This, in
3723+ * turn, can lead to a phenomenon known as "indefinite postponement".
3724+ *
3725+ * 6-7: This issue is dealt with by using the optional QUECVT flag with
3726+ * the system service employed to request a lock conversion. This flag
3727+ * forces certain conversion requests to be queued, even if they are
3728+ * compatible with the granted modes of other locks on the same
3729+ * resource. Thus, the use of this flag results in conversion requests
3730+ * being ordered on a "first come first serve" basis.
3731+ */
3732+
3733+ if (now && conv && !(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3734+ return TRUE;
3735+
3736+ /*
3737+ * When using range locks the NOORDER flag is set to avoid the standard
3738+ * vms rules on grant order.
3739+ */
3740+
3741+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOORDER)
3742+ return TRUE;
3743+
3744+ /*
3745+ * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
3746+ * granted until all other conversion requests ahead of it are granted
3747+ * and/or canceled.
3748+ */
3749+
3750+ if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
3751+ return TRUE;
3752+
3753+ /*
3754+ * 6-4: By default, a new request is immediately granted only if all
3755+ * three of the following conditions are satisfied when the request is
3756+ * issued:
3757+ * - The queue of ungranted conversion requests for the resource is
3758+ * empty.
3759+ * - The queue of ungranted new requests for the resource is empty.
3760+ * - The mode of the new request is compatible with the most
3761+ * restrictive mode of all granted locks on the resource.
3762+ */
3763+
3764+ if (now && !conv && list_empty(&r->res_convertqueue) &&
3765+ list_empty(&r->res_waitqueue))
3766+ return TRUE;
3767+
3768+ /*
3769+ * 6-4: Once a lock request is in the queue of ungranted new requests,
3770+ * it cannot be granted until the queue of ungranted conversion
3771+ * requests is empty, all ungranted new requests ahead of it are
3772+ * granted and/or canceled, and it is compatible with the granted mode
3773+ * of the most restrictive lock granted on the resource.
3774+ */
3775+
3776+ if (!now && !conv && list_empty(&r->res_convertqueue) &&
3777+ first_in_list(lkb, &r->res_waitqueue))
3778+ return TRUE;
3779+
3780+ out:
3781+ /*
3782+ * The following, enabled by CONVDEADLK, departs from VMS.
3783+ */
3784+
3785+ if (now && conv && (lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK) &&
3786+ conversion_deadlock_detect(r, lkb)) {
3787+ lkb->lkb_grmode = DLM_LOCK_NL;
3788+ lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
3789+ }
3790+
3791+ return FALSE;
3792+}
3793+
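Rule 6-4 for a brand new request boils down to three checks; a sketch using the helpers above (the now == 1, conv == 0 case):

static int example_new_request_grantable(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
    return list_empty(&r->res_convertqueue) &&
           list_empty(&r->res_waitqueue) &&
           !queue_conflict(&r->res_grantqueue, lkb);
}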
3794+int dlm_lock(void *lockspace,
3795+ uint32_t mode,
3796+ struct dlm_lksb *lksb,
3797+ uint32_t flags,
3798+ void *name,
3799+ unsigned int namelen,
3800+ uint32_t parent,
3801+ void (*ast) (void *astarg),
3802+ void *astarg,
3803+ void (*bast) (void *astarg, int mode),
3804+ struct dlm_range *range)
3805+{
3806+ struct dlm_ls *lspace;
3807+ struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3808+ int ret = -EINVAL;
3809+
3810+ lspace = find_lockspace_by_local_id(lockspace);
3811+ if (!lspace)
3812+ return ret;
3813+
3814+ if (mode < 0 || mode > DLM_LOCK_EX)
3815+ goto out;
3816+
3817+ if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
3818+ goto out;
3819+
3820+ if (flags & DLM_LKF_CANCEL)
3821+ goto out;
3822+
3823+ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3824+ goto out;
3825+
3826+ if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
3827+ goto out;
3828+
3829+ if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
3830+ goto out;
3831+
3832+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
3833+ goto out;
3834+
3835+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3836+ goto out;
3837+
3838+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3839+ goto out;
3840+
3841+ if (flags & DLM_LKF_EXPEDITE && (mode != DLM_LOCK_NL))
3842+ goto out;
3843+
3844+ if (!ast || !lksb)
3845+ goto out;
3846+
3847+ if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3848+ goto out;
3849+
3850+ /*
3851+ * Take conversion path.
3852+ */
3853+
3854+ if (flags & DLM_LKF_CONVERT) {
3855+ ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3856+ bast, range);
3857+ goto out;
3858+ }
3859+
3860+#ifdef CONFIG_DLM_STATS
3861+ dlm_stats.lockops++;
3862+#endif
3863+ /*
3864+ * Take new lock path.
3865+ */
3866+
3867+ if (parent) {
3868+ down_read(&lspace->ls_unlock_sem);
3869+
3870+ parent_lkb = find_lock_by_id(lspace, parent);
3871+
3872+ if (!parent_lkb ||
3873+ parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3874+ parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3875+ parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3876+ up_read(&lspace->ls_unlock_sem);
3877+ goto out;
3878+ }
3879+
3880+ atomic_inc(&parent_lkb->lkb_childcnt);
3881+ up_read(&lspace->ls_unlock_sem);
3882+ }
3883+
3884+ down_read(&lspace->ls_in_recovery);
3885+
3886+ ret = -ENOMEM;
3887+
3888+ lkb = create_lkb(lspace);
3889+ if (!lkb)
3890+ goto fail_dec;
3891+ lkb->lkb_astaddr = ast;
3892+ lkb->lkb_astparam = (long) astarg;
3893+ lkb->lkb_bastaddr = bast;
3894+ lkb->lkb_rqmode = mode;
3895+ lkb->lkb_grmode = DLM_LOCK_IV;
3896+ lkb->lkb_nodeid = -1;
3897+ lkb->lkb_lksb = lksb;
3898+ lkb->lkb_parent = parent_lkb;
3899+ lkb->lkb_lockqueue_flags = flags;
3900+ lkb->lkb_lvbptr = lksb->sb_lvbptr;
3901+
3902+ if (!in_interrupt() && current)
3903+ lkb->lkb_ownpid = (int) current->pid;
3904+ else
3905+ lkb->lkb_ownpid = 0;
3906+
3907+ if (range) {
3908+ if (range->ra_start > range->ra_end) {
3909+ ret = -EINVAL;
3910+ goto fail_free;
3911+ }
3912+
3913+ if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3914+ goto fail_free;
3915+ }
3916+
3917+ /* Convert relevant flags to internal numbers */
3918+ if (flags & DLM_LKF_VALBLK)
3919+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3920+ if (flags & DLM_LKF_PERSISTENT)
3921+ lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3922+ if (flags & DLM_LKF_NODLCKWT)
3923+ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3924+
3925+ lksb->sb_lkid = lkb->lkb_id;
3926+
3927+ ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3928+ if (ret)
3929+ goto fail_free;
3930+
3931+ up_read(&lspace->ls_in_recovery);
3932+
3933+ wake_astd();
3934+
3935+ put_lockspace(lspace);
3936+ return 0;
3937+
3938+ fail_free:
3939+ release_lkb(lspace, lkb);
3940+ goto fail_unlock;
3941+
3942+ fail_dec:
3943+ if (parent_lkb)
3944+ atomic_dec(&parent_lkb->lkb_childcnt);
3945+
3946+ fail_unlock:
3947+ up_read(&lspace->ls_in_recovery);
3948+
3949+ out:
3950+ put_lockspace(lspace);
3951+ return ret;
3952+}
3953+
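A minimal in-kernel caller of the API above might look like the following sketch (the lockspace handle, resource name and completion-based waiting are assumptions of the example; the lksb must stay valid until the AST has fired):

static struct dlm_lksb my_lksb;

static void my_ast(void *astarg)
{
    /* delivered from the DLM's astd thread; the grant status is in
     * my_lksb.sb_status by the time this runs */
    complete((struct completion *) astarg);
}

static int grab_ex_lock(void *lockspace)
{
    struct completion done;
    int error;

    init_completion(&done);
    error = dlm_lock(lockspace, DLM_LOCK_EX, &my_lksb, 0,
                     "my_resource", strlen("my_resource"), 0,
                     my_ast, &done, NULL, NULL);
    if (error)
        return error;        /* rejected up front, no AST coming */
    wait_for_completion(&done);
    return my_lksb.sb_status;    /* 0 on grant */
}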
3954+int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, uint32_t flags,
3955+ char *name, int namelen)
3956+{
3957+ struct dlm_rsb *rsb, *parent_rsb = NULL;
3958+ struct dlm_lkb *parent_lkb = lkb->lkb_parent;
3959+ uint32_t nodeid;
3960+ int error, dir_error = 0;
3961+
3962+ if (parent_lkb)
3963+ parent_rsb = parent_lkb->lkb_resource;
3964+
3965+ error = find_rsb(ls, parent_rsb, name, namelen, CREATE, &rsb);
3966+ if (error)
3967+ return error;
3968+ lkb->lkb_resource = rsb;
3969+ down_write(&rsb->res_lock);
3970+
3971+ log_debug(ls, "(%d) rq %u %x \"%s\"", lkb->lkb_ownpid, lkb->lkb_rqmode,
3972+ lkb->lkb_id, rsb->res_name);
3973+ /*
3974+ * Next stage, do we need to find the master or can
3975+ * we get on with the real locking work ?
3976+ */
3977+
3978+ retry:
3979+ if (rsb->res_nodeid == -1) {
3980+ if (get_directory_nodeid(rsb) != our_nodeid()) {
3981+ remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3982+ up_write(&rsb->res_lock);
3983+ return 0;
3984+ }
3985+
3986+ error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
3987+ rsb->res_length, &nodeid);
3988+ if (error) {
3989+ DLM_ASSERT(error == -EEXIST,);
3990+ msleep(500);
3991+ dir_error = error;
3992+ goto retry;
3993+ }
c1c6733f 3994+
3995+ if (nodeid == our_nodeid()) {
3996+ set_bit(RESFL_MASTER, &rsb->res_flags);
3997+ rsb->res_nodeid = 0;
3998+ } else {
3999+ clear_bit(RESFL_MASTER, &rsb->res_flags);
4000+ rsb->res_nodeid = nodeid;
4001+ }
4002+
4003+ if (dir_error) {
4004+ log_all(ls, "dir lookup retry %x %u", lkb->lkb_id,
4005+ nodeid);
4006+ }
4007+ }
4008+
4009+ lkb->lkb_nodeid = rsb->res_nodeid;
4010+ up_write(&rsb->res_lock);
c1c6733f 4011+
4012+ error = dlm_lock_stage2(ls, lkb, rsb, flags);
4013+
4014+ return error;
4015+}
4016+
4017+/*
4018+ * Locking routine called after we have an RSB, either a copy of a remote one
4019+ * or a local one, or perhaps a shiny new one all of our very own
4020+ */
4021+
4022+int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb,
4023+ uint32_t flags)
4024+{
4025+ int error = 0;
4026+
4027+ DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb););
4028+
4029+ if (rsb->res_nodeid) {
4030+ res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4031+ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
4032+ } else {
4033+ dlm_lock_stage3(lkb);
4034+ }
4035+
4036+ return error;
4037+}
4038+
4039+/*
4040+ * Called on an RSB's master node to do stage2 locking for a remote lock
4041+ * request. Returns a proper lkb with rsb ready for lock processing.
4042+ * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
4043+ */
4044+
4045+struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls,
4046+ struct dlm_request *freq)
4047+{
4048+ struct dlm_rsb *rsb = NULL, *parent_rsb = NULL;
4049+ struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
4050+ int error, namelen;
4051+
4052+ if (freq->rr_remparid) {
4053+ parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
4054+ if (!parent_lkb)
4055+ goto fail;
4056+
4057+ atomic_inc(&parent_lkb->lkb_childcnt);
4058+ parent_rsb = parent_lkb->lkb_resource;
4059+ }
4060+
4061+ /*
4062+ * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
4063+ * node actually holding the (non-MSTCPY) lkb. AST addresses are just
4064+ * flags in the master copy.
4065+ */
4066+
4067+ lkb = create_lkb(ls);
4068+ if (!lkb)
4069+ goto fail_dec;
4070+ lkb->lkb_grmode = DLM_LOCK_IV;
4071+ lkb->lkb_rqmode = freq->rr_rqmode;
4072+ lkb->lkb_parent = parent_lkb;
4073+ lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
4074+ lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
4075+ lkb->lkb_nodeid = remote_nodeid;
4076+ lkb->lkb_remid = freq->rr_header.rh_lkid;
4077+ lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
4078+ lkb->lkb_lockqueue_flags = freq->rr_flags;
4079+
4080+ if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
4081+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
4082+ allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
4083+ if (!lkb->lkb_lvbptr)
4084+ goto fail_free;
4085+ }
4086+
4087+ if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
4088+ error = lkb_set_range(ls, lkb, freq->rr_range_start,
4089+ freq->rr_range_end);
4090+ if (error)
4091+ goto fail_free;
4092+ }
4093+
AM
4095+ * Get the RSB which this lock is for. Create a new RSB if this is a
4096+ * new lock on a new resource. We must be the master of any new rsb.
4097+ */
4098+
4099+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
4100+
4101+ error = find_rsb(ls, parent_rsb, freq->rr_name, namelen, MASTER, &rsb);
4102+ if (error)
4103+ goto fail_free;
4104+
4105+ if (!rsb) {
4106+ log_debug(ls, "send einval to %u", remote_nodeid);
4107+ /* print_name(freq->rr_name, namelen); */
4108+ lkb->lkb_retstatus = -EINVAL;
4109+ goto out;
4110+ }
4111+
4112+ lkb->lkb_resource = rsb;
4113+
4114+ log_debug(ls, "(%d) rq %u from %u %x \"%s\"",
4115+ lkb->lkb_ownpid, lkb->lkb_rqmode, remote_nodeid,
4116+ lkb->lkb_id, rsb->res_name);
4117+
4118+ out:
4119+ return lkb;
4120+
4121+ fail_free:
4122+ /* release_lkb handles parent */
4123+ release_lkb(ls, lkb);
4124+ parent_lkb = NULL;
4125+
4126+ fail_dec:
4127+ if (parent_lkb)
4128+ atomic_dec(&parent_lkb->lkb_childcnt);
4129+ fail:
4130+ return NULL;
4131+}
4132+
4133+/*
4134+ * The final bit of lock request processing on the master node. Here the lock
4135+ * is granted and the completion ast is queued, or the lock is put on the
4136+ * waitqueue and blocking asts are sent.
4137+ */
4138+
4139+void dlm_lock_stage3(struct dlm_lkb *lkb)
4140+{
4141+ struct dlm_rsb *rsb = lkb->lkb_resource;
4142+
4143+ /*
4144+ * This is a locally mastered lock on a resource that already exists,
4145+ * see if it can be granted or if it must wait. When this function is
4146+ * called for a remote lock request (process_cluster_request,
4147+ * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
4148+ * requesting node at the end of process_cluster_request, not at the
4149+ * end of grant_lock.
4150+ */
4151+
4152+ down_write(&rsb->res_lock);
4153+
4154+ if (can_be_granted(rsb, lkb, TRUE)) {
4155+ grant_lock(lkb, 0);
4156+ goto out;
4157+ }
4158+
4159+ /*
4160+ * This request is not a conversion, so the lkb didn't exist other than
4161+ * for this request and should be freed after EAGAIN is returned in the
4162+ * ast.
4163+ */
4164+
4165+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4166+ lkb->lkb_retstatus = -EAGAIN;
4167+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4168+ send_blocking_asts_all(rsb, lkb);
4169+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4170+ goto out;
4171+ }
4172+
4173+ /*
4174+ * The requested lkb must wait. Because the rsb of the requested lkb
4175+ * is mastered here, send blocking asts for the lkb's blocking the
4176+ * request.
4177+ */
4178+
4179+ log_debug2("w %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4180+ lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4181+ lkb->lkb_status, rsb->res_name);
4182+
4183+ lkb->lkb_retstatus = 0;
4184+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4185+
4186+ send_blocking_asts(rsb, lkb);
4187+
4188+ out:
4189+ up_write(&rsb->res_lock);
4190+}
4191+
4192+int dlm_unlock(void *lockspace,
4193+ uint32_t lkid,
4194+ uint32_t flags,
4195+ struct dlm_lksb *lksb,
4196+ void *astarg)
4197+{
4198+ struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
4199+ struct dlm_lkb *lkb;
4200+ struct dlm_rsb *rsb;
4201+ int ret = -EINVAL;
4202+
4203+ if (!ls) {
4204+ log_print("dlm_unlock: lkid %x lockspace not found", lkid);
4205+ return ret;
4206+ }
4207+
4208+ lkb = find_lock_by_id(ls, lkid);
4209+ if (!lkb) {
4210+ log_debug(ls, "unlock %x no id", lkid);
4211+ goto out;
4212+ }
4213+
4214+ /* Can't dequeue a master copy (a remote node's mastered lock) */
4215+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4216+ log_debug(ls, "(%d) unlock %x lkb_flags %x",
4217+ lkb->lkb_ownpid, lkid, lkb->lkb_flags);
4218+ goto out;
4219+ }
4220+
4221+ /* Already waiting for a remote lock operation */
4222+ if (lkb->lkb_lockqueue_state) {
4223+ log_debug(ls, "(%d) unlock %x lq%d",
4224+ lkb->lkb_ownpid, lkid, lkb->lkb_lockqueue_state);
4225+ ret = -EBUSY;
4226+ goto out;
4227+ }
4228+
4229+#ifdef CONFIG_DLM_STATS
4230+ dlm_stats.unlockops++;
4231+#endif
4232+ /* Can only cancel WAITING or CONVERTing locks.
4233+ * This is just a quick check - it is also checked in unlock_stage2()
4234+ * (which may be on the master) under the semaphore.
4235+ */
4236+ if ((flags & DLM_LKF_CANCEL) &&
4237+ (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
4238+ log_debug(ls, "(%d) unlock %x %x %d",
4239+ lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4240+ goto out;
4241+ }
4242+
4243+ /* "Normal" unlocks must operate on a granted lock */
4244+ if (!(flags & DLM_LKF_CANCEL) &&
4245+ (lkb->lkb_status != GDLM_LKSTS_GRANTED)) {
4246+ log_debug(ls, "(%d) unlock %x %x %d",
4247+ lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4248+ goto out;
4249+ }
4250+
4251+ if (lkb->lkb_flags & GDLM_LKFLG_DELETED) {
4252+ log_debug(ls, "(%d) unlock deleted %x %x %d",
4253+ lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4254+ goto out;
4255+ }
4256+
4257+ down_write(&ls->ls_unlock_sem);
4258+ /* Can't dequeue a lock with sublocks */
4259+ if (atomic_read(&lkb->lkb_childcnt)) {
4260+ up_write(&ls->ls_unlock_sem);
4261+ ret = -ENOTEMPTY;
4262+ goto out;
4263+ }
4264+ /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
4265+ if (!(flags & DLM_LKF_CANCEL))
4266+ lkb->lkb_flags |= GDLM_LKFLG_DELETED;
4267+ up_write(&ls->ls_unlock_sem);
4268+
4269+ down_read(&ls->ls_in_recovery);
4270+ rsb = find_rsb_to_unlock(ls, lkb);
4271+
4272+ log_debug(ls, "(%d) un %x %x %d %d \"%s\"",
4273+ lkb->lkb_ownpid,
4274+ lkb->lkb_id,
4275+ lkb->lkb_flags,
4276+ lkb->lkb_nodeid,
4277+ rsb->res_nodeid,
4278+ rsb->res_name);
4279+
4280+ /* Save any new params */
4281+ if (lksb)
4282+ lkb->lkb_lksb = lksb;
4283+ lkb->lkb_astparam = (long) astarg;
4284+ lkb->lkb_lockqueue_flags = flags;
4285+
4286+ if (lkb->lkb_nodeid)
4287+ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
4288+ else
4289+ ret = dlm_unlock_stage2(lkb, rsb, flags);
4290+ up_read(&ls->ls_in_recovery);
4291+
4292+ wake_astd();
4293+
4294+ out:
4295+ put_lockspace(ls);
4296+ return ret;
4297+}
4298+
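Continuing the earlier sketch after dlm_lock(): the lock id handed back in sb_lkid is what identifies the lock here, and success is reported asynchronously as -DLM_EUNLOCK in sb_status:

static int drop_lock(void *lockspace, struct dlm_lksb *lksb, void *astarg)
{
    /* sketch: the same completion AST fires once more for the unlock */
    return dlm_unlock(lockspace, lksb->sb_lkid, 0, lksb, astarg);
}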
4299+int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags)
4300+{
4301+ int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
4302+ int old_status;
4303+
4304+ down_write(&rsb->res_lock);
4305+
4306+ /* Can only cancel WAITING or CONVERTing locks */
4307+ if ((flags & DLM_LKF_CANCEL) &&
4308+ (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
4309+ lkb->lkb_retstatus = -EINVAL;
4310+ queue_ast(lkb, AST_COMP, 0);
4311+ goto out;
4312+ }
4313+
4314+ log_debug2("u %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4315+ lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4316+ lkb->lkb_status, rsb->res_name);
4317+
4318+ old_status = lkb_dequeue(lkb);
4319+
4320+ /*
4321+ * Cancelling a conversion
4322+ */
4323+
4324+ if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
4325+ /* VMS semantics say we should send blocking ASTs again here */
4326+ send_blocking_asts(rsb, lkb);
4327+
4328+ /* Remove from deadlock detection */
4329+ if (lkb->lkb_duetime)
4330+ remove_from_deadlockqueue(lkb);
4331+
4332+ /* Stick it back on the granted queue */
4333+ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4334+ lkb->lkb_rqmode = lkb->lkb_grmode;
4335+
4336+ /* Was it blocking any other locks? */
4337+ if (first_in_list(lkb, &rsb->res_convertqueue))
4338+ grant_pending_locks(rsb);
4339+
4340+ lkb->lkb_retstatus = -DLM_ECANCEL;
4341+ queue_ast(lkb, AST_COMP, 0);
4342+ goto out;
4343+ }
4344+
4345+ /*
4346+ * If it was granted, grant any converting or waiting locks
4347+ * and save or clear lvb
4348+ */
4349+
4350+ if (old_status == GDLM_LKSTS_GRANTED) {
4351+ if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
4352+ if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
4353+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr,
4354+ DLM_LVB_LEN);
4355+ if (flags & DLM_LKF_IVVALBLK)
4356+ memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4357+ }
4358+
4359+ grant_pending_locks(rsb);
4360+ } else
4361+ DLM_ASSERT(0, print_lkb(lkb); print_rsb(rsb););
4362+
4363+ lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
4364+
4365+ if (!remote) {
4366+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
4367+ } else {
4368+ up_write(&rsb->res_lock);
4369+ release_lkb(rsb->res_ls, lkb);
4370+ release_rsb(rsb);
4371+ goto out2;
4372+ }
4373+
4374+ out:
4375+ up_write(&rsb->res_lock);
4376+ out2:
4377+ wake_astd();
4378+ return 0;
4379+}
4380+
4381+/*
4382+ * Lock conversion
4383+ */
4384+
4385+static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
4386+ uint32_t flags, void *ast, void *astarg, void *bast,
4387+ struct dlm_range *range)
4388+{
4389+ struct dlm_lkb *lkb;
4390+ struct dlm_rsb *rsb;
4391+ int ret = -EINVAL;
4392+
4393+ lkb = find_lock_by_id(ls, lksb->sb_lkid);
4394+ if (!lkb) {
4395+ goto out;
4396+ }
4397+
4398+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4399+ ret = -EBUSY;
4400+ goto out;
4401+ }
4402+
4403+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4404+ goto out;
4405+ }
4406+
4407+ if ((flags & DLM_LKF_QUECVT) &&
4408+ !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4409+ goto out;
4410+ }
4411+
4412+ if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4413+ goto out;
4414+ }
4415+
4416+#ifdef CONFIG_DLM_STATS
4417+ dlm_stats.convertops++;
4418+#endif
4419+ /* Set up the ranges as appropriate */
4420+ if (range) {
4421+ if (range->ra_start > range->ra_end)
4422+ goto out;
4423+
4424+ if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4425+ ret = -ENOMEM;
4426+ goto out;
4427+ }
4428+ }
4429+
4430+ rsb = lkb->lkb_resource;
4431+ down_read(&ls->ls_in_recovery);
4432+
4433+ log_debug(ls, "(%d) cv %u %x \"%s\"", lkb->lkb_ownpid, mode,
4434+ lkb->lkb_id, rsb->res_name);
4435+
4436+ lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4437+ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4438+
4439+ if (flags & DLM_LKF_NODLCKWT)
4440+ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4441+ lkb->lkb_astaddr = ast;
4442+ lkb->lkb_astparam = (long) astarg;
4443+ lkb->lkb_bastaddr = bast;
4444+ lkb->lkb_rqmode = mode;
4445+ lkb->lkb_lockqueue_flags = flags;
4446+ lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4447+ lkb->lkb_lvbptr = lksb->sb_lvbptr;
4448+
4449+ if (rsb->res_nodeid) {
4450+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4451+ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4452+ } else {
4453+ ret = dlm_convert_stage2(lkb, FALSE);
4454+ }
4455+
4456+ up_read(&ls->ls_in_recovery);
4457+
4458+ wake_astd();
4459+
4460+ out:
4461+ return ret;
4462+}
4463+
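Callers reach this path through dlm_lock() with DLM_LKF_CONVERT set; only the lksb (carrying sb_lkid) identifies the lock, and name/namelen are not used on the conversion path (a sketch):

static int downgrade_to_pr(void *lockspace, struct dlm_lksb *lksb,
                           void (*ast)(void *astarg), void *astarg)
{
    return dlm_lock(lockspace, DLM_LOCK_PR, lksb, DLM_LKF_CONVERT,
                    NULL, 0, 0, ast, astarg, NULL, NULL);
}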
4464+/*
4465+ * For local conversion requests on locally mastered locks this is called
4466+ * directly from dlm_lock/convert_lock. This function is also called for
4467+ * remote conversion requests of MSTCPY locks (from process_cluster_request).
4468+ */
4469+
4470+int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast)
4471+{
4472+ struct dlm_rsb *rsb = lkb->lkb_resource;
4473+ int ret = 0;
4474+
4475+ down_write(&rsb->res_lock);
4476+
4477+ if (can_be_granted(rsb, lkb, TRUE)) {
4478+ grant_lock(lkb, 0);
4479+ grant_pending_locks(rsb);
4480+ goto out;
4481+ }
4482+
4483+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4484+ ret = lkb->lkb_retstatus = -EAGAIN;
4485+ if (do_ast)
4486+ queue_ast(lkb, AST_COMP, 0);
4487+ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4488+ send_blocking_asts_all(rsb, lkb);
4489+ goto out;
4490+ }
4491+
4492+ log_debug2("c %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4493+ lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4494+ lkb->lkb_status, rsb->res_name);
4495+
4496+ lkb->lkb_retstatus = 0;
4497+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4498+
4499+ /*
4500+ * The granted mode may have been reduced to NL by conversion deadlock
4501+ * avoidance in can_be_granted(). If so, try to grant other locks.
4502+ */
4503+
4504+ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
4505+ grant_pending_locks(rsb);
4506+
4507+ send_blocking_asts(rsb, lkb);
4508+
4509+ if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4510+ add_to_deadlockqueue(lkb);
4511+
4512+ out:
4513+ up_write(&rsb->res_lock);
4514+ return ret;
4515+}
4516+
4517+/*
4518+ * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4519+ * completion ast. rsb res_lock must be held in write when this is called.
4520+ */
4521+
4522+static void grant_lock(struct dlm_lkb *lkb, int send_remote)
4523+{
4524+ struct dlm_rsb *rsb = lkb->lkb_resource;
4525+
4526+ if (lkb->lkb_duetime)
4527+ remove_from_deadlockqueue(lkb);
4528+
4529+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4530+ int b;
4531+ DLM_ASSERT(lkb->lkb_lvbptr,);
4532+
4533+ if (!rsb->res_lvbptr)
4534+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4535+
4536+ b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4537+ if (b)
4538+ memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4539+ else
4540+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4541+ }
4542+
4543+ if (lkb->lkb_range) {
4544+ lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4545+ lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4546+ }
4547+
4548+ log_debug2("g %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4549+ lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4550+ lkb->lkb_status, rsb->res_name);
4551+
4552+ if (lkb->lkb_grmode != lkb->lkb_rqmode) {
4553+ lkb->lkb_grmode = lkb->lkb_rqmode;
4554+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4555+ }
4556+ lkb->lkb_rqmode = DLM_LOCK_IV;
4557+ lkb->lkb_highbast = 0;
4558+ lkb->lkb_retstatus = 0;
4559+ queue_ast(lkb, AST_COMP, 0);
4560+
4561+ /*
4562+ * A remote conversion request has been granted, either immediately
4563+ * upon being requested or after waiting a bit. In the former case,
4564+ * reply_and_grant() is called. In the later case send_remote is 1 and
4565+ * reply_and_grant() is called. In the latter case send_remote is 1 and
4566+ *
4567+ * The "send_remote" flag is set only for locks which are granted "out
4568+ * of band" - ie by another lock being converted or unlocked.
4569+ *
4570+ * The second case occurs when this lkb is granted right away as part
4571+ * of processing the initial request. In that case, we send a single
4572+ * message in reply_and_grant which combines the request reply with the
4573+ * grant message.
4574+ */
4575+
4576+ if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4577+ if (send_remote)
4578+ remote_grant(lkb);
4579+ else if (lkb->lkb_request)
4580+ reply_and_grant(lkb);
4581+ }
4582+
4583+}
4584+
4585+static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb)
4586+{
4587+ struct dlm_lkb *gr;
4588+
4589+ list_for_each_entry(gr, head, lkb_statequeue) {
4590+ if (gr->lkb_bastaddr &&
4591+ gr->lkb_highbast < lkb->lkb_rqmode &&
4592+ ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4593+ queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4594+ gr->lkb_highbast = lkb->lkb_rqmode;
4595+ }
4596+ }
4597+}
4598+
4599+/*
4600+ * Notify granted locks if they are blocking a newly forced-to-wait lock.
4601+ */
4602+
4603+static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4604+{
4605+ send_bast_queue(&rsb->res_grantqueue, lkb);
4606+ /* check if the following improves performance */
4607+ /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4608+}
4609+
4610+static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4611+{
4612+ send_bast_queue(&rsb->res_grantqueue, lkb);
4613+ send_bast_queue(&rsb->res_convertqueue, lkb);
4614+}
4615+
4616+/*
4617+ * Called when a lock has been dequeued. Look for any locks to grant that are
4618+ * waiting for conversion or waiting to be granted.
4619+ * The rsb res_lock must be held in write when this function is called.
4620+ */
4621+
4622+int grant_pending_locks(struct dlm_rsb *r)
4623+{
4624+ struct dlm_lkb *lkb, *s;
4625+ int8_t high = DLM_LOCK_IV;
4626+
4627+ list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
4628+ if (can_be_granted(r, lkb, FALSE))
4629+ grant_lock(lkb, 1);
4630+ else
4631+ high = MAX(lkb->lkb_rqmode, high);
4632+ }
4633+
4634+ list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
4635+ if (lkb->lkb_lockqueue_state)
4636+ continue;
4637+
4638+ if (can_be_granted(r, lkb, FALSE))
4639+ grant_lock(lkb, 1);
4640+ else
4641+ high = MAX(lkb->lkb_rqmode, high);
4642+ }
4643+
4644+ /*
4645+ * If there are locks left on the wait/convert queue then send blocking
4646+ * ASTs to granted locks that are blocking
4647+ *
4648+ * FIXME: This might generate some spurious blocking ASTs for range
4649+ * locks.
4650+ */
4651+
4652+ if (high > DLM_LOCK_IV) {
4653+ list_for_each_entry_safe(lkb, s, &r->res_grantqueue,
4654+ lkb_statequeue) {
4655+ if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
4656+ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4657+ queue_ast(lkb, AST_BAST, high);
4658+ lkb->lkb_highbast = high;
4659+ }
4660+ }
4661+ }
4662+
4663+ return 0;
4664+}
4665+
4666+/*
4667+ * Called to cancel a locking operation that failed due to some internal
4668+ * reason.
4669+ *
4670+ * Waiting locks will be removed, converting locks will be reverted to their
4671+ * granted status, unlocks will be left where they are.
4672+ *
4673+ * A completion AST will be delivered to the caller.
4674+ */
4675+
4676+int cancel_lockop(struct dlm_lkb *lkb, int status)
4677+{
4678+ int state = lkb->lkb_lockqueue_state;
4679+ uint16_t astflags = AST_COMP;
4680+
4681+ lkb->lkb_lockqueue_state = 0;
4682+
4683+ switch (state) {
4684+ case GDLM_LQSTATE_WAIT_RSB:
4685+ astflags |= AST_DEL;
4686+ break;
4687+
4688+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4689+ res_lkb_dequeue(lkb);
4690+ astflags |= AST_DEL;
4691+ break;
4692+
4693+ case GDLM_LQSTATE_WAIT_CONVERT:
4694+ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4695+
4696+ /* Remove from deadlock detection */
4697+ if (lkb->lkb_duetime) {
4698+ remove_from_deadlockqueue(lkb);
4699+ }
4700+ break;
4701+
4702+ case GDLM_LQSTATE_WAIT_UNLOCK:
4703+ /* We can leave this. I think.... */
4704+ break;
4705+ }
4706+
4707+ lkb->lkb_retstatus = status;
4708+ queue_ast(lkb, astflags, 0);
4709+
4710+ return 0;
4711+}
4712+
4713+/*
4714+ * Check for conversion deadlock. If a deadlock was found
4715+ * return lkb to kill, else return NULL
4716+ */
4717+
4718+struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb)
4719+{
4720+ struct dlm_rsb *rsb = lkb->lkb_resource;
4721+ struct list_head *entry;
4722+
4723+ DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4724+
4725+ /* Work our way up to the head of the queue looking for locks that
4726+ * conflict with us */
4727+
4728+ down_read(&rsb->res_lock);
4729+
4730+ entry = lkb->lkb_statequeue.prev;
4731+ while (entry != &rsb->res_convertqueue) {
4732+ struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue);
4733+
4734+ if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4735+ up_read(&rsb->res_lock);
4736+ return lkb;
4737+ }
4738+ entry = entry->prev;
4739+ }
4740+ up_read(&rsb->res_lock);
4741+
4742+ return 0;
4743+}
4744+
4745+/*
4746+ * Conversion operation was cancelled by us (not the user).
4747+ * ret contains the return code to pass onto the user
4748+ */
4749+
4750+void cancel_conversion(struct dlm_lkb *lkb, int ret)
4751+{
4752+ struct dlm_rsb *rsb = lkb->lkb_resource;
4753+
4754+ /* Stick it back on the granted queue */
4755+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4756+ lkb->lkb_rqmode = lkb->lkb_grmode;
4757+
4758+ remove_from_deadlockqueue(lkb);
4759+
4760+ lkb->lkb_retstatus = ret;
4761+ queue_ast(lkb, AST_COMP, 0);
4762+ wake_astd();
4763+}
4764+
4765+/*
4766+ * As new master of the rsb for this lkb, we need to handle these requests
4767+ * removed from the lockqueue and originating from local processes:
4768+ * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4769+ * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4770+ */
4771+
4772+void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state)
4773+{
4774+ struct dlm_rsb *rsb;
4775+
4776+ switch (state) {
4777+ case GDLM_LQSTATE_WAIT_RSB:
4778+ dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4779+ lkb->lkb_lockqueue_flags,
4780+ lkb->lkb_resource->res_name,
4781+ lkb->lkb_resource->res_length);
4782+ break;
4783+
4784+ case GDLM_LQSTATE_WAIT_CONDGRANT:
4785+ res_lkb_dequeue(lkb);
4786+ dlm_lock_stage3(lkb);
4787+ break;
4788+
4789+ case GDLM_LQSTATE_WAIT_UNLOCK:
4790+ rsb = find_rsb_to_unlock(ls, lkb);
4791+ dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags);
4792+ break;
4793+
4794+ case GDLM_LQSTATE_WAIT_CONVERT:
4795+ dlm_convert_stage2(lkb, TRUE);
4796+ break;
4797+
4798+ default:
4799+ DLM_ASSERT(0,);
4800+ }
4801+}
4802+
4803+static void dump_queue(struct list_head *head, char *qname)
4804+{
4805+ struct dlm_lkb *lkb;
4806+
4807+ list_for_each_entry(lkb, head, lkb_statequeue) {
4808+ printk("%s %08x gr %d rq %d flg %x sts %u node %u remid %x "
4809+ "lq %d,%x\n",
4810+ qname,
4811+ lkb->lkb_id,
4812+ lkb->lkb_grmode,
4813+ lkb->lkb_rqmode,
4814+ lkb->lkb_flags,
4815+ lkb->lkb_status,
4816+ lkb->lkb_nodeid,
4817+ lkb->lkb_remid,
4818+ lkb->lkb_lockqueue_state,
4819+ lkb->lkb_lockqueue_flags);
4820+ }
4821+}
4822+
4823+static void dump_rsb(struct dlm_rsb *rsb)
4824+{
4825+ printk("name \"%s\" flags %lx nodeid %d ref %u\n",
4826+ rsb->res_name, rsb->res_flags, rsb->res_nodeid,
4827+ atomic_read(&rsb->res_ref));
4828+
4829+ if (!list_empty(&rsb->res_grantqueue))
4830+ dump_queue(&rsb->res_grantqueue, "G");
4831+
4832+ if (!list_empty(&rsb->res_convertqueue))
4833+ dump_queue(&rsb->res_convertqueue, "C");
4834+
4835+ if (!list_empty(&rsb->res_waitqueue))
4836+ dump_queue(&rsb->res_waitqueue, "W");
4837+}
4838+
4839+void dlm_locks_dump(void)
4840+{
4841+ struct dlm_ls *ls;
4842+ struct dlm_rsb *rsb;
4843+ struct list_head *head;
4844+ int i;
4845+
4846+ lowcomms_stop_accept();
4847+
4848+ list_for_each_entry(ls, &lslist, ls_list) {
4849+ down_write(&ls->ls_in_recovery);
4850+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
4851+ head = &ls->ls_rsbtbl[i].list;
4852+ list_for_each_entry(rsb, head, res_hashchain)
4853+ dump_rsb(rsb);
4854+ }
4855+ }
4856+}
4857+
4858diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4859--- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
4860+++ linux-patched/cluster/dlm/locking.h 2004-11-03 11:31:56.000000000 +0800
4861@@ -0,0 +1,33 @@
4862+/******************************************************************************
4863+*******************************************************************************
4864+**
4865+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4866+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4867+**
4868+** This copyrighted material is made available to anyone wishing to use,
4869+** modify, copy, or redistribute it subject to the terms and conditions
4870+** of the GNU General Public License v.2.
4871+**
4872+*******************************************************************************
4873+******************************************************************************/
4874+
4875+#ifndef __LOCKING_DOT_H__
4876+#define __LOCKING_DOT_H__
4877+
4878+int dlm_modes_compat(int mode1, int mode2);
4879+void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state);
4880+void dlm_lock_stage3(struct dlm_lkb *lkb);
4881+int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast);
4882+int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4883+int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4884+struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen);
4885+int free_rsb_if_unused(struct dlm_rsb *rsb);
4886+struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace,
4887+ struct dlm_request *freq);
4888+int cancel_lockop(struct dlm_lkb *lkb, int status);
4889+int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags);
4890+int grant_pending_locks(struct dlm_rsb *rsb);
4891+void cancel_conversion(struct dlm_lkb *lkb, int ret);
4892+struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb);
4893+
4894+#endif /* __LOCKING_DOT_H__ */
4895diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4896--- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4897+++ linux-patched/cluster/dlm/lockqueue.c 2004-11-03 11:31:56.000000000 +0800
4898@@ -0,0 +1,1159 @@
4899+/******************************************************************************
4900+*******************************************************************************
4901+**
4902+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4903+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4904+**
4905+** This copyrighted material is made available to anyone wishing to use,
4906+** modify, copy, or redistribute it subject to the terms and conditions
4907+** of the GNU General Public License v.2.
4908+**
4909+*******************************************************************************
4910+******************************************************************************/
4911+
4912+/*
4913+ * lockqueue.c
4914+ *
4915+ * This controls the lock queue, which is where locks
4916+ * come when they need to wait for a remote operation
4917+ * to complete.
4918+ *
4919+ * This could also be thought of as the "high-level" comms
4920+ * layer.
4921+ *
4922+ */
4923+
4924+#include "dlm_internal.h"
4925+#include "lockqueue.h"
4926+#include "dir.h"
4927+#include "locking.h"
4928+#include "lkb.h"
4929+#include "lowcomms.h"
4930+#include "midcomms.h"
4931+#include "reccomms.h"
4932+#include "nodes.h"
4933+#include "lockspace.h"
4934+#include "ast.h"
4935+#include "memory.h"
4936+#include "rsb.h"
4937+#include "queries.h"
4938+#include "util.h"
4939+
4940+static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply);
4941+static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req);
4942+
4943+/*
4944+ * format of an entry on the request queue
4945+ */
4946+struct rq_entry {
4947+ struct list_head rqe_list;
4948+ uint32_t rqe_nodeid;
4949+ char rqe_request[1];
4950+};
4951+
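rqe_request[1] is the old-style variable-length-tail idiom: the entry header and the copied message share one allocation, exactly as add_to_requestqueue() below does. The idiom in isolation:

static struct rq_entry *example_make_entry(int nodeid, struct dlm_header *hd)
{
    struct rq_entry *entry =
        kmalloc(sizeof(struct rq_entry) + hd->rh_length, GFP_KERNEL);

    if (entry) {
        entry->rqe_nodeid = nodeid;
        memcpy(entry->rqe_request, hd, hd->rh_length);
    }
    return entry;
}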
4952+/*
4953+ * Add a new request (if appropriate) to the request queue and send the remote
4954+ * request out. Runs in the context of the locking caller.
4955+ *
4956+ * Recovery of a remote_stage request if the remote end fails while the lkb
4957+ * is still on the lockqueue:
4958+ *
4959+ * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4960+ * lockqueue_lkb_mark() at the start of recovery.
4961+ *
4962+ * o Some lkb's will be rebuilt on new master rsb's during recovery.
4963+ * (depends on the type of request, see below).
4964+ *
4965+ * o At the end of recovery, resend_cluster_requests() looks at these
4966+ * LQRESEND lkb's and either:
4967+ *
4968+ * i) resends the request to the new master for the rsb where the
4969+ * request is processed as usual. The lkb remains on the lockqueue until
4970+ * the new master replies and we run process_lockqueue_reply().
4971+ *
4972+ * ii) if we've become the rsb master, remove the lkb from the lockqueue
4973+ * and process the request locally via process_remastered_lkb().
4974+ *
4975+ * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4976+ * and the request should be resent if dest node is failed.
4977+ *
4978+ * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4979+ * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4980+ * makes send_lkb_queue() skip it). Resend this request to the new master.
4981+ *
4982+ * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4983+ * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4984+ * Resend this request to the new master.
4985+ *
4986+ * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4987+ * It will be rebuilt on the new master rsb's granted queue. Resend this
4988+ * request to the new master.
4989+ */
4990+
4991+int remote_stage(struct dlm_lkb *lkb, int state)
4992+{
4993+ int error;
4994+
4995+ lkb->lkb_lockqueue_state = state;
4996+ add_to_lockqueue(lkb);
4997+
4998+ error = send_cluster_request(lkb, state);
4999+ if (error < 0) {
5000+ log_error(lkb->lkb_resource->res_ls, "remote_stage error %d %x",
5001+ error, lkb->lkb_id);
5002+ /* Leave on lockqueue, it will be resent to correct node during
5003+ * recovery. */
5004+ }
5005+ return 0;
5006+}
5007+
5008+/*
5009+ * Requests received while the lockspace is in recovery get added to the
5010+ * request queue and processed when recovery is complete.
5011+ */
5012+
5013+void add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
5014+{
5015+ struct rq_entry *entry;
5016+ int length = hd->rh_length;
5017+
5018+ if (test_bit(LSFL_REQUEST_WARN, &ls->ls_flags))
5019+ log_error(ls, "request during recovery from %u", nodeid);
5020+
5021+ if (in_nodes_gone(ls, nodeid))
5022+ return;
5023+
5024+ entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
5025+ if (!entry) {
5026+ // TODO something better
5027+ printk("dlm: add_to_requestqueue: out of memory\n");
5028+ return;
5029+ }
5030+
5031+ log_debug(ls, "add_to_requestq cmd %d fr %d", hd->rh_cmd, nodeid);
5032+ entry->rqe_nodeid = nodeid;
5033+ memcpy(entry->rqe_request, hd, length);
5034+
5035+ down(&ls->ls_requestqueue_lock);
c1c6733f 5036+ list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
5037+ up(&ls->ls_requestqueue_lock);
5038+}
5039+
5040+int process_requestqueue(struct dlm_ls *ls)
5041+{
5042+ int error = 0, count = 0;
5043+ struct rq_entry *entry;
5044+ struct dlm_header *hd;
5045+
5046+ log_all(ls, "process held requests");
5047+
5048+ down(&ls->ls_requestqueue_lock);
5049+
5050+ for (;;) {
5051+ if (list_empty(&ls->ls_requestqueue)) {
5052+ up(&ls->ls_requestqueue_lock);
5053+ error = 0;
5054+ break;
5055+ }
5056+
5057+ entry = list_entry(ls->ls_requestqueue.next, struct rq_entry,
5058+ rqe_list);
5059+ up(&ls->ls_requestqueue_lock);
5060+ hd = (struct dlm_header *) entry->rqe_request;
5061+
5062+ log_debug(ls, "process_requestq cmd %d fr %u", hd->rh_cmd,
5063+ entry->rqe_nodeid);
5064+
5065+ error = process_cluster_request(entry->rqe_nodeid, hd, TRUE);
5066+ if (error == -EINTR) {
5067+ /* entry is left on requestqueue */
5068+ log_debug(ls, "process_requestqueue abort eintr");
c1c6733f
AM
5069+ break;
5070+ }
5071+
5072+ down(&ls->ls_requestqueue_lock);
5073+ list_del(&entry->rqe_list);
5074+ kfree(entry);
5075+ count++;
5076+
5077+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5078+ log_debug(ls, "process_requestqueue abort ls_run");
5079+ up(&ls->ls_requestqueue_lock);
5080+ error = -EINTR;
5081+ break;
5082+ }
5083+ }
5084+
5085+ log_all(ls, "processed %d requests", count);
5086+ return error;
5087+}
5088+
5089+void wait_requestqueue(struct dlm_ls *ls)
5090+{
5091+ for (;;) {
5092+ down(&ls->ls_requestqueue_lock);
5093+ if (list_empty(&ls->ls_requestqueue))
5094+ break;
5095+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
5096+ break;
5097+ up(&ls->ls_requestqueue_lock);
5098+ schedule();
5099+ }
5100+ up(&ls->ls_requestqueue_lock);
5101+}
5102+
5103+/*
5104+ * Resdir requests (lookup or remove) and replies from before recovery are
5105+ * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
5106+ * gone are also invalid.
5107+ */
5108+
5109+void purge_requestqueue(struct dlm_ls *ls)
5110+{
5111+ int count = 0;
5112+ struct rq_entry *entry, *safe;
5113+ struct dlm_header *hd;
5114+ struct dlm_lkb *lkb;
5115+
5116+ log_all(ls, "purge requests");
5117+
5118+ down(&ls->ls_requestqueue_lock);
5119+
c1c6733f 5120+ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
5121+ hd = (struct dlm_header *) entry->rqe_request;
5122+
5123+ if (hd->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
5124+ hd->rh_cmd == GDLM_REMCMD_LOOKUP ||
5125+ in_nodes_gone(ls, entry->rqe_nodeid)) {
5126+
5127+ list_del(&entry->rqe_list);
5128+ kfree(entry);
5129+ count++;
5130+
5131+ } else if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
5132+
5133+ /*
5134+ * Replies to resdir lookups are invalid and must be
5135+ * purged. The lookup requests are marked in
5136+ * lockqueue_lkb_mark and will be resent in
5137+ * resend_cluster_requests. The only way to check if
5138+ * this is a lookup reply is to look at the
5139+ * lockqueue_state of the lkb.
5140+ */
5141+
5142+ lkb = find_lock_by_id(ls, hd->rh_lkid);
5143+ DLM_ASSERT(lkb,);
5144+ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
5145+ list_del(&entry->rqe_list);
5146+ kfree(entry);
5147+ count++;
5148+ }
5149+ }
5150+ }
5151+ up(&ls->ls_requestqueue_lock);
5152+
5153+ log_all(ls, "purged %d requests", count);
5154+}
5155+
5156+/*
5157+ * Check if there's a reply for the given lkid in the requestqueue.
5158+ */
5159+
5160+int reply_in_requestqueue(struct dlm_ls *ls, int lkid)
5161+{
5162+ int rv = FALSE;
5163+ struct rq_entry *entry;
5164+ struct dlm_header *hd;
5165+
5166+ down(&ls->ls_requestqueue_lock);
5167+
5168+ list_for_each_entry(entry, &ls->ls_requestqueue, rqe_list) {
5169+ hd = (struct dlm_header *) entry->rqe_request;
5170+ if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY && hd->rh_lkid == lkid){
5171+ log_debug(ls, "reply_in_requestq cmd %d fr %d id %x",
5172+ hd->rh_cmd, entry->rqe_nodeid, lkid);
5173+ rv = TRUE;
5174+ break;
5175+ }
5176+ }
5177+ up(&ls->ls_requestqueue_lock);
5178+
5179+ return rv;
5180+}
5181+
5182+void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src)
5183+{
5184+ if (!*lvbptr)
5185+ *lvbptr = allocate_lvb(ls);
5186+ if (*lvbptr)
5187+ memcpy(*lvbptr, src, DLM_LVB_LEN);
5188+}
5189+
5190+/*
5191+ * Process a lockqueue LKB after its remote processing is complete and it has
5192+ * been pulled from the lockqueue. Runs in the context of the DLM recvd thread
5193+ * on the machine that requested the lock.
c1c6733f
AM
5194+ */
5195+
b7b72b66
AM
5196+static void process_lockqueue_reply(struct dlm_lkb *lkb,
5197+ struct dlm_reply *reply,
5198+ uint32_t nodeid)
c1c6733f 5199+{
b7b72b66
AM
5200+ struct dlm_rsb *rsb = lkb->lkb_resource;
5201+ struct dlm_ls *ls = rsb->res_ls;
5202+ int oldstate, state = lkb->lkb_lockqueue_state;
c1c6733f 5203+
c1c6733f
AM
5204+ if (state)
5205+ remove_from_lockqueue(lkb);
5206+
5207+ switch (state) {
5208+ case GDLM_LQSTATE_WAIT_RSB:
5209+
b7b72b66
AM
5210+ if (reply->rl_status) {
5211+ DLM_ASSERT(reply->rl_status == -EEXIST,);
b7b72b66
AM
5212+ if (rsb->res_nodeid == -1) {
5213+ msleep(500);
5214+ remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
5215+ break;
5216+ }
5217+ } else {
5218+ if (reply->rl_nodeid == our_nodeid()) {
5219+ set_bit(RESFL_MASTER, &rsb->res_flags);
5220+ rsb->res_nodeid = 0;
5221+ } else {
5222+ clear_bit(RESFL_MASTER, &rsb->res_flags);
5223+ rsb->res_nodeid = reply->rl_nodeid;
5224+ }
5225+ }
c1c6733f 5226+
b7b72b66
AM
5227+ log_debug(ls, "(%d) lu rep %x fr %u %u", lkb->lkb_ownpid,
5228+ lkb->lkb_id, nodeid,
5229+ rsb->res_nodeid);
c1c6733f 5230+
b7b72b66
AM
5231+ lkb->lkb_nodeid = rsb->res_nodeid;
5232+ dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags);
c1c6733f
AM
5233+ break;
5234+
5235+ case GDLM_LQSTATE_WAIT_CONVERT:
5236+ case GDLM_LQSTATE_WAIT_CONDGRANT:
5237+
5238+ /*
b7b72b66
AM
5239+ * The destination node wasn't the master; this implies the
5240+ * request was a CONDGRANT.
5241+ */
5242+
5243+ if (reply->rl_status == -EINVAL) {
5244+ int master_nodeid;
5245+
5246+ DLM_ASSERT(state == GDLM_LQSTATE_WAIT_CONDGRANT, );
5247+
5248+ log_debug(ls, "(%d) req reply einval %x fr %d r %d %s",
5249+ lkb->lkb_ownpid, lkb->lkb_id, nodeid,
5250+ rsb->res_nodeid, rsb->res_name);
5251+
5252+ lkb_dequeue(lkb);
5253+
5254+ if (rsb->res_nodeid == lkb->lkb_nodeid || rsb->res_nodeid == -1) {
5255+ /*
5256+ * We need to re-lookup the master and resend our
5257+ * request to it.
5258+ */
5259+
5260+ lkb->lkb_nodeid = -1;
5261+ rsb->res_nodeid = -1;
5262+
5263+ if (get_directory_nodeid(rsb) != our_nodeid())
5264+ remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
5265+ else {
5266+ int error = dlm_dir_lookup(ls, our_nodeid(),
5267+ rsb->res_name,
5268+ rsb->res_length,
5269+ &master_nodeid);
5270+ if (error == -EEXIST) {
5271+ /* don't expect this will happen */
5272+ log_all(ls, "EEXIST %x", lkb->lkb_id);
5273+ print_lkb(lkb);
5274+ print_rsb(rsb);
5275+ }
5276+
5277+ if (master_nodeid == our_nodeid()) {
5278+ set_bit(RESFL_MASTER, &rsb->res_flags);
5279+ master_nodeid = 0;
5280+ } else
5281+ clear_bit(RESFL_MASTER,&rsb->res_flags);
5282+
5283+ rsb->res_nodeid = master_nodeid;
5284+ lkb->lkb_nodeid = master_nodeid;
5285+
5286+ dlm_lock_stage2(ls, lkb, rsb,
5287+ lkb->lkb_lockqueue_flags);
5288+ }
5289+ } else {
5290+ /*
5291+ * Another request on this rsb has since found
5292+ * the master, we'll use that one although it too
5293+ * may be invalid requiring us to retry again.
5294+ */
5295+
5296+ lkb->lkb_nodeid = rsb->res_nodeid;
5297+ dlm_lock_stage2(ls, lkb, rsb,
5298+ lkb->lkb_lockqueue_flags);
5299+ }
5300+
5301+ break;
5302+ }
5303+
5304+
5305+ /*
c1c6733f
AM
5306+ * After a remote lock/conversion/grant request we put the lock
5307+ * on the right queue and send an AST if appropriate. Any lock
5308+ * shuffling (eg newly granted locks because this one was
5309+ * converted downwards) will be dealt with in separate messages
5310+ * (which may be in the same network message)
5311+ */
5312+
5313+ if (!lkb->lkb_remid)
5314+ lkb->lkb_remid = reply->rl_lkid;
5315+
5316+ /*
5317+ * The remote request failed (we assume because of NOQUEUE).
5318+ * If this is a new request (non-conv) the lkb was created just
5319+ * for it so the lkb should be freed. If this was a
5320+ * conversion, the lkb already existed so we should put it back
5321+ * on the grant queue.
5322+ */
5323+
5324+ if (reply->rl_status != 0) {
b7b72b66 5325+ DLM_ASSERT(reply->rl_status == -EAGAIN,);
c1c6733f
AM
5326+
5327+ if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
5328+ res_lkb_dequeue(lkb);
b7b72b66
AM
5329+ lkb->lkb_retstatus = reply->rl_status;
5330+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
5331+ } else {
c1c6733f 5332+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
b7b72b66
AM
5333+ lkb->lkb_retstatus = reply->rl_status;
5334+ queue_ast(lkb, AST_COMP, 0);
5335+ }
c1c6733f
AM
5336+ break;
5337+ }
5338+
5339+ /*
5340+ * The remote request was successful in granting the request or
5341+ * queuing it to be granted later. Add the lkb to the
5342+ * appropriate rsb queue.
5343+ */
5344+
5345+ switch (reply->rl_lockstate) {
5346+ case GDLM_LKSTS_GRANTED:
5347+
5348+ /* Compact version of grant_lock(). */
5349+
5350+ down_write(&rsb->res_lock);
5351+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5352+ memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
5353+ DLM_LVB_LEN);
5354+
5355+ lkb->lkb_grmode = lkb->lkb_rqmode;
5356+ lkb->lkb_rqmode = DLM_LOCK_IV;
5357+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5358+
5359+ if (lkb->lkb_range) {
5360+ lkb->lkb_range[GR_RANGE_START] =
5361+ lkb->lkb_range[RQ_RANGE_START];
5362+ lkb->lkb_range[GR_RANGE_END] =
5363+ lkb->lkb_range[RQ_RANGE_END];
5364+ }
5365+ up_write(&rsb->res_lock);
5366+
5367+ lkb->lkb_retstatus = 0;
b7b72b66 5368+ queue_ast(lkb, AST_COMP, 0);
c1c6733f
AM
5369+ break;
5370+
5371+ case GDLM_LKSTS_WAITING:
5372+
5373+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5374+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
5375+ else
5376+ log_error(ls, "wait reply for granted %x %u",
5377+ lkb->lkb_id, lkb->lkb_nodeid);
5378+ break;
5379+
5380+ case GDLM_LKSTS_CONVERT:
5381+
5382+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5383+ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
5384+ else
5385+ log_error(ls, "convert reply for granted %x %u",
5386+ lkb->lkb_id, lkb->lkb_nodeid);
5387+ break;
5388+
5389+ default:
5390+ log_error(ls, "process_lockqueue_reply state %d",
5391+ reply->rl_lockstate);
5392+ }
5393+
5394+ break;
5395+
5396+ case GDLM_LQSTATE_WAIT_UNLOCK:
5397+
5398+ /*
5399+ * Unlocks should never fail. Update local lock info. This
5400+ * always sends a completion AST with the status in the lksb.
5401+ */
5402+
b7b72b66 5403+ DLM_ASSERT(reply->rl_status == 0,);
c1c6733f
AM
5404+ oldstate = res_lkb_dequeue(lkb);
5405+
5406+ /* Differentiate between unlocks and conversion cancellations */
b7b72b66
AM
5407+ if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL) {
5408+ if (oldstate == GDLM_LKSTS_CONVERT) {
5409+ res_lkb_enqueue(lkb->lkb_resource, lkb,
5410+ GDLM_LKSTS_GRANTED);
5411+ lkb->lkb_retstatus = -DLM_ECANCEL;
5412+ queue_ast(lkb, AST_COMP, 0);
5413+ } else
5414+ log_error(ls, "cancel state %d", oldstate);
c1c6733f 5415+ } else {
b7b72b66
AM
5416+ DLM_ASSERT(oldstate == GDLM_LKSTS_GRANTED,
5417+ print_lkb(lkb););
5418+
c1c6733f 5419+ lkb->lkb_retstatus = -DLM_EUNLOCK;
b7b72b66 5420+ queue_ast(lkb, AST_COMP | AST_DEL, 0);
c1c6733f 5421+ }
c1c6733f
AM
5422+ break;
5423+
5424+ default:
5425+ log_error(ls, "process_lockqueue_reply id %x state %d",
5426+ lkb->lkb_id, state);
5427+ }
5428+}
5429+
5430+/*
5431+ * Tell a remote node to grant a lock. This happens when we hold the master
5432+ * copy of a lock that is actually held on a remote node. The remote end is
5433+ * also responsible for sending the completion AST.
5434+ */
5435+
b7b72b66 5436+void remote_grant(struct dlm_lkb *lkb)
c1c6733f
AM
5437+{
5438+ struct writequeue_entry *e;
b7b72b66 5439+ struct dlm_request *req;
c1c6733f
AM
5440+
5441+ // TODO Error handling
5442+ e = lowcomms_get_buffer(lkb->lkb_nodeid,
b7b72b66 5443+ sizeof(struct dlm_request),
c1c6733f
AM
5444+ lkb->lkb_resource->res_ls->ls_allocation,
5445+ (char **) &req);
5446+ if (!e)
5447+ return;
5448+
5449+ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
b7b72b66 5450+ req->rr_header.rh_length = sizeof(struct dlm_request);
c1c6733f
AM
5451+ req->rr_header.rh_flags = 0;
5452+ req->rr_header.rh_lkid = lkb->lkb_id;
5453+ req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
5454+ req->rr_remlkid = lkb->lkb_remid;
5455+ req->rr_flags = 0;
5456+
5457+ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
5458+ /* This is a confusing non-standard use of rr_flags which is
5459+ * usually used to pass lockqueue_flags. */
5460+ req->rr_flags |= GDLM_LKFLG_DEMOTED;
5461+ }
5462+
5463+ add_request_lvb(lkb, req);
5464+ midcomms_send_buffer(&req->rr_header, e);
5465+}
5466+
b7b72b66 5467+void reply_and_grant(struct dlm_lkb *lkb)
c1c6733f 5468+{
b7b72b66
AM
5469+ struct dlm_request *req = lkb->lkb_request;
5470+ struct dlm_reply *reply;
c1c6733f
AM
5471+ struct writequeue_entry *e;
5472+
5473+ // TODO Error handling
5474+ e = lowcomms_get_buffer(lkb->lkb_nodeid,
b7b72b66 5475+ sizeof(struct dlm_reply),
c1c6733f
AM
5476+ lkb->lkb_resource->res_ls->ls_allocation,
5477+ (char **) &reply);
5478+ if (!e)
5479+ return;
5480+
5481+ reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5482+ reply->rl_header.rh_flags = 0;
b7b72b66 5483+ reply->rl_header.rh_length = sizeof(struct dlm_reply);
c1c6733f
AM
5484+ reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5485+ reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5486+
5487+ reply->rl_status = lkb->lkb_retstatus;
5488+ reply->rl_lockstate = lkb->lkb_status;
5489+ reply->rl_lkid = lkb->lkb_id;
5490+
b7b72b66 5491+ DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
c1c6733f
AM
5492+
5493+ lkb->lkb_request = NULL;
5494+
5495+ add_reply_lvb(lkb, reply);
5496+ midcomms_send_buffer(&reply->rl_header, e);
5497+}
5498+
5499+/*
5500+ * Request removal of a dead entry in the resource directory
5501+ */
5502+
b7b72b66
AM
5503+void remote_remove_direntry(struct dlm_ls *ls, int nodeid, char *name,
5504+ int namelen)
c1c6733f
AM
5505+{
5506+ struct writequeue_entry *e;
b7b72b66 5507+ struct dlm_request *req;
c1c6733f
AM
5508+
5509+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
b7b72b66 5510+ struct dlm_rcom *rc = allocate_rcom_buffer(ls);
c1c6733f
AM
5511+
5512+ memcpy(rc->rc_buf, name, namelen);
5513+ rc->rc_datalen = namelen;
5514+
5515+ rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5516+
5517+ free_rcom_buffer(rc);
5518+ return;
5519+ }
5520+ // TODO Error handling
5521+ e = lowcomms_get_buffer(nodeid,
b7b72b66 5522+ sizeof(struct dlm_request) + namelen - 1,
c1c6733f
AM
5523+ ls->ls_allocation, (char **) &req);
5524+ if (!e)
5525+ return;
5526+
b7b72b66 5527+ memset(req, 0, sizeof(struct dlm_request) + namelen - 1);
c1c6733f
AM
5528+ req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5529+ req->rr_header.rh_length =
b7b72b66 5530+ sizeof(struct dlm_request) + namelen - 1;
c1c6733f
AM
5531+ req->rr_header.rh_flags = 0;
5532+ req->rr_header.rh_lkid = 0;
5533+ req->rr_header.rh_lockspace = ls->ls_global_id;
5534+ req->rr_remlkid = 0;
c1c6733f
AM
5535+ memcpy(req->rr_name, name, namelen);
5536+
5537+ midcomms_send_buffer(&req->rr_header, e);
5538+}
5539+
5540+/*
5541+ * Send remote cluster request to directory or master node before the request
5542+ * is put on the lock queue. Runs in the context of the locking caller.
5543+ */
5544+
b7b72b66 5545+int send_cluster_request(struct dlm_lkb *lkb, int state)
c1c6733f
AM
5546+{
5547+ uint32_t target_nodeid;
b7b72b66
AM
5548+ struct dlm_rsb *rsb = lkb->lkb_resource;
5549+ struct dlm_ls *ls = rsb->res_ls;
5550+ struct dlm_request *req;
c1c6733f
AM
5551+ struct writequeue_entry *e;
5552+
c1c6733f
AM
5553+ if (state == GDLM_LQSTATE_WAIT_RSB)
5554+ target_nodeid = get_directory_nodeid(rsb);
b7b72b66
AM
5555+ else
5556+ target_nodeid = lkb->lkb_nodeid;
c1c6733f 5557+
b7b72b66
AM
5558+ /* during recovery it's valid for target_nodeid to equal our own;
5559+ resend_cluster_requests does this to get requests back on track */
5560+
5561+ DLM_ASSERT(target_nodeid && target_nodeid != -1,
5562+ print_lkb(lkb);
5563+ print_rsb(rsb);
5564+ printk("target_nodeid %u\n", target_nodeid););
c1c6733f
AM
5565+
5566+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5567+ /* this may happen when called by resend_cluster_request */
5568+ log_error(ls, "send_cluster_request to %u state %d recovery",
5569+ target_nodeid, state);
5570+ }
5571+
5572+ e = lowcomms_get_buffer(target_nodeid,
b7b72b66 5573+ sizeof(struct dlm_request) +
c1c6733f
AM
5574+ rsb->res_length - 1, ls->ls_allocation,
5575+ (char **) &req);
5576+ if (!e)
5577+ return -ENOBUFS;
b7b72b66 5578+ memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1);
c1c6733f
AM
5579+
5580+ /* Common stuff, some are just defaults */
5581+
5582+ if (lkb->lkb_bastaddr)
b7b72b66 5583+ req->rr_asts = AST_BAST;
c1c6733f 5584+ if (lkb->lkb_astaddr)
b7b72b66 5585+ req->rr_asts |= AST_COMP;
c1c6733f
AM
5586+ if (lkb->lkb_parent)
5587+ req->rr_remparid = lkb->lkb_parent->lkb_remid;
5588+
5589+ req->rr_flags = lkb->lkb_lockqueue_flags;
5590+ req->rr_rqmode = lkb->lkb_rqmode;
5591+ req->rr_remlkid = lkb->lkb_remid;
b7b72b66 5592+ req->rr_pid = lkb->lkb_ownpid;
c1c6733f 5593+ req->rr_header.rh_length =
b7b72b66 5594+ sizeof(struct dlm_request) + rsb->res_length - 1;
c1c6733f
AM
5595+ req->rr_header.rh_flags = 0;
5596+ req->rr_header.rh_lkid = lkb->lkb_id;
5597+ req->rr_header.rh_lockspace = ls->ls_global_id;
5598+
5599+ switch (state) {
5600+
5601+ case GDLM_LQSTATE_WAIT_RSB:
5602+
b7b72b66
AM
5603+ DLM_ASSERT(!lkb->lkb_parent,
5604+ print_lkb(lkb);
5605+ print_rsb(rsb););
5606+
5607+ log_debug(ls, "(%d) send lu %x to %u",
5608+ lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
c1c6733f
AM
5609+
5610+ req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5611+ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5612+ break;
5613+
5614+ case GDLM_LQSTATE_WAIT_CONVERT:
5615+
b7b72b66
AM
5616+ DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5617+ print_lkb(lkb);
5618+ print_rsb(rsb););
5619+
5620+ log_debug(ls, "(%d) send cv %x to %u",
5621+ lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5622+
c1c6733f
AM
5623+ req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5624+ if (lkb->lkb_range) {
5625+ req->rr_flags |= GDLM_LKFLG_RANGE;
5626+ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5627+ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5628+ }
5629+ break;
5630+
5631+ case GDLM_LQSTATE_WAIT_CONDGRANT:
5632+
b7b72b66
AM
5633+ DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5634+ print_lkb(lkb);
5635+ print_rsb(rsb););
5636+
5637+ log_debug(ls, "(%d) send rq %x to %u",
5638+ lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5639+
c1c6733f 5640+ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
c1c6733f
AM
5641+ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5642+ if (lkb->lkb_range) {
5643+ req->rr_flags |= GDLM_LKFLG_RANGE;
5644+ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5645+ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5646+ }
5647+ break;
5648+
5649+ case GDLM_LQSTATE_WAIT_UNLOCK:
5650+
b7b72b66
AM
5651+ log_debug(ls, "(%d) send un %x to %u",
5652+ lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5653+
c1c6733f
AM
5654+ req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5655+ break;
5656+
5657+ default:
b7b72b66 5658+ DLM_ASSERT(0, printk("Unknown cluster request\n"););
c1c6733f
AM
5659+ }
5660+
5661+ add_request_lvb(lkb, req);
5662+ midcomms_send_buffer(&req->rr_header, e);
5663+
5664+ return 0;
5665+}
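
A note on the DLM_ASSERT() calls used throughout this file: the second argument is a statement list executed when the assertion fails, hence forms like DLM_ASSERT(lkb,); with an empty list. dlm_internal.h is not part of this hunk, so purely as a sketch of the assumed shape, not the actual definition:

/* Hypothetical reconstruction for illustration; the real macro lives
 * in dlm_internal.h. */
#define DLM_ASSERT(x, do_diag)						\
do {									\
	if (!(x)) {							\
		printk("DLM: assertion failed: %s\n", #x);		\
		do_diag;	/* caller-supplied diagnostics */	\
		BUG();							\
	}								\
} while (0)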
5666+
5667+/*
5668+ * We got a request from another cluster node; process it and return an info
5669+ * structure with the lock state/LVB etc as required. Executes in the DLM's
5670+ * recvd thread.
5671+ */
5672+
b7b72b66 5673+int process_cluster_request(int nodeid, struct dlm_header *req, int recovery)
c1c6733f 5674+{
b7b72b66
AM
5675+ struct dlm_ls *lspace;
5676+ struct dlm_lkb *lkb = NULL;
5677+ struct dlm_rsb *rsb;
c1c6733f 5678+ int send_reply = 0, status = 0, namelen;
b7b72b66
AM
5679+ struct dlm_request *freq = (struct dlm_request *) req;
5680+ struct dlm_reply *rp = (struct dlm_reply *) req;
5681+ struct dlm_reply reply;
c1c6733f
AM
5682+
5683+ lspace = find_lockspace_by_global_id(req->rh_lockspace);
5684+
5685+ if (!lspace) {
5686+ log_print("process_cluster_request invalid lockspace %x "
5687+ "from %d req %u", req->rh_lockspace, nodeid,
5688+ req->rh_cmd);
b7b72b66 5689+ return -EINVAL;
c1c6733f
AM
5690+ }
5691+
5692+ /* wait for recoverd to drain requestqueue */
5693+ if (!recovery)
5694+ wait_requestqueue(lspace);
5695+
5696+ /*
5697+ * If we're in recovery then queue the request for later. Otherwise,
5698+ * we still need to get the "in_recovery" lock to make sure the
5699+ * recovery itself doesn't start until we are done.
5700+ */
5701+ retry:
5702+ if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
b7b72b66
AM
5703+ if (!recovery)
5704+ add_to_requestqueue(lspace, nodeid, req);
c1c6733f
AM
5705+ status = -EINTR;
5706+ goto out;
5707+ }
5708+ if (!down_read_trylock(&lspace->ls_in_recovery)) {
5709+ schedule();
5710+ goto retry;
5711+ }
5712+
5713+
5714+ /*
5715+ * Process the request.
5716+ */
5717+
5718+ switch (req->rh_cmd) {
5719+
5720+ case GDLM_REMCMD_LOOKUP:
5721+ {
b7b72b66 5722+ uint32_t dir_nodeid, r_nodeid;
c1c6733f 5723+ int status;
c1c6733f
AM
5724+
5725+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5726+
5727+ dir_nodeid = name_to_directory_nodeid(lspace,
5728+ freq->rr_name,
5729+ namelen);
5730+ if (dir_nodeid != our_nodeid())
5731+ log_debug(lspace, "ignoring directory lookup");
5732+
b7b72b66
AM
5733+ status = dlm_dir_lookup(lspace, nodeid, freq->rr_name,
5734+ namelen, &r_nodeid);
c1c6733f
AM
5735+ reply.rl_status = status;
5736+ reply.rl_lockstate = 0;
b7b72b66 5737+ reply.rl_nodeid = r_nodeid;
c1c6733f
AM
5738+ }
5739+ send_reply = 1;
5740+ break;
5741+
5742+ case GDLM_REMCMD_REM_RESDATA:
5743+
5744+ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
b7b72b66 5745+ dlm_dir_remove(lspace, nodeid, freq->rr_name, namelen);
c1c6733f
AM
5746+ break;
5747+
5748+ case GDLM_REMCMD_LOCKREQUEST:
5749+
5750+ lkb = remote_stage2(nodeid, lspace, freq);
5751+ if (lkb) {
5752+ lkb->lkb_request = freq;
b7b72b66
AM
5753+ lkb->lkb_ownpid = freq->rr_pid;
5754+ if (lkb->lkb_retstatus != -EINVAL)
5755+ dlm_lock_stage3(lkb);
c1c6733f
AM
5756+
5757+ /*
5758+ * If the request was granted in lock_stage3, then a
5759+ * reply message was already sent in combination with
5760+ * the grant message and lkb_request is NULL.
5761+ */
5762+
5763+ if (lkb->lkb_request) {
5764+ lkb->lkb_request = NULL;
5765+ send_reply = 1;
5766+ reply.rl_status = lkb->lkb_retstatus;
5767+ reply.rl_lockstate = lkb->lkb_status;
5768+ reply.rl_lkid = lkb->lkb_id;
5769+
5770+ /*
5771+ * If the request could not be granted and the
5772+ * user won't wait, then free up the LKB
5773+ */
5774+
b7b72b66 5775+ if (lkb->lkb_retstatus == -EAGAIN) {
c1c6733f
AM
5776+ rsb = lkb->lkb_resource;
5777+ release_lkb(lspace, lkb);
5778+ release_rsb(rsb);
5779+ lkb = NULL;
5780+ }
b7b72b66
AM
5781+ else if (lkb->lkb_retstatus == -EINVAL) {
5782+ release_lkb(lspace, lkb);
5783+ lkb = NULL;
5784+ }
c1c6733f
AM
5785+ }
5786+ } else {
5787+ reply.rl_status = -ENOMEM;
5788+ send_reply = 1;
5789+ }
5790+ break;
5791+
5792+ case GDLM_REMCMD_CONVREQUEST:
5793+
5794+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5795+
c783755a 5796+
b7b72b66
AM
5797+ DLM_ASSERT(lkb,
5798+ print_request(freq);
5799+ printk("nodeid %u\n", nodeid););
5800+
5801+ rsb = lkb->lkb_resource;
c1c6733f 5802+
b7b72b66
AM
5803+ DLM_ASSERT(rsb,
5804+ print_lkb(lkb);
5805+ print_request(freq);
5806+ printk("nodeid %u\n", nodeid););
5807+
5808+ DLM_ASSERT(!rsb->res_nodeid,
5809+ print_lkb(lkb);
5810+ print_rsb(rsb);
5811+ print_request(freq);
5812+ printk("nodeid %u\n", nodeid););
5813+
5814+ DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5815+ print_lkb(lkb);
5816+ print_rsb(rsb);
5817+ print_request(freq);
5818+ printk("nodeid %u\n", nodeid););
5819+
5820+ DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED,
5821+ print_lkb(lkb);
5822+ print_rsb(rsb);
5823+ print_request(freq);
5824+ printk("nodeid %u\n", nodeid););
c1c6733f 5825+
c783755a
AM
5826+ /* Update orphan lock status */
5827+ if (freq->rr_flags & DLM_LKF_ORPHAN) {
5828+ lkb->lkb_flags |= GDLM_LKFLG_ORPHAN;
5829+ }
5830+
c1c6733f
AM
5831+ lkb->lkb_rqmode = freq->rr_rqmode;
5832+ lkb->lkb_lockqueue_flags = freq->rr_flags;
5833+ lkb->lkb_request = freq;
5834+ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5835+
b7b72b66
AM
5836+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK ||
5837+ freq->rr_flags & DLM_LKF_VALBLK) {
c1c6733f
AM
5838+ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5839+ allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5840+ freq->rr_lvb);
5841+ }
5842+
5843+ if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5844+ if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5845+ freq->rr_range_end)) {
5846+ reply.rl_status = -ENOMEM;
5847+ send_reply = 1;
5848+ goto out;
5849+ }
5850+ }
5851+
b7b72b66
AM
5852+ log_debug(lspace, "(%d) cv %u from %u %x \"%s\"",
5853+ lkb->lkb_ownpid, lkb->lkb_rqmode, nodeid,
5854+ lkb->lkb_id, rsb->res_name);
5855+
c1c6733f
AM
5856+ dlm_convert_stage2(lkb, FALSE);
5857+
5858+ /*
5859+ * If the conv request was granted in stage2, then a reply
5860+ * message was already sent in combination with the grant
5861+ * message.
5862+ */
5863+
5864+ if (lkb->lkb_request) {
5865+ lkb->lkb_request = NULL;
5866+ send_reply = 1;
5867+ reply.rl_status = lkb->lkb_retstatus;
5868+ reply.rl_lockstate = lkb->lkb_status;
5869+ reply.rl_lkid = lkb->lkb_id;
5870+ }
5871+ break;
5872+
5873+ case GDLM_REMCMD_LOCKREPLY:
5874+
b7b72b66 5875+ lkb = find_lock_by_id(lspace, req->rh_lkid);
c1c6733f 5876+
b7b72b66
AM
5877+ DLM_ASSERT(lkb,
5878+ print_reply(rp);
5879+ printk("nodeid %u\n", nodeid););
c1c6733f 5880+
b7b72b66
AM
5881+ DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5882+ print_lkb(lkb);
5883+ print_reply(rp);
5884+ printk("nodeid %u\n", nodeid););
5885+
5886+ process_lockqueue_reply(lkb, rp, nodeid);
c1c6733f
AM
5887+ break;
5888+
5889+ case GDLM_REMCMD_LOCKGRANT:
5890+
5891+ /*
5892+ * Remote lock has been granted asynchronously. Do a compact
5893+ * version of what grant_lock() does.
5894+ */
5895+
5896+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5897+
b7b72b66
AM
5898+ DLM_ASSERT(lkb,
5899+ print_request(freq);
5900+ printk("nodeid %u\n", nodeid););
c1c6733f
AM
5901+
5902+ rsb = lkb->lkb_resource;
5903+
b7b72b66
AM
5904+ DLM_ASSERT(rsb,
5905+ print_lkb(lkb);
5906+ print_request(freq);
5907+ printk("nodeid %u\n", nodeid););
5908+
5909+ DLM_ASSERT(rsb->res_nodeid,
5910+ print_lkb(lkb);
5911+ print_rsb(rsb);
5912+ print_request(freq);
5913+ printk("nodeid %u\n", nodeid););
5914+
5915+ DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5916+ print_lkb(lkb);
5917+ print_rsb(rsb);
5918+ print_request(freq);
5919+ printk("nodeid %u\n", nodeid););
5920+
5921+ if (lkb->lkb_lockqueue_state) {
5922+ log_debug(rsb->res_ls, "grant lock on lockqueue %d",
5923+ lkb->lkb_lockqueue_state);
5924+
5925+ /* Don't grant locks that are waiting for an unlock */
5926+ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK)
5927+ return 0;
5928+
5929+ print_lkb(lkb);
5930+ print_request(freq);
b7b72b66
AM
5931+ remove_from_lockqueue(lkb);
5932+ if (!lkb->lkb_remid)
5933+ lkb->lkb_remid = req->rh_lkid;
5934+ }
c1c6733f
AM
5935+
5936+ down_write(&rsb->res_lock);
5937+
5938+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
b7b72b66 5939+ allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, freq->rr_lvb);
c1c6733f
AM
5940+
5941+ lkb->lkb_grmode = lkb->lkb_rqmode;
5942+ lkb->lkb_rqmode = DLM_LOCK_IV;
5943+
5944+ if (lkb->lkb_range) {
5945+ lkb->lkb_range[GR_RANGE_START] =
5946+ lkb->lkb_range[RQ_RANGE_START];
5947+ lkb->lkb_range[GR_RANGE_END] =
5948+ lkb->lkb_range[RQ_RANGE_END];
5949+ }
5950+
5951+ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5952+ up_write(&rsb->res_lock);
5953+
5954+ if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5955+ lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5956+
5957+ lkb->lkb_retstatus = 0;
b7b72b66 5958+ queue_ast(lkb, AST_COMP, 0);
c1c6733f
AM
5959+ break;
5960+
5961+ case GDLM_REMCMD_SENDBAST:
5962+
5963+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5964+
b7b72b66
AM
5965+ DLM_ASSERT(lkb,
5966+ print_request(freq);
5967+ printk("nodeid %u\n", nodeid););
c1c6733f
AM
5968+
5969+ if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
b7b72b66 5970+ queue_ast(lkb, AST_BAST, freq->rr_rqmode);
c1c6733f
AM
5971+ break;
5972+
5973+ case GDLM_REMCMD_SENDCAST:
5974+
5975+ /* This is only used for some error completion ASTs */
5976+
5977+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5978+
b7b72b66
AM
5979+ DLM_ASSERT(lkb,
5980+ print_request(freq);
5981+ printk("nodeid %u\n", nodeid););
c1c6733f
AM
5982+
5983+ /* Return the lock to granted status */
5984+ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
c1c6733f 5985+ lkb->lkb_retstatus = freq->rr_status;
b7b72b66 5986+ queue_ast(lkb, AST_COMP, 0);
c1c6733f
AM
5987+ break;
5988+
5989+ case GDLM_REMCMD_UNLOCKREQUEST:
5990+
5991+ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5992+
b7b72b66
AM
5993+ DLM_ASSERT(lkb,
5994+ print_request(freq);
5995+ printk("nodeid %u\n", nodeid););
5996+
5997+ DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5998+ print_lkb(lkb);
5999+ print_request(freq);
6000+ printk("nodeid %u\n", nodeid););
6001+
6002+ DLM_ASSERT(lkb->lkb_nodeid == nodeid,
6003+ print_lkb(lkb);
6004+ print_request(freq);
6005+ printk("nodeid %u\n", nodeid););
6006+
6007+ rsb = find_rsb_to_unlock(lspace, lkb);
c1c6733f 6008+
b7b72b66
AM
6009+ log_debug(lspace, "(%d) un from %u %x \"%s\"", lkb->lkb_ownpid,
6010+ nodeid, lkb->lkb_id, rsb->res_name);
6011+
6012+ reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags);
c1c6733f
AM
6013+ send_reply = 1;
6014+ break;
6015+
6016+ case GDLM_REMCMD_QUERY:
6017+ remote_query(nodeid, lspace, req);
6018+ break;
6019+
6020+ case GDLM_REMCMD_QUERYREPLY:
6021+ remote_query_reply(nodeid, lspace, req);
6022+ break;
6023+
6024+ default:
6025+ log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
6026+ }
6027+
6028+ up_read(&lspace->ls_in_recovery);
6029+
6030+ out:
6031+ if (send_reply) {
6032+ reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
6033+ reply.rl_header.rh_flags = 0;
6034+ reply.rl_header.rh_length = sizeof(reply);
6035+ reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
6036+ reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
6037+
6038+ status = midcomms_send_message(nodeid, &reply.rl_header,
6039+ GFP_KERNEL);
6040+ }
6041+
6042+ wake_astd();
b7b72b66 6043+ put_lockspace(lspace);
c1c6733f
AM
6044+ return status;
6045+}
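
process_cluster_request() above works because every wire message begins with a common struct dlm_header: the handler reads rh_cmd first and only then reinterprets the buffer as a request or a reply. A self-contained sketch of that variant-message idiom (all types and handlers here are illustrative stand-ins, not the patch's own):

/* Illustrative only: dispatch on a shared leading header, then cast
 * to the command-specific layout, as the switch above does. */
struct hdr     { int cmd; };
struct req_msg { struct hdr h; int lkid; };
struct rep_msg { struct hdr h; int status; };

static void handle_req(struct req_msg *r) { /* ... */ }
static void handle_rep(struct rep_msg *r) { /* ... */ }

static void dispatch(struct hdr *h)
{
	/* h->cmd plays the role of rh_cmd: valid for every layout */
	if (h->cmd == 1)
		handle_req((struct req_msg *) h);
	else
		handle_rep((struct rep_msg *) h);
}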
6046+
b7b72b66 6047+static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply)
c1c6733f
AM
6048+{
6049+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
6050+ memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
6051+}
6052+
b7b72b66 6053+static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req)
c1c6733f
AM
6054+{
6055+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
6056+ memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
6057+}
6058diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
6059--- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 6060+++ linux-patched/cluster/dlm/lockqueue.h 2004-11-03 11:31:56.000000000 +0800
c1c6733f
AM
6061@@ -0,0 +1,29 @@
6062+/******************************************************************************
6063+*******************************************************************************
6064+**
6065+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6066+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6067+**
6068+** This copyrighted material is made available to anyone wishing to use,
6069+** modify, copy, or redistribute it subject to the terms and conditions
6070+** of the GNU General Public License v.2.
6071+**
6072+*******************************************************************************
6073+******************************************************************************/
6074+
6075+#ifndef __LOCKQUEUE_DOT_H__
6076+#define __LOCKQUEUE_DOT_H__
6077+
b7b72b66
AM
6078+void remote_grant(struct dlm_lkb * lkb);
6079+void reply_and_grant(struct dlm_lkb * lkb);
6080+int remote_stage(struct dlm_lkb * lkb, int state);
6081+int process_cluster_request(int csid, struct dlm_header *req, int recovery);
6082+int send_cluster_request(struct dlm_lkb * lkb, int state);
6083+void purge_requestqueue(struct dlm_ls * ls);
6084+int process_requestqueue(struct dlm_ls * ls);
6085+int reply_in_requestqueue(struct dlm_ls * ls, int lkid);
6086+void remote_remove_direntry(struct dlm_ls * ls, int nodeid, char *name,
6087+ int namelen);
6088+void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src);
c1c6733f
AM
6089+
6090+#endif /* __LOCKQUEUE_DOT_H__ */
6091diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
6092--- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11
AM
6093+++ linux-patched/cluster/dlm/lockspace.c 2004-11-03 11:31:56.000000000 +0800
6094@@ -0,0 +1,715 @@
c1c6733f
AM
6095+/******************************************************************************
6096+*******************************************************************************
6097+**
6098+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6099+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6100+**
6101+** This copyrighted material is made available to anyone wishing to use,
6102+** modify, copy, or redistribute it subject to the terms and conditions
6103+** of the GNU General Public License v.2.
6104+**
6105+*******************************************************************************
6106+******************************************************************************/
6107+
6108+#include <linux/module.h>
6109+
6110+#include "dlm_internal.h"
6111+#include "recoverd.h"
6112+#include "ast.h"
6113+#include "lkb.h"
6114+#include "nodes.h"
6115+#include "dir.h"
6116+#include "lowcomms.h"
6117+#include "config.h"
6118+#include "memory.h"
6119+#include "lockspace.h"
6120+#include "device.h"
6121+
6122+#define GDST_NONE (0)
6123+#define GDST_RUNNING (1)
6124+
b7b72b66
AM
6125+static int dlmstate;
6126+static int dlmcount;
6127+static struct semaphore dlmstate_lock;
c1c6733f
AM
6128+struct list_head lslist;
6129+spinlock_t lslist_lock;
6130+struct kcl_service_ops ls_ops;
6131+
6132+static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
6133+
6134+
6135+void dlm_lockspace_init(void)
6136+{
b7b72b66
AM
6137+ dlmstate = GDST_NONE;
6138+ dlmcount = 0;
6139+ init_MUTEX(&dlmstate_lock);
c1c6733f
AM
6140+ INIT_LIST_HEAD(&lslist);
6141+ spin_lock_init(&lslist_lock);
6142+}
6143+
b7b72b66 6144+struct dlm_ls *find_lockspace_by_name(char *name, int namelen)
c1c6733f 6145+{
b7b72b66 6146+ struct dlm_ls *ls;
c1c6733f
AM
6147+
6148+ spin_lock(&lslist_lock);
6149+
6150+ list_for_each_entry(ls, &lslist, ls_list) {
b7b72b66
AM
6151+ if (ls->ls_namelen == namelen &&
6152+ memcmp(ls->ls_name, name, namelen) == 0)
c1c6733f
AM
6153+ goto out;
6154+ }
6155+ ls = NULL;
6156+ out:
6157+ spin_unlock(&lslist_lock);
6158+ return ls;
6159+}
6160+
b7b72b66 6161+struct dlm_ls *find_lockspace_by_global_id(uint32_t id)
c1c6733f 6162+{
b7b72b66 6163+ struct dlm_ls *ls;
c1c6733f
AM
6164+
6165+ spin_lock(&lslist_lock);
6166+
6167+ list_for_each_entry(ls, &lslist, ls_list) {
b7b72b66
AM
6168+ if (ls->ls_global_id == id) {
6169+ ls->ls_count++;
c1c6733f 6170+ goto out;
b7b72b66 6171+ }
c1c6733f
AM
6172+ }
6173+ ls = NULL;
6174+ out:
6175+ spin_unlock(&lslist_lock);
6176+ return ls;
6177+}
6178+
b7b72b66 6179+struct dlm_ls *find_lockspace_by_local_id(void *id)
c1c6733f 6180+{
b7b72b66 6181+ struct dlm_ls *ls;
c1c6733f
AM
6182+
6183+ spin_lock(&lslist_lock);
6184+
6185+ list_for_each_entry(ls, &lslist, ls_list) {
b7b72b66
AM
6186+ if (ls->ls_local_id == (uint32_t)(long)id) {
6187+ ls->ls_count++;
c1c6733f 6188+ goto out;
b7b72b66 6189+ }
c1c6733f
AM
6190+ }
6191+ ls = NULL;
6192+ out:
6193+ spin_unlock(&lslist_lock);
6194+ return ls;
6195+}
6196+
b7b72b66
AM
6197+/* must be called with lslist_lock held */
6198+void hold_lockspace(struct dlm_ls *ls)
6199+{
6200+ ls->ls_count++;
6201+}
6202+
6203+void put_lockspace(struct dlm_ls *ls)
6204+{
6205+ spin_lock(&lslist_lock);
6206+ ls->ls_count--;
6207+ spin_unlock(&lslist_lock);
6208+}
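
Note the pairing these helpers impose: find_lockspace_by_global_id() and find_lockspace_by_local_id() return with ls_count already raised, so each successful lookup must be matched by a put_lockspace(), as process_cluster_request() does above. A minimal sketch, with do_work() a stand-in for real use:

/* Illustrative only: the lookup/put discipline for lockspace refs */
static int example_lookup(uint32_t id)
{
	struct dlm_ls *ls = find_lockspace_by_global_id(id);

	if (!ls)
		return -EINVAL;
	do_work(ls);		/* ls can't be freed while ls_count is held */
	put_lockspace(ls);
	return 0;
}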
6209+
6210+static void remove_lockspace(struct dlm_ls *ls)
6211+{
6212+ for (;;) {
6213+ spin_lock(&lslist_lock);
6214+ if (ls->ls_count == 0) {
6215+ list_del(&ls->ls_list);
6216+ spin_unlock(&lslist_lock);
6217+ return;
6218+ }
6219+ spin_unlock(&lslist_lock);
6220+ set_current_state(TASK_INTERRUPTIBLE);
6221+ schedule_timeout(HZ);
6222+ }
6223+}
6224+
c1c6733f
AM
6225+/*
6226+ * Called from dlm_init. These are the general threads which are not
b7b72b66 6227+ * lockspace-specific and work for all dlm lockspaces.
c1c6733f
AM
6228+ */
6229+
6230+static int threads_start(void)
6231+{
6232+ int error;
6233+
c1c6733f
AM
6234+ /* Thread which processes lock requests for all ls's */
6235+ error = astd_start();
6236+ if (error) {
6237+ log_print("cannot start ast thread %d", error);
b7b72b66 6238+ goto fail;
c1c6733f
AM
6239+ }
6240+
6241+ /* Thread for sending/receiving messages for all ls's */
6242+ error = lowcomms_start();
6243+ if (error) {
6244+ log_print("cannot start lowcomms %d", error);
6245+ goto astd_fail;
6246+ }
6247+
6248+ return 0;
6249+
6250+ astd_fail:
6251+ astd_stop();
6252+
c1c6733f
AM
6253+ fail:
6254+ return error;
6255+}
6256+
6257+static void threads_stop(void)
6258+{
6259+ lowcomms_stop();
6260+ astd_stop();
c1c6733f
AM
6261+}
6262+
6263+static int init_internal(void)
6264+{
6265+ int error = 0;
6266+
b7b72b66
AM
6267+ if (dlmstate == GDST_RUNNING)
6268+ dlmcount++;
c1c6733f
AM
6269+ else {
6270+ error = threads_start();
6271+ if (error)
6272+ goto out;
6273+
b7b72b66
AM
6274+ dlmstate = GDST_RUNNING;
6275+ dlmcount = 1;
c1c6733f
AM
6276+ }
6277+
6278+ out:
6279+ return error;
6280+}
6281+
c1c6733f 6282+/*
b7b72b66 6283+ * Called after dlm module is loaded and before any lockspaces are created.
c1c6733f
AM
6284+ * Starts and initializes global threads and structures. These global entities
6285+ * are shared by and independent of all lockspaces.
6286+ *
b7b72b66 6287+ * There should be a dlm-specific user command a person can run that
c1c6733f
AM
6288+ * calls this function. If a user hasn't run that command and something
6289+ * creates a new lockspace, this is called first.
6290+ *
6291+ * This also starts the default lockspace.
6292+ */
6293+
6294+int dlm_init(void)
6295+{
6296+ int error;
6297+
b7b72b66 6298+ down(&dlmstate_lock);
c1c6733f 6299+ error = init_internal();
b7b72b66 6300+ up(&dlmstate_lock);
c1c6733f
AM
6301+
6302+ return error;
6303+}
6304+
6305+int dlm_release(void)
6306+{
6307+ int error = 0;
6308+
b7b72b66 6309+ down(&dlmstate_lock);
c1c6733f 6310+
b7b72b66 6311+ if (dlmstate == GDST_NONE)
c1c6733f
AM
6312+ goto out;
6313+
b7b72b66
AM
6314+ if (dlmcount)
6315+ dlmcount--;
c1c6733f 6316+
b7b72b66 6317+ if (dlmcount)
c1c6733f
AM
6318+ goto out;
6319+
6320+ spin_lock(&lslist_lock);
6321+ if (!list_empty(&lslist)) {
6322+ spin_unlock(&lslist_lock);
6323+ log_print("cannot stop threads, lockspaces still exist");
6324+ goto out;
6325+ }
6326+ spin_unlock(&lslist_lock);
6327+
6328+ threads_stop();
b7b72b66 6329+ dlmstate = GDST_NONE;
c1c6733f
AM
6330+
6331+ out:
b7b72b66 6332+ up(&dlmstate_lock);
c1c6733f
AM
6333+
6334+ return error;
6335+}
6336+
b7b72b66 6337+struct dlm_ls *allocate_ls(int namelen)
c1c6733f 6338+{
b7b72b66 6339+ struct dlm_ls *ls;
c1c6733f 6340+
b7b72b66 6341+ ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
c1c6733f 6342+ if (ls)
b7b72b66 6343+ memset(ls, 0, sizeof(struct dlm_ls) + namelen);
c1c6733f
AM
6344+
6345+ return ls;
6346+}
6347+
c1c6733f
AM
6348+static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
6349+{
b7b72b66
AM
6350+ struct dlm_ls *ls;
6351+ int i, size, error = -ENOMEM;
c1c6733f
AM
6352+ uint32_t local_id = 0;
6353+
6354+ if (!try_module_get(THIS_MODULE))
6355+ return -EINVAL;
6356+
6357+ if (namelen > MAX_SERVICE_NAME_LEN)
6358+ return -EINVAL;
6359+
b7b72b66
AM
6360+ ls = find_lockspace_by_name(name, namelen);
6361+ if (ls) {
6362+ *lockspace = (void *)(long) ls->ls_local_id;
c1c6733f
AM
6363+ return -EEXIST;
6364+ }
6365+
6366+ /*
6367+ * Initialize ls fields
6368+ */
6369+
6370+ ls = allocate_ls(namelen);
6371+ if (!ls)
6372+ goto out;
6373+
6374+ memcpy(ls->ls_name, name, namelen);
6375+ ls->ls_namelen = namelen;
6376+
6377+ ls->ls_allocation = GFP_KERNEL;
b7b72b66
AM
6378+ ls->ls_count = 0;
6379+ ls->ls_flags = 0;
6380+
6381+ size = dlm_config.rsbtbl_size;
6382+ ls->ls_rsbtbl_size = size;
c1c6733f 6383+
b7b72b66
AM
6384+ ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
6385+ if (!ls->ls_rsbtbl)
c1c6733f 6386+ goto out_lsfree;
b7b72b66
AM
6387+ for (i = 0; i < size; i++) {
6388+ INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
6389+ rwlock_init(&ls->ls_rsbtbl[i].lock);
6390+ }
6391+
6392+ size = dlm_config.lkbtbl_size;
6393+ ls->ls_lkbtbl_size = size;
c1c6733f 6394+
b7b72b66
AM
6395+ ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
6396+ if (!ls->ls_lkbtbl)
6397+ goto out_rsbfree;
6398+ for (i = 0; i < size; i++) {
6399+ INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
6400+ rwlock_init(&ls->ls_lkbtbl[i].lock);
6401+ ls->ls_lkbtbl[i].counter = 1;
6402+ }
c1c6733f 6403+
b7b72b66
AM
6404+ size = dlm_config.dirtbl_size;
6405+ ls->ls_dirtbl_size = size;
c1c6733f 6406+
b7b72b66
AM
6407+ ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
6408+ if (!ls->ls_dirtbl)
6409+ goto out_lkbfree;
6410+ for (i = 0; i < size; i++) {
6411+ INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
6412+ rwlock_init(&ls->ls_dirtbl[i].lock);
6413+ }
c1c6733f
AM
6414+
6415+ INIT_LIST_HEAD(&ls->ls_nodes);
c1c6733f 6416+ INIT_LIST_HEAD(&ls->ls_nodes_gone);
b7b72b66 6417+ ls->ls_num_nodes = 0;
bb1d8b11 6418+ ls->ls_node_array = NULL;
c783755a
AM
6419+ ls->ls_recoverd_task = NULL;
6420+ init_MUTEX(&ls->ls_recoverd_lock);
c1c6733f
AM
6421+ INIT_LIST_HEAD(&ls->ls_recover);
6422+ spin_lock_init(&ls->ls_recover_lock);
6423+ INIT_LIST_HEAD(&ls->ls_recover_list);
6424+ ls->ls_recover_list_count = 0;
6425+ spin_lock_init(&ls->ls_recover_list_lock);
6426+ init_waitqueue_head(&ls->ls_wait_general);
b7b72b66 6427+ INIT_LIST_HEAD(&ls->ls_rootres);
c1c6733f
AM
6428+ INIT_LIST_HEAD(&ls->ls_requestqueue);
6429+ INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
6430+ ls->ls_last_stop = 0;
6431+ ls->ls_last_start = 0;
6432+ ls->ls_last_finish = 0;
6433+ ls->ls_rcom_msgid = 0;
b7b72b66 6434+ init_MUTEX(&ls->ls_requestqueue_lock);
c1c6733f 6435+ init_MUTEX(&ls->ls_rcom_lock);
c1c6733f 6436+ init_rwsem(&ls->ls_unlock_sem);
b7b72b66
AM
6437+ init_rwsem(&ls->ls_root_lock);
6438+ init_rwsem(&ls->ls_in_recovery);
c1c6733f 6439+
b7b72b66 6440+ down_write(&ls->ls_in_recovery);
c1c6733f
AM
6441+
6442+ if (flags & DLM_LSF_NOTIMERS)
6443+ set_bit(LSFL_NOTIMERS, &ls->ls_flags);
6444+
b7b72b66 6445+
c1c6733f
AM
6446+ /*
6447+ * Connect this lockspace with the cluster manager
6448+ */
6449+
6450+ error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
6451+ &ls_ops, TRUE, (void *) ls, &local_id);
6452+ if (error)
b7b72b66 6453+ goto out_recoverd;
c1c6733f
AM
6454+
6455+ ls->ls_state = LSST_INIT;
6456+ ls->ls_local_id = local_id;
6457+
6458+ spin_lock(&lslist_lock);
6459+ list_add(&ls->ls_list, &lslist);
6460+ spin_unlock(&lslist_lock);
6461+
6462+ error = kcl_join_service(local_id);
6463+ if (error) {
6464+ log_error(ls, "service manager join error %d", error);
6465+ goto out_reg;
6466+ }
6467+
6468+ /* The ls isn't actually running until it receives a start() from CMAN.
b7b72b66 6469+ Neither does it have a global ls id until started. */
c1c6733f
AM
6470+
6471+ /* Return the local ID as the lockspace handle. I've left this
6472+ cast to a void* as it allows us to replace it with pretty much
6473+ anything at a future date without breaking clients. But returning
6474+ the address of the lockspace is a bad idea as it could get
6475+ forcibly removed, leaving the client with a dangling pointer. */
c1c6733f 6476+
b7b72b66 6477+ *lockspace = (void *)(long) local_id;
c1c6733f
AM
6478+ return 0;
6479+
b7b72b66 6480+ out_reg:
c1c6733f 6481+ kcl_unregister_service(ls->ls_local_id);
b7b72b66 6482+ out_recoverd:
c783755a 6483+ dlm_recoverd_stop(ls);
b7b72b66
AM
6484+ kfree(ls->ls_dirtbl);
6485+ out_lkbfree:
6486+ kfree(ls->ls_lkbtbl);
6487+ out_rsbfree:
6488+ kfree(ls->ls_rsbtbl);
6489+ out_lsfree:
6490+ kfree(ls);
6491+ out:
c1c6733f
AM
6492+ return error;
6493+}
6494+
6495+/*
6496+ * Called by a system like GFS which wants independent lockspaces.
6497+ */
6498+
6499+int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
6500+{
6501+ int error = -ENOSYS;
6502+
b7b72b66 6503+ down(&dlmstate_lock);
c1c6733f
AM
6504+ error = init_internal();
6505+ if (error)
6506+ goto out;
6507+
6508+ error = new_lockspace(name, namelen, lockspace, flags);
b7b72b66
AM
6509+ out:
6510+ up(&dlmstate_lock);
c1c6733f
AM
6511+ return error;
6512+}
6513+
6514+/* Return 1 if the lockspace still has active remote locks,
6515+ * 2 if the lockspace still has active local locks.
6516+ */
b7b72b66
AM
6517+static int lockspace_busy(struct dlm_ls *ls)
6518+{
6519+ int i, lkb_found = 0;
6520+ struct dlm_lkb *lkb;
6521+
6522+ /* NOTE: We check the lockidtbl here rather than the resource table.
6523+ This is because there may be LKBs queued as ASTs that have been
6524+ unlinked from their RSBs and are pending deletion once the AST has
6525+ been delivered */
6526+
6527+ for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6528+ read_lock(&ls->ls_lkbtbl[i].lock);
6529+ if (!list_empty(&ls->ls_lkbtbl[i].list)) {
6530+ lkb_found = 1;
6531+ list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
6532+ lkb_idtbl_list) {
6533+ if (!lkb->lkb_nodeid) {
6534+ read_unlock(&ls->ls_lkbtbl[i].lock);
6535+ return 2;
6536+ }
6537+ }
c1c6733f 6538+ }
b7b72b66 6539+ read_unlock(&ls->ls_lkbtbl[i].lock);
c1c6733f 6540+ }
b7b72b66 6541+ return lkb_found;
c1c6733f
AM
6542+}
6543+
b7b72b66 6544+static int release_lockspace(struct dlm_ls *ls, int force)
c1c6733f 6545+{
b7b72b66
AM
6546+ struct dlm_lkb *lkb;
6547+ struct dlm_rsb *rsb;
6548+ struct dlm_recover *rv;
c1c6733f
AM
6549+ struct list_head *head;
6550+ int i;
6551+ int busy = lockspace_busy(ls);
6552+
6553+ /* Don't destroy a busy lockspace */
6554+ if (busy > force)
6555+ return -EBUSY;
6556+
6557+ if (force < 3) {
6558+ kcl_leave_service(ls->ls_local_id);
6559+ kcl_unregister_service(ls->ls_local_id);
6560+ }
6561+
c783755a 6562+ dlm_recoverd_stop(ls);
b7b72b66
AM
6563+
6564+ remove_lockspace(ls);
c1c6733f
AM
6565+
6566+ /*
b7b72b66 6567+ * Free direntry structs.
c1c6733f
AM
6568+ */
6569+
b7b72b66
AM
6570+ dlm_dir_clear(ls);
6571+ kfree(ls->ls_dirtbl);
c1c6733f
AM
6572+
6573+ /*
b7b72b66 6574+ * Free all lkb's on lkbtbl[] lists.
c1c6733f
AM
6575+ */
6576+
b7b72b66
AM
6577+ for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6578+ head = &ls->ls_lkbtbl[i].list;
c1c6733f 6579+ while (!list_empty(head)) {
b7b72b66
AM
6580+ lkb = list_entry(head->next, struct dlm_lkb,
6581+ lkb_idtbl_list);
c1c6733f
AM
6582+ list_del(&lkb->lkb_idtbl_list);
6583+
6584+ if (lkb->lkb_lockqueue_state)
6585+ remove_from_lockqueue(lkb);
6586+
b7b72b66 6587+ if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
c1c6733f
AM
6588+ list_del(&lkb->lkb_astqueue);
6589+
b7b72b66 6590+ if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
c1c6733f
AM
6591+ free_lvb(lkb->lkb_lvbptr);
6592+
6593+ free_lkb(lkb);
6594+ }
6595+ }
6596+
b7b72b66 6597+ kfree(ls->ls_lkbtbl);
c1c6733f
AM
6598+
6599+ /*
b7b72b66 6600+ * Free all rsb's on rsbtbl[] lists
c1c6733f
AM
6601+ */
6602+
b7b72b66
AM
6603+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
6604+ head = &ls->ls_rsbtbl[i].list;
c1c6733f 6605+ while (!list_empty(head)) {
b7b72b66
AM
6606+ rsb = list_entry(head->next, struct dlm_rsb,
6607+ res_hashchain);
c1c6733f
AM
6608+ list_del(&rsb->res_hashchain);
6609+
6610+ if (rsb->res_lvbptr)
6611+ free_lvb(rsb->res_lvbptr);
6612+
6613+ free_rsb(rsb);
6614+ }
6615+ }
6616+
b7b72b66 6617+ kfree(ls->ls_rsbtbl);
c1c6733f
AM
6618+
6619+ /*
6620+ * Free structures on any other lists
6621+ */
6622+
6623+ head = &ls->ls_recover;
6624+ while (!list_empty(head)) {
b7b72b66
AM
6625+ rv = list_entry(head->next, struct dlm_recover, list);
6626+ list_del(&rv->list);
6627+ kfree(rv);
c1c6733f
AM
6628+ }
6629+
c783755a 6630+ clear_free_de(ls);
c1c6733f 6631+
c783755a
AM
6632+ ls_nodes_clear(ls);
6633+ ls_nodes_gone_clear(ls);
bb1d8b11
AM
6634+ if (ls->ls_node_array)
6635+ kfree(ls->ls_node_array);
c1c6733f 6636+
b7b72b66 6637+ kfree(ls);
c1c6733f 6638+ dlm_release();
c1c6733f
AM
6639+ module_put(THIS_MODULE);
6640+ return 0;
6641+}
6642+
6643+
6644+/*
6645+ * Called when a system has released all its locks and is not going to use the
6646+ * lockspace any longer. We blindly free everything we're managing for this
6647+ * lockspace. Remaining nodes will go through the recovery process as if we'd
6648+ * died. The lockspace must continue to function as usual, participating in
6649+ * recoveries, until kcl_leave_service returns.
6650+ *
6651+ * Force has 4 possible values:
6652+ * 0 - don't destroy lockspace if it has any LKBs
6653+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6654+ * 2 - destroy lockspace regardless of LKBs
6655+ * 3 - destroy lockspace as part of a forced shutdown
6656+ */
6657+
6658+int dlm_release_lockspace(void *lockspace, int force)
6659+{
b7b72b66 6660+ struct dlm_ls *ls;
c1c6733f
AM
6661+
6662+ ls = find_lockspace_by_local_id(lockspace);
6663+ if (!ls)
b7b72b66
AM
6664+ return -EINVAL;
6665+ put_lockspace(ls);
c1c6733f
AM
6666+ return release_lockspace(ls, force);
6667+}
6668+
6669+
6670+/* Called when the cluster is being shut down dirtily */
6671+void dlm_emergency_shutdown()
6672+{
b7b72b66
AM
6673+ struct dlm_ls *ls;
6674+ struct dlm_ls *tmp;
c1c6733f
AM
6675+
6676+ /* Shut lowcomms down to prevent any socket activity */
6677+ lowcomms_stop_accept();
6678+
6679+ /* Delete the devices that belong to the userland
6680+ lockspaces to be deleted. */
6681+ dlm_device_free_devices();
6682+
6683+ /* Now try to clean the lockspaces */
6684+ spin_lock(&lslist_lock);
6685+
6686+ list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6687+ spin_unlock(&lslist_lock);
6688+ release_lockspace(ls, 3);
6689+ spin_lock(&lslist_lock);
6690+ }
6691+
6692+ spin_unlock(&lslist_lock);
6693+}
6694+
b7b72b66 6695+struct dlm_recover *allocate_dlm_recover(void)
c1c6733f 6696+{
b7b72b66 6697+ struct dlm_recover *rv;
c1c6733f 6698+
b7b72b66
AM
6699+ rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
6700+ if (rv)
6701+ memset(rv, 0, sizeof(struct dlm_recover));
6702+ return rv;
c1c6733f
AM
6703+}
6704+
6705+/*
6706+ * Called by CMAN on a specific ls. "stop" means set flag which while set
6707+ * causes all new requests to the ls to be queued and not submitted until the
6708+ * flag is cleared. A stop on a ls also needs to cancel any prior starts on it.
6709+ * The recoverd thread carries out any work called for by this event.
6710+ */
6711+
6712+static int dlm_ls_stop(void *servicedata)
6713+{
b7b72b66 6714+ struct dlm_ls *ls = (struct dlm_ls *) servicedata;
c1c6733f
AM
6715+ int new;
6716+
6717+ spin_lock(&ls->ls_recover_lock);
6718+ ls->ls_last_stop = ls->ls_last_start;
6719+ set_bit(LSFL_LS_STOP, &ls->ls_flags);
6720+ new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6721+ spin_unlock(&ls->ls_recover_lock);
6722+
6723+ /*
6724+ * This in_recovery lock does two things:
6725+ *
6726+ * 1) Keeps this function from returning until all threads are out
6727+ * of locking routines and locking is truly stopped.
6728+ * 2) Keeps any new requests from being processed until it's unlocked
6729+ * when recovery is complete.
6730+ */
6731+
6732+ if (new)
6733+ down_write(&ls->ls_in_recovery);
6734+
6735+ clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6736+ clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6737+ clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6738+ clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6739+
b7b72b66 6740+ dlm_recoverd_kick(ls);
c1c6733f
AM
6741+
6742+ return 0;
6743+}
6744+
6745+/*
6746+ * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6747+ * request processing which first requires that the recovery procedure be
6748+ * stepped through with all nodes sharing the lockspace (nodeids). The first
6749+ * start on the ls after it's created is a special case and requires some extra
6750+ * work like figuring out our own local nodeid. We can't do all this in the
6751+ * calling CMAN context, so we must pass this work off to the recoverd thread
b7b72b66 6752+ * which was created in dlm_init(). The recoverd thread carries out any work
c1c6733f
AM
6753+ * called for by this event.
6754+ */
6755+
6756+static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6757+ int event_id, int type)
6758+{
b7b72b66
AM
6759+ struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6760+ struct dlm_recover *rv;
c1c6733f
AM
6761+ int error = -ENOMEM;
6762+
b7b72b66
AM
6763+ rv = allocate_dlm_recover();
6764+ if (!rv)
c1c6733f
AM
6765+ goto out;
6766+
b7b72b66
AM
6767+ rv->nodeids = nodeids;
6768+ rv->node_count = count;
6769+ rv->event_id = event_id;
c1c6733f
AM
6770+
6771+ spin_lock(&ls->ls_recover_lock);
b7b72b66
AM
6772+ if (ls->ls_last_start == event_id)
6773+ log_all(ls, "repeated start %d stop %d finish %d",
6774+ event_id, ls->ls_last_stop, ls->ls_last_finish);
c1c6733f 6775+ ls->ls_last_start = event_id;
b7b72b66 6776+ list_add_tail(&rv->list, &ls->ls_recover);
c1c6733f
AM
6777+ set_bit(LSFL_LS_START, &ls->ls_flags);
6778+ spin_unlock(&ls->ls_recover_lock);
6779+
b7b72b66 6780+ dlm_recoverd_kick(ls);
c1c6733f
AM
6781+ error = 0;
6782+
6783+ out:
6784+ return error;
6785+}
6786+
6787+/*
6788+ * Called by CMAN on a specific ls. "finish" means that all nodes which
6789+ * received a "start" have completed the start and called kcl_start_done.
6790+ * The recoverd thread carries out any work called for by this event.
6791+ */
6792+
6793+static void dlm_ls_finish(void *servicedata, int event_id)
6794+{
b7b72b66 6795+ struct dlm_ls *ls = (struct dlm_ls *) servicedata;
c1c6733f
AM
6796+
6797+ spin_lock(&ls->ls_recover_lock);
6798+ ls->ls_last_finish = event_id;
6799+ set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6800+ spin_unlock(&ls->ls_recover_lock);
6801+
b7b72b66 6802+ dlm_recoverd_kick(ls);
c1c6733f
AM
6803+}
6804+
6805+struct kcl_service_ops ls_ops = {
6806+ .stop = dlm_ls_stop,
6807+ .start = dlm_ls_start,
6808+ .finish = dlm_ls_finish
6809+};
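
These three callbacks form the lockspace's recovery state machine as driven by CMAN: stop freezes request processing, start delivers the new node list to recoverd, and finish reports that every node completed its start. Roughly, per the comments above, one membership change looks like the following trace (a sketch of the ordering only, not real CMAN code):

/* Illustrative only: callback order for one membership transition */
dlm_ls_stop(ls);		/* LSFL_LS_RUN cleared; requests queue up */
dlm_ls_start(ls, nodeids, n, event_id, type);	/* recoverd rebuilds state */
/* ... every node finishes and calls kcl_start_done() ... */
dlm_ls_finish(ls, event_id);	/* locking resumes */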
6810diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6811--- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 6812+++ linux-patched/cluster/dlm/lockspace.h 2004-11-03 11:31:56.000000000 +0800
c1c6733f
AM
6813@@ -0,0 +1,29 @@
6814+/******************************************************************************
6815+*******************************************************************************
6816+**
6817+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6818+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6819+**
6820+** This copyrighted material is made available to anyone wishing to use,
6821+** modify, copy, or redistribute it subject to the terms and conditions
6822+** of the GNU General Public License v.2.
6823+**
6824+*******************************************************************************
6825+******************************************************************************/
6826+
6827+#ifndef __LOCKSPACE_DOT_H__
6828+#define __LOCKSPACE_DOT_H__
6829+
6830+void dlm_lockspace_init(void);
6831+int dlm_init(void);
6832+int dlm_release(void);
6833+int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6834+int dlm_release_lockspace(void *ls, int force);
c1c6733f 6835+void dlm_emergency_shutdown(void);
b7b72b66
AM
6836+struct dlm_ls *find_lockspace_by_global_id(uint32_t id);
6837+struct dlm_ls *find_lockspace_by_local_id(void *id);
6838+struct dlm_ls *find_lockspace_by_name(char *name, int namelen);
6839+void hold_lockspace(struct dlm_ls *ls);
6840+void put_lockspace(struct dlm_ls *ls);
c1c6733f
AM
6841+
6842+#endif /* __LOCKSPACE_DOT_H__ */
6843diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6844--- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11
AM
6845+++ linux-patched/cluster/dlm/lowcomms.c 2004-11-03 11:31:56.000000000 +0800
6846@@ -0,0 +1,1415 @@
c1c6733f
AM
6847+/******************************************************************************
6848+*******************************************************************************
6849+**
6850+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6851+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6852+**
6853+** This copyrighted material is made available to anyone wishing to use,
6854+** modify, copy, or redistribute it subject to the terms and conditions
6855+** of the GNU General Public License v.2.
6856+**
6857+*******************************************************************************
6858+******************************************************************************/
6859+
6860+/*
6861+ * lowcomms.c
6862+ *
6863+ * This is the "low-level" comms layer.
6864+ *
6865+ * It is responsible for sending/receiving messages
6866+ * from other nodes in the cluster.
6867+ *
6868+ * Cluster nodes are referred to by their nodeids. nodeids are
6869+ * simply 32-bit numbers to the locking module - if they need to
6870+ * be expanded for the cluster infrastructure then that is its
6871+ * responsibility. It is this layer's
6872+ * responsibility to resolve these into IP addresses or
6873+ * whatever it needs for inter-node communication.
6874+ *
6875+ * The comms level is two kernel threads that deal mainly with
6876+ * the receiving of messages from other nodes and passing them
6877+ * up to the mid-level comms layer (which understands the
6878+ * message format) for execution by the locking core, and
6879+ * a send thread which does all the setting up of connections
6880+ * to remote nodes and the sending of data. Threads are not allowed
6881+ * to send their own data because it may cause them to wait in times
6882+ * of high load. Also, this way, the sending thread can collect together
6883+ * messages bound for one node and send them in one block.
6884+ *
6885+ * I don't see any problem with the recv thread executing the locking
6886+ * code on behalf of remote processes as the locking code is
6887+ * short, efficient and never waits.
6888+ *
6889+ */
6890+
6891+
6892+#include <asm/ioctls.h>
6893+#include <net/sock.h>
6894+#include <net/tcp.h>
6895+#include <linux/pagemap.h>
6896+#include <cluster/cnxman.h>
6897+
6898+#include "dlm_internal.h"
6899+#include "lowcomms.h"
6900+#include "midcomms.h"
6901+#include "config.h"
6902+
6903+struct cbuf {
6904+ unsigned base;
6905+ unsigned len;
6906+ unsigned mask;
6907+};
6908+
6909+#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6910+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6911+#define CBUF_EMPTY(cb) ((cb)->len == 0)
6912+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6913+#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6914+ (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6915+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
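
The CBUF macros above implement a power-of-two ring buffer kept in one page: base is the read offset, len the bytes pending, and CBUF_DATA() the offset where the next receive should land. A short sketch of the consume loop they support, assuming CBUF_INIT(cb, PAGE_SIZE) was done at setup (do_recv() and consume() are stand-ins; the real receive path must additionally handle pending data that wraps past the end of the page):

/* Illustrative only: driving a struct cbuf over a buffer of size 2^k */
static void example_recv(char *buf, struct cbuf *cb)
{
	int n, used;

	n = do_recv(buf + CBUF_DATA(cb), PAGE_SIZE - cb->len);	/* fill free space */
	CBUF_ADD(cb, n);

	used = consume(buf + cb->base, cb->len);  /* eat whole messages */
	CBUF_EAT(cb, used);			  /* base advances, wraps via mask */
}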
6916+
6917+struct connection {
6918+ struct socket *sock; /* NULL if not connected */
6919+ uint32_t nodeid; /* So we know who we are in the list */
6920+ struct rw_semaphore sock_sem; /* Stop connect races */
6921+ struct list_head read_list; /* On this list when ready for reading */
6922+ struct list_head write_list; /* On this list when ready for writing */
6923+ struct list_head state_list; /* On this list when ready to connect */
6924+ unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6925+#define CF_READ_PENDING 1
6926+#define CF_WRITE_PENDING 2
6927+#define CF_CONNECT_PENDING 3
c783755a 6928+#define CF_IS_OTHERCON 4
c1c6733f
AM
6929+ struct list_head writequeue; /* List of outgoing writequeue_entries */
6930+ struct list_head listenlist; /* List of allocated listening sockets */
6931+ spinlock_t writequeue_lock;
6932+ int (*rx_action) (struct connection *); /* What to do when active */
6933+ struct page *rx_page;
6934+ struct cbuf cb;
6935+ int retries;
b7b72b66 6936+ atomic_t waiting_requests;
c1c6733f 6937+#define MAX_CONNECT_RETRIES 3
c783755a 6938+ struct connection *othercon;
c1c6733f
AM
6939+};
6940+#define sock2con(x) ((struct connection *)(x)->sk_user_data)
c1c6733f
AM
6941+
6942+/* An entry waiting to be sent */
6943+struct writequeue_entry {
6944+ struct list_head list;
6945+ struct page *page;
6946+ int offset;
6947+ int len;
6948+ int end;
6949+ int users;
6950+ struct connection *con;
6951+};
6952+
6953+/* "Template" structure for IPv4 and IPv6 used to fill
6954+ * in the missing bits when converting between cman (which knows
6955+ * nothing about sockaddr structs) and real life where we actually
6956+ * have to connect to these addresses. Also one of these structs
6957+ * will hold the cached "us" address.
6958+ *
6959+ * It's an in6 sockaddr just so there's enough space for anything
6960+ * we're likely to see here.
6961+ */
6962+static struct sockaddr_in6 local_addr;
6963+
6964+/* Manage daemons */
c783755a
AM
6965+static struct task_struct *recv_task;
6966+static struct task_struct *send_task;
c1c6733f
AM
6967+
6968+static wait_queue_t lowcomms_send_waitq_head;
6969+static wait_queue_head_t lowcomms_send_waitq;
c1c6733f
AM
6970+static wait_queue_t lowcomms_recv_waitq_head;
6971+static wait_queue_head_t lowcomms_recv_waitq;
6972+
c783755a
AM
6973+/* An array of pointers to connections, indexed by NODEID */
6974+static struct connection **connections;
6975+static struct rw_semaphore connections_lock;
6976+static kmem_cache_t *con_cache;
6977+static int conn_array_size;
6978+static atomic_t accepting;
6979+
c1c6733f
AM
6980+/* List of sockets that have reads pending */
6981+static struct list_head read_sockets;
6982+static spinlock_t read_sockets_lock;
6983+
6984+/* List of sockets which have writes pending */
6985+static struct list_head write_sockets;
6986+static spinlock_t write_sockets_lock;
6987+
6988+/* List of sockets which have connects pending */
6989+static struct list_head state_sockets;
6990+static spinlock_t state_sockets_lock;
6991+
6992+/* List of allocated listen sockets */
6993+static struct list_head listen_sockets;
6994+
6995+static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6996+static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6997+
6998+
c783755a
AM
6999+static struct connection *nodeid2con(int nodeid, int allocation)
7000+{
7001+ struct connection *con = NULL;
7002+
7003+ down_read(&connections_lock);
7004+ if (nodeid >= conn_array_size) {
7005+ int new_size = nodeid + dlm_config.conn_increment;
7006+ struct connection **new_conns;
7007+
7008+ new_conns = kmalloc(sizeof(struct connection *) *
7009+ new_size, allocation);
7010+ if (!new_conns)
7011+ goto finish;
7012+
7013+ up_read(&connections_lock);
7014+ /* The worst that can happen here (I think), is that
7015+ we get two consecutive reallocations */
7016+ down_write(&connections_lock);
7017+
7018+ memset(new_conns, 0, sizeof(struct connection *) * new_size);
7019+ memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size);
7020+ conn_array_size = new_size;
7021+ kfree(connections);
7022+ connections = new_conns;
7023+
7024+ up_write(&connections_lock);
7025+ down_read(&connections_lock);
7026+ }
7027+
7028+ con = connections[nodeid];
7029+ if (con == NULL && allocation) {
7030+ con = kmem_cache_alloc(con_cache, allocation);
7031+ if (!con)
7032+ goto finish;
7033+
7034+ memset(con, 0, sizeof(*con));
7035+ con->nodeid = nodeid;
7036+ init_rwsem(&con->sock_sem);
7037+ INIT_LIST_HEAD(&con->writequeue);
7038+ spin_lock_init(&con->writequeue_lock);
7039+
7040+ connections[nodeid] = con;
7041+ }
7042+
7043+ finish:
7044+ up_read(&connections_lock);
7045+ return con;
7046+}
7047+
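A usage sketch of the lookup above (the numbers are hypothetical; the real growth step comes from dlm_config.conn_increment):

	/* With conn_increment == 32 the table starts with 32 slots.
	 * Looking up nodeid 100 first grows it to 132 slots, then
	 * allocates and caches a connection in slot 100: */
	struct connection *con = nodeid2con(100, GFP_KERNEL);
	struct connection *same = nodeid2con(100, 0);	/* lookup only */
	/* con == same; with allocation == 0 a missing entry yields NULL */
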
c1c6733f
AM
7048+/* Data is available on a socket, or a listening socket has received a connect */
7049+static void lowcomms_data_ready(struct sock *sk, int count_unused)
7050+{
7051+ struct connection *con = sock2con(sk);
7052+
b7b72b66 7053+ atomic_inc(&con->waiting_requests);
c1c6733f
AM
7054+ if (test_and_set_bit(CF_READ_PENDING, &con->flags))
7055+ return;
7056+
7057+ spin_lock_bh(&read_sockets_lock);
7058+ list_add_tail(&con->read_list, &read_sockets);
7059+ spin_unlock_bh(&read_sockets_lock);
7060+
7061+ wake_up_interruptible(&lowcomms_recv_waitq);
7062+}
7063+
7064+static void lowcomms_write_space(struct sock *sk)
7065+{
7066+ struct connection *con = sock2con(sk);
7067+
7068+ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
7069+ return;
7070+
7071+ spin_lock_bh(&write_sockets_lock);
7072+ list_add_tail(&con->write_list, &write_sockets);
7073+ spin_unlock_bh(&write_sockets_lock);
7074+
7075+ wake_up_interruptible(&lowcomms_send_waitq);
7076+}
7077+
7078+static inline void lowcomms_connect_sock(struct connection *con)
7079+{
7080+ if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
7081+ return;
7082+ if (!atomic_read(&accepting))
7083+ return;
7084+
7085+ spin_lock_bh(&state_sockets_lock);
7086+ list_add_tail(&con->state_list, &state_sockets);
7087+ spin_unlock_bh(&state_sockets_lock);
7088+
7089+ wake_up_interruptible(&lowcomms_send_waitq);
7090+}
7091+
7092+static void lowcomms_state_change(struct sock *sk)
7093+{
7094+/* struct connection *con = sock2con(sk); */
7095+
7096+ switch (sk->sk_state) {
7097+ case TCP_ESTABLISHED:
7098+ lowcomms_write_space(sk);
7099+ break;
7100+
7101+ case TCP_FIN_WAIT1:
7102+ case TCP_FIN_WAIT2:
7103+ case TCP_TIME_WAIT:
7104+ case TCP_CLOSE:
7105+ case TCP_CLOSE_WAIT:
7106+ case TCP_LAST_ACK:
7107+ case TCP_CLOSING:
7108+ /* FIXME: I think this causes more trouble than it solves.
7109+ lowcomms will reconnect anyway when there is something to
7110+ send. This just attempts reconnection if a node goes down!
7111+ */
7112+ /* lowcomms_connect_sock(con); */
7113+ break;
7114+
7115+ default:
7116+ printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
7117+ break;
7118+ }
7119+}
7120+
7121+/* Make a socket active */
7122+static int add_sock(struct socket *sock, struct connection *con)
7123+{
7124+ con->sock = sock;
7125+
7126+ /* Install a data_ready callback */
7127+ con->sock->sk->sk_data_ready = lowcomms_data_ready;
7128+ con->sock->sk->sk_write_space = lowcomms_write_space;
7129+ con->sock->sk->sk_state_change = lowcomms_state_change;
7130+
7131+ return 0;
7132+}
7133+
7134+/* Add the port number to an IP6 or 4 sockaddr and return the address
7135+ length */
7136+static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
7137+ int *addr_len)
7138+{
7139+ saddr->sin6_family = local_addr.sin6_family;
7140+ if (local_addr.sin6_family == AF_INET) {
b7b72b66
AM
7141+ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
7142+ in4_addr->sin_port = cpu_to_be16(port);
7143+ *addr_len = sizeof(struct sockaddr_in);
c1c6733f
AM
7144+ }
7145+ else {
b7b72b66
AM
7146+ saddr->sin6_port = cpu_to_be16(port);
7147+ *addr_len = sizeof(struct sockaddr_in6);
c1c6733f
AM
7148+ }
7149+}
7150+
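A usage sketch (the port number is hypothetical; the real one comes from dlm_config.tcp_port):

	struct sockaddr_in6 saddr;	/* big enough for either family */
	int addr_len;

	/* the node's address has already been copied into saddr, then: */
	make_sockaddr(&saddr, 21064, &addr_len);
	/* AF_INET:  sin_port = htons(21064), addr_len = sizeof(struct sockaddr_in)
	 * AF_INET6: sin6_port = htons(21064), addr_len = sizeof(struct sockaddr_in6) */
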
7151+/* Close a remote connection and tidy up */
c783755a 7152+static void close_connection(struct connection *con, int and_other)
c1c6733f 7153+{
c1c6733f
AM
7154+ down_write(&con->sock_sem);
7155+
7156+ if (con->sock) {
7157+ sock_release(con->sock);
7158+ con->sock = NULL;
c783755a
AM
7159+ if (con->othercon && and_other) {
7160+ /* Argh! recursion in kernel code!
7161+ Actually, this isn't a list so it
7162+ will only re-enter once.
7163+ */
7164+ close_connection(con->othercon, TRUE);
c1c6733f
AM
7165+ }
7166+ }
7167+ if (con->rx_page) {
7168+ __free_page(con->rx_page);
7169+ con->rx_page = NULL;
7170+ }
7171+ up_write(&con->sock_sem);
7172+}
7173+
7174+/* Data received from remote end */
7175+static int receive_from_sock(struct connection *con)
7176+{
7177+ int ret = 0;
7178+ struct msghdr msg;
7179+ struct iovec iov[2];
7180+ mm_segment_t fs;
7181+ unsigned len;
7182+ int r;
7183+ int call_again_soon = 0;
7184+
7185+ down_read(&con->sock_sem);
7186+
7187+ if (con->sock == NULL)
7188+ goto out;
7189+ if (con->rx_page == NULL) {
7190+ /*
7191+ * This allocation doesn't strictly need to be atomic, but
7192+ * GFP_ATOMIC avoids sleeping in the receive path.
7193+ */
7194+ con->rx_page = alloc_page(GFP_ATOMIC);
7195+ if (con->rx_page == NULL)
7196+ goto out_resched;
7197+ CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
7198+ }
b7b72b66 7199+
c1c6733f 7200+ /*
b7b72b66
AM
7201+ * To avoid doing too many short reads, we will reschedule for
7202+ * another time if there are fewer than 20 bytes of space left in the buffer.
c1c6733f 7203+ */
b7b72b66 7204+ if (!CBUF_MAY_ADD(&con->cb, 20))
c1c6733f
AM
7205+ goto out_resched;
7206+
7207+ msg.msg_control = NULL;
7208+ msg.msg_controllen = 0;
7209+ msg.msg_iovlen = 1;
7210+ msg.msg_iov = iov;
7211+ msg.msg_name = NULL;
7212+ msg.msg_namelen = 0;
7213+ msg.msg_flags = 0;
7214+
7215+ /*
7216+ * iov[0] is the bit of the circular buffer between the current end
7217+ * point (cb.base + cb.len) and the end of the buffer.
7218+ */
7219+ iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
7220+ iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
7221+ iov[1].iov_len = 0;
7222+
7223+ /*
7224+ * iov[1] is the bit of the circular buffer between the start of the
7225+ * buffer and the start of the currently used section (cb.base)
7226+ */
7227+ if (CBUF_DATA(&con->cb) >= con->cb.base) {
7228+ iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
7229+ iov[1].iov_len = con->cb.base;
7230+ iov[1].iov_base = page_address(con->rx_page);
7231+ msg.msg_iovlen = 2;
7232+ }
7233+ len = iov[0].iov_len + iov[1].iov_len;
7234+
7235+ fs = get_fs();
7236+ set_fs(get_ds());
7237+ r = ret = sock_recvmsg(con->sock, &msg, len,
7238+ MSG_DONTWAIT | MSG_NOSIGNAL);
7239+ set_fs(fs);
7240+
7241+ if (ret <= 0)
7242+ goto out_close;
7243+ if (ret == len)
7244+ call_again_soon = 1;
7245+ CBUF_ADD(&con->cb, ret);
7246+ ret = midcomms_process_incoming_buffer(con->nodeid,
7247+ page_address(con->rx_page),
7248+ con->cb.base, con->cb.len,
7249+ PAGE_CACHE_SIZE);
7250+ if (ret == -EBADMSG) {
7251+ printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
7252+ "iov_len=%u, iov_base[0]=%p, read=%d\n",
7253+ page_address(con->rx_page), con->cb.base, con->cb.len,
7254+ len, iov[0].iov_base, r);
7255+ }
7256+ if (ret < 0)
7257+ goto out_close;
7258+ CBUF_EAT(&con->cb, ret);
7259+
7260+ if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
7261+ __free_page(con->rx_page);
7262+ con->rx_page = NULL;
7263+ }
b7b72b66 7264+
c1c6733f
AM
7265+ out:
7266+ if (call_again_soon)
7267+ goto out_resched;
7268+ up_read(&con->sock_sem);
7269+ ret = 0;
7270+ goto out_ret;
7271+
7272+ out_resched:
7273+ lowcomms_data_ready(con->sock->sk, 0);
7274+ up_read(&con->sock_sem);
7275+ ret = 0;
7276+ goto out_ret;
7277+
7278+ out_close:
7279+ up_read(&con->sock_sem);
c783755a
AM
7280+ if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
7281+ close_connection(con, FALSE);
c1c6733f
AM
7282+ lowcomms_connect_sock(con);
7283+ }
7284+
7285+ out_ret:
7286+ return ret;
7287+}
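
A worked example of the two-iovec setup above, assuming PAGE_CACHE_SIZE is 4096:

	/* Suppose cb.base = 3990 and cb.len = 50, so CBUF_DATA() = 4040.
	 * CBUF_DATA() >= cb.base, so the free space wraps past the page end:
	 *   iov[0]: 56 bytes at offsets 4040..4095 (PAGE_CACHE_SIZE - 4040)
	 *   iov[1]: 3990 bytes at offsets 0..3989  (cb.base)
	 * If instead CBUF_DATA() < cb.base, the free space is the middle gap
	 * and only iov[0] is used: cb.base - CBUF_DATA() bytes. */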
7288+
7289+/* Listening socket is busy, accept a connection */
7290+static int accept_from_sock(struct connection *con)
7291+{
7292+ int result;
7293+ struct sockaddr_in6 peeraddr;
7294+ struct socket *newsock;
7295+ int len;
7296+ int nodeid;
7297+ struct connection *newcon;
7298+
7299+ memset(&peeraddr, 0, sizeof(peeraddr));
7300+ newsock = sock_alloc();
7301+ if (!newsock)
7302+ return -ENOMEM;
7303+
7304+ down_read(&con->sock_sem);
7305+
7306+ result = -ENOTCONN;
7307+ if (con->sock == NULL)
7308+ goto accept_err;
7309+
7310+ newsock->type = con->sock->type;
7311+ newsock->ops = con->sock->ops;
7312+
7313+ result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
7314+ if (result < 0)
7315+ goto accept_err;
7316+
7317+ /* Get the connected socket's peer */
7318+ if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
7319+ &len, 2)) {
7320+ result = -ECONNABORTED;
7321+ goto accept_err;
7322+ }
7323+
7324+ /* Get the new node's NODEID */
7325+ nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
7326+ if (nodeid == 0) {
7327+ printk("dlm: connect from non-cluster node\n");
7328+ sock_release(newsock);
7329+ up_read(&con->sock_sem);
7330+ return -1;
7331+ }
7332+
7333+ log_print("got connection from %d", nodeid);
7334+
7335+ /* Check to see if we already have a connection to this node. This
7336+ * could happen if the two nodes initiate a connection at roughly
7337+ * the same time and the connections cross on the wire.
7338+ * TEMPORARY FIX:
c783755a 7339+ * In this case we store the incoming one in "othercon"
c1c6733f 7340+ */
c783755a
AM
7341+ newcon = nodeid2con(nodeid, GFP_KERNEL);
7342+ if (!newcon) {
7343+ result = -ENOMEM;
7344+ goto accept_err;
7345+ }
c1c6733f
AM
7346+ down_write(&newcon->sock_sem);
7347+ if (newcon->sock) {
c783755a 7348+ struct connection *othercon = newcon->othercon;
c1c6733f 7349+
c1c6733f 7350+ if (!othercon) {
c783755a
AM
7351+ othercon = kmem_cache_alloc(con_cache, GFP_KERNEL);
7352+ if (!othercon) {
7353+ printk("dlm: failed to allocate incoming socket\n");
7354+ up_write(&newcon->sock_sem);
7355+ result = -ENOMEM;
7356+ goto accept_err;
7357+ }
7358+ memset(othercon, 0, sizeof(*othercon));
7359+ othercon->nodeid = nodeid;
7360+ othercon->rx_action = receive_from_sock;
7361+ init_rwsem(&othercon->sock_sem);
7362+ set_bit(CF_IS_OTHERCON, &othercon->flags);
7363+ newcon->othercon = othercon;
7364+ }
c1c6733f 7365+ othercon->sock = newsock;
c1c6733f 7366+ newsock->sk->sk_user_data = othercon;
b7b72b66
AM
7367+ add_sock(newsock, othercon);
7368+ }
7369+ else {
7370+ newsock->sk->sk_user_data = newcon;
7371+ newcon->rx_action = receive_from_sock;
7372+ add_sock(newsock, newcon);
c1c6733f 7373+
c1c6733f
AM
7374+ }
7375+
c1c6733f
AM
7376+ up_write(&newcon->sock_sem);
7377+
7378+ /*
7379+ * Add it to the active queue in case we got data
7380+ * between processing the accept and adding the socket
7381+ * to the read_sockets list
7382+ */
7383+ lowcomms_data_ready(newsock->sk, 0);
c1c6733f
AM
7384+ up_read(&con->sock_sem);
7385+
c1c6733f
AM
7386+ return 0;
7387+
7388+ accept_err:
7389+ up_read(&con->sock_sem);
7390+ sock_release(newsock);
7391+
b7b72b66
AM
7392+ if (result != -EAGAIN)
7393+ printk("dlm: error accepting connection from node: %d\n", result);
c1c6733f
AM
7394+ return result;
7395+}
7396+
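To make the crossed-connection case concrete (an illustrative scenario): suppose nodes 3 and 7 call connect_to_sock() at the same moment. On each node the outgoing socket stays in con->sock while the accepted incoming socket is parked in con->othercon; both are registered via add_sock(), so data arriving on either is read, and close_connection(con, TRUE) later tears down both.
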
7397+/* Connect a new socket to its peer */
7398+static int connect_to_sock(struct connection *con)
7399+{
7400+ int result = -EHOSTUNREACH;
7401+ struct sockaddr_in6 saddr;
7402+ int addr_len;
7403+ struct socket *sock;
7404+
7405+ if (con->nodeid == 0) {
7406+ log_print("attempt to connect sock 0 foiled");
7407+ return 0;
7408+ }
7409+
7410+ down_write(&con->sock_sem);
7411+ if (con->retries++ > MAX_CONNECT_RETRIES)
7412+ goto out;
7413+
7414+ /* FIXME: not sure this should happen, let alone like this. */
7415+ if (con->sock) {
7416+ sock_release(con->sock);
7417+ con->sock = NULL;
7418+ }
7419+
7420+ /* Create a socket to communicate with */
7421+ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
7422+ if (result < 0)
7423+ goto out_err;
7424+
b7b72b66 7425+ memset(&saddr, 0, sizeof(saddr));
c1c6733f
AM
7426+ if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
7427+ goto out_err;
7428+
7429+ sock->sk->sk_user_data = con;
7430+ con->rx_action = receive_from_sock;
7431+
7432+ make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
7433+
7434+ add_sock(sock, con);
c783755a
AM
7435+
7436+ log_print("connecting to %d", con->nodeid);
c1c6733f
AM
7437+ result =
7438+ sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
7439+ O_NONBLOCK);
7440+ if (result == -EINPROGRESS)
7441+ result = 0;
7442+ if (result != 0)
7443+ goto out_err;
7444+
7445+ out:
7446+ up_write(&con->sock_sem);
7447+ /*
7448+ * Returning an error here means we've given up trying to connect to
7449+ * a remote node, otherwise we return 0 and reschedule the connection
7450+ * attempt
7451+ */
7452+ return result;
7453+
7454+ out_err:
7455+ if (con->sock) {
7456+ sock_release(con->sock);
7457+ con->sock = NULL;
7458+ }
7459+ /*
7460+ * Some errors are fatal and this list might need adjusting. For other
7461+ * errors we try again until the max number of retries is reached.
7462+ */
7463+ if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
7464+ result != -ENETDOWN && result != -EINVAL
7465+ && result != -EPROTONOSUPPORT) {
7466+ lowcomms_connect_sock(con);
7467+ result = 0;
7468+ }
7469+ goto out;
7470+}
7471+
7472+static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
7473+{
7474+ struct socket *sock = NULL;
7475+ mm_segment_t fs;
7476+ int result = 0;
7477+ int one = 1;
7478+ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
7479+
7480+ /* Create a socket to communicate with */
7481+ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
7482+ if (result < 0) {
7483+ printk("dlm: Can't create listening comms socket\n");
7484+ goto create_out;
7485+ }
7486+
7487+ fs = get_fs();
7488+ set_fs(get_ds());
7489+ result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
7490+ set_fs(fs);
7491+ if (result < 0) {
7492+ printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
7493+ }
7494+ sock->sk->sk_user_data = con;
7495+ con->rx_action = accept_from_sock;
7496+ con->sock = sock;
7497+
7498+ /* Bind to our port */
7499+ make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
7500+ result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
7501+ if (result < 0) {
7502+ printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
7503+ sock_release(sock);
7504+ sock = NULL;
7505+ goto create_out;
7506+ }
7507+
7508+ fs = get_fs();
7509+ set_fs(get_ds());
7510+
7511+ result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
7512+ set_fs(fs);
7513+ if (result < 0) {
7514+ printk("dlm: Set keepalive failed: %d\n", result);
7515+ }
7516+
7517+ result = sock->ops->listen(sock, 5);
7518+ if (result < 0) {
7519+ printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
7520+ sock_release(sock);
7521+ sock = NULL;
7522+ goto create_out;
7523+ }
7524+
7525+ create_out:
7526+ return sock;
7527+}
7528+
7529+
7530+/* Listen on all interfaces */
7531+static int listen_for_all(void)
7532+{
7533+ int result = 0;
7534+ int nodeid;
7535+ struct socket *sock = NULL;
7536+ struct list_head *addr_list;
c783755a 7537+ struct connection *con = nodeid2con(0, GFP_KERNEL);
b7b72b66 7538+ struct connection *temp;
c1c6733f
AM
7539+ struct cluster_node_addr *node_addr;
7540+ char local_addr[sizeof(struct sockaddr_in6)];
7541+
7542+ /* This will also fill in local_addr */
7543+ nodeid = lowcomms_our_nodeid();
7544+
7545+ addr_list = kcl_get_node_addresses(nodeid);
7546+ if (!addr_list) {
7547+ printk("dlm: cannot initialise comms layer\n");
7548+ result = -ENOTCONN;
7549+ goto create_out;
7550+ }
7551+
7552+ list_for_each_entry(node_addr, addr_list, list) {
7553+
7554+ if (!con) {
c783755a 7555+ con = kmem_cache_alloc(con_cache, GFP_KERNEL);
c1c6733f
AM
7556+ if (!con) {
7557+ printk("dlm: failed to allocate listen socket\n");
b7b72b66
AM
7558+ result = -ENOMEM;
7559+ goto create_free;
c1c6733f
AM
7560+ }
7561+ memset(con, 0, sizeof(*con));
7562+ init_rwsem(&con->sock_sem);
7563+ spin_lock_init(&con->writequeue_lock);
7564+ INIT_LIST_HEAD(&con->writequeue);
c783755a 7565+ set_bit(CF_IS_OTHERCON, &con->flags);
c1c6733f
AM
7566+ }
7567+
7568+ memcpy(local_addr, node_addr->addr, node_addr->addr_len);
7569+ sock = create_listen_sock(con, local_addr,
7570+ node_addr->addr_len);
7571+ if (sock) {
7572+ add_sock(sock, con);
b7b72b66
AM
7573+
7574+ /* Keep a list of dynamically allocated listening sockets
7575+ so we can free them at shutdown */
c783755a 7576+ if (test_bit(CF_IS_OTHERCON, &con->flags)) {
b7b72b66
AM
7577+ list_add_tail(&con->listenlist, &listen_sockets);
7578+ }
c1c6733f
AM
7579+ }
7580+ else {
b7b72b66 7581+ result = -EADDRINUSE;
c783755a 7582+ kmem_cache_free(con_cache, con);
b7b72b66 7583+ goto create_free;
c1c6733f
AM
7584+ }
7585+
c1c6733f
AM
7586+ con = NULL;
7587+ }
7588+
7589+ create_out:
7590+ return result;
b7b72b66
AM
7591+
7592+ create_free:
7593+ /* Free up any dynamically allocated listening sockets */
7594+ list_for_each_entry_safe(con, temp, &listen_sockets, listenlist) {
7595+ sock_release(con->sock);
c783755a 7596+ kmem_cache_free(con_cache, con);
b7b72b66
AM
7597+ }
7598+ return result;
c1c6733f
AM
7599+}
7600+
7601+
7602+
7603+static struct writequeue_entry *new_writequeue_entry(struct connection *con,
7604+ int allocation)
7605+{
7606+ struct writequeue_entry *entry;
7607+
7608+ entry = kmalloc(sizeof(struct writequeue_entry), allocation);
7609+ if (!entry)
7610+ return NULL;
7611+
7612+ entry->page = alloc_page(allocation);
7613+ if (!entry->page) {
7614+ kfree(entry);
7615+ return NULL;
7616+ }
7617+
7618+ entry->offset = 0;
7619+ entry->len = 0;
7620+ entry->end = 0;
7621+ entry->users = 0;
7622+ entry->con = con;
7623+
7624+ return entry;
7625+}
7626+
7627+struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7628+ int allocation, char **ppc)
7629+{
c783755a 7630+ struct connection *con = nodeid2con(nodeid, allocation);
c1c6733f
AM
7631+ struct writequeue_entry *e;
7632+ int offset = 0;
7633+ int users = 0;
7634+
c783755a
AM
7635+ if (!con)
7636+ return NULL;
7637+
c1c6733f
AM
7638+ if (!atomic_read(&accepting))
7639+ return NULL;
7640+
7641+ spin_lock(&con->writequeue_lock);
7642+ e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
7643+ if (((struct list_head *) e == &con->writequeue) ||
7644+ (PAGE_CACHE_SIZE - e->end < len)) {
7645+ e = NULL;
7646+ } else {
7647+ offset = e->end;
7648+ e->end += len;
7649+ users = e->users++;
7650+ }
7651+ spin_unlock(&con->writequeue_lock);
7652+
7653+ if (e) {
7654+ got_one:
7655+ if (users == 0)
7656+ kmap(e->page);
7657+ *ppc = page_address(e->page) + offset;
7658+ return e;
7659+ }
7660+
7661+ e = new_writequeue_entry(con, allocation);
7662+ if (e) {
7663+ spin_lock(&con->writequeue_lock);
7664+ offset = e->end;
7665+ e->end += len;
7666+ users = e->users++;
7667+ list_add_tail(&e->list, &con->writequeue);
7668+ spin_unlock(&con->writequeue_lock);
c1c6733f
AM
7669+ goto got_one;
7670+ }
7671+ return NULL;
7672+}
7673+
7674+void lowcomms_commit_buffer(struct writequeue_entry *e)
7675+{
7676+ struct connection *con = e->con;
7677+ int users;
7678+
7679+ if (!atomic_read(&accepting))
7680+ return;
7681+
7682+ spin_lock(&con->writequeue_lock);
7683+ users = --e->users;
7684+ if (users)
7685+ goto out;
7686+ e->len = e->end - e->offset;
7687+ kunmap(e->page);
7688+ spin_unlock(&con->writequeue_lock);
7689+
7690+ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7691+ spin_lock_bh(&write_sockets_lock);
7692+ list_add_tail(&con->write_list, &write_sockets);
7693+ spin_unlock_bh(&write_sockets_lock);
7694+
7695+ wake_up_interruptible(&lowcomms_send_waitq);
7696+ }
7697+ return;
7698+
7699+ out:
7700+ spin_unlock(&con->writequeue_lock);
7701+ return;
7702+}
7703+
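The pair above implements a simple page-coalescing scheme: several messages bound for the same node can be built back to back in one page, which is only queued for sending once the last writer commits. A sketch of the calling pattern (msg1, msg2 and nodeid are placeholders, error handling omitted; the real caller is lowcomms_send_message() below):

	char *p1, *p2;
	struct writequeue_entry *e1, *e2;

	e1 = lowcomms_get_buffer(nodeid, 64, GFP_KERNEL, &p1);
	e2 = lowcomms_get_buffer(nodeid, 32, GFP_KERNEL, &p2);
	/* if both fit in one page: e2 == e1 and p2 == p1 + 64 */
	memcpy(p1, msg1, 64);
	memcpy(p2, msg2, 32);
	lowcomms_commit_buffer(e1);	/* users 2 -> 1, nothing queued yet */
	lowcomms_commit_buffer(e2);	/* users 1 -> 0, page queued for dlm_sendd */
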
7704+static void free_entry(struct writequeue_entry *e)
7705+{
7706+ __free_page(e->page);
7707+ kfree(e);
c1c6733f
AM
7708+}
7709+
7710+/* Send a message */
7711+static int send_to_sock(struct connection *con)
7712+{
7713+ int ret = 0;
7714+ ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7715+ const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7716+ struct writequeue_entry *e;
7717+ int len, offset;
7718+
7719+ down_read(&con->sock_sem);
7720+ if (con->sock == NULL)
7721+ goto out_connect;
7722+
7723+ sendpage = con->sock->ops->sendpage;
7724+
7725+ spin_lock(&con->writequeue_lock);
7726+ for (;;) {
7727+ e = list_entry(con->writequeue.next, struct writequeue_entry,
7728+ list);
7729+ if ((struct list_head *) e == &con->writequeue)
7730+ break;
7731+
7732+ len = e->len;
7733+ offset = e->offset;
7734+ BUG_ON(len == 0 && e->users == 0);
7735+ spin_unlock(&con->writequeue_lock);
7736+
7737+ ret = 0;
7738+ if (len) {
7739+ ret = sendpage(con->sock, e->page, offset, len,
7740+ msg_flags);
7741+ if (ret == -EAGAIN || ret == 0)
7742+ goto out;
7743+ if (ret <= 0)
7744+ goto send_error;
7745+ }
7746+
7747+ spin_lock(&con->writequeue_lock);
7748+ e->offset += ret;
7749+ e->len -= ret;
7750+
7751+ if (e->len == 0 && e->users == 0) {
7752+ list_del(&e->list);
7753+ free_entry(e);
7754+ continue;
7755+ }
7756+ }
7757+ spin_unlock(&con->writequeue_lock);
7758+ out:
7759+ up_read(&con->sock_sem);
7760+ return ret;
7761+
7762+ send_error:
7763+ up_read(&con->sock_sem);
c783755a 7764+ close_connection(con, FALSE);
c1c6733f
AM
7765+ lowcomms_connect_sock(con);
7766+ return ret;
7767+
7768+ out_connect:
7769+ up_read(&con->sock_sem);
7770+ lowcomms_connect_sock(con);
7771+ return 0;
7772+}
7773+
c783755a
AM
7774+static void clean_one_writequeue(struct connection *con)
7775+{
7776+ struct list_head *list;
7777+ struct list_head *temp;
7778+
7779+ spin_lock(&con->writequeue_lock);
7780+ list_for_each_safe(list, temp, &con->writequeue) {
7781+ struct writequeue_entry *e =
7782+ list_entry(list, struct writequeue_entry, list);
7783+ list_del(&e->list);
7784+ free_entry(e);
7785+ }
7786+ spin_unlock(&con->writequeue_lock);
7787+}
7788+
7789+/* Called from recovery when it knows that a node has
c1c6733f
AM
7790+ left the cluster */
7791+int lowcomms_close(int nodeid)
7792+{
7793+ struct connection *con;
7794+
7795+ if (!connections)
7796+ goto out;
7797+
c783755a
AM
7798+ log_print("closing connection to node %d", nodeid);
7799+ con = nodeid2con(nodeid, 0);
7800+ if (con) {
7801+ close_connection(con, TRUE);
7802+ clean_one_writequeue(con);
7803+ atomic_set(&con->waiting_requests, 0);
c1c6733f 7804+ }
c783755a 7805+ return 0;
c1c6733f
AM
7806+
7807+ out:
7808+ return -1;
7809+}
7810+
7811+/* API send message call, may queue the request */
7812+/* N.B. This is the old interface - use the new one for new calls */
7813+int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7814+{
7815+ struct writequeue_entry *e;
7816+ char *b;
7817+
c1c6733f
AM
7818+ e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7819+ if (e) {
7820+ memcpy(b, buf, len);
7821+ lowcomms_commit_buffer(e);
7822+ return 0;
7823+ }
7824+ return -ENOBUFS;
7825+}
7826+
7827+/* Look for activity on active sockets */
7828+static void process_sockets(void)
7829+{
7830+ struct list_head *list;
7831+ struct list_head *temp;
7832+
7833+ spin_lock_bh(&read_sockets_lock);
7834+ list_for_each_safe(list, temp, &read_sockets) {
7835+ struct connection *con =
7836+ list_entry(list, struct connection, read_list);
7837+ list_del(&con->read_list);
7838+ clear_bit(CF_READ_PENDING, &con->flags);
7839+
7840+ spin_unlock_bh(&read_sockets_lock);
7841+
c783755a 7842+ /* This can reach zero if we are processing requests
b7b72b66
AM
7843+ * as they come in.
7844+ */
7845+ if (atomic_read(&con->waiting_requests) == 0) {
7846+ spin_lock_bh(&read_sockets_lock);
7847+ continue;
7848+ }
7849+
7850+ do {
7851+ con->rx_action(con);
c783755a
AM
7852+ } while (!atomic_dec_and_test(&con->waiting_requests) &&
7853+ !kthread_should_stop());
c1c6733f
AM
7854+
7855+ /* Don't starve out everyone else */
7856+ schedule();
7857+ spin_lock_bh(&read_sockets_lock);
7858+ }
7859+ spin_unlock_bh(&read_sockets_lock);
7860+}
7861+
7862+/* Try to send any messages that are pending
7863+ */
7864+static void process_output_queue(void)
7865+{
7866+ struct list_head *list;
7867+ struct list_head *temp;
7868+ int ret;
7869+
7870+ spin_lock_bh(&write_sockets_lock);
7871+ list_for_each_safe(list, temp, &write_sockets) {
7872+ struct connection *con =
7873+ list_entry(list, struct connection, write_list);
7874+ list_del(&con->write_list);
7875+ clear_bit(CF_WRITE_PENDING, &con->flags);
7876+
7877+ spin_unlock_bh(&write_sockets_lock);
7878+
7879+ ret = send_to_sock(con);
7880+ if (ret < 0) {
7881+ }
7882+ spin_lock_bh(&write_sockets_lock);
7883+ }
7884+ spin_unlock_bh(&write_sockets_lock);
7885+}
7886+
7887+static void process_state_queue(void)
7888+{
7889+ struct list_head *list;
7890+ struct list_head *temp;
7891+ int ret;
7892+
7893+ spin_lock_bh(&state_sockets_lock);
7894+ list_for_each_safe(list, temp, &state_sockets) {
7895+ struct connection *con =
7896+ list_entry(list, struct connection, state_list);
7897+ list_del(&con->state_list);
7898+ clear_bit(CF_CONNECT_PENDING, &con->flags);
7899+ spin_unlock_bh(&state_sockets_lock);
7900+
7901+ ret = connect_to_sock(con);
7902+ if (ret < 0) {
7903+ }
7904+ spin_lock_bh(&state_sockets_lock);
7905+ }
7906+ spin_unlock_bh(&state_sockets_lock);
7907+}
7908+
c783755a 7909+
c1c6733f
AM
7910+/* Discard all entries on the write queues */
7911+static void clean_writequeues(void)
7912+{
c1c6733f
AM
7913+ int nodeid;
7914+
c783755a
AM
7915+ for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
7916+ struct connection *con = nodeid2con(nodeid, 0);
c1c6733f 7917+
c783755a
AM
7918+ if (con)
7919+ clean_one_writequeue(con);
c1c6733f
AM
7920+ }
7921+}
7922+
7923+static int read_list_empty(void)
7924+{
7925+ int status;
7926+
7927+ spin_lock_bh(&read_sockets_lock);
7928+ status = list_empty(&read_sockets);
7929+ spin_unlock_bh(&read_sockets_lock);
7930+
7931+ return status;
7932+}
7933+
7934+/* DLM Transport comms receive daemon */
7935+static int dlm_recvd(void *data)
7936+{
c1c6733f
AM
7937+ init_waitqueue_head(&lowcomms_recv_waitq);
7938+ init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7939+ add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7940+
c783755a
AM
7941+ while (!kthread_should_stop()) {
7942+ set_current_state(TASK_INTERRUPTIBLE);
c1c6733f
AM
7943+ if (read_list_empty())
7944+ schedule();
c783755a 7945+ set_current_state(TASK_RUNNING);
c1c6733f
AM
7946+
7947+ process_sockets();
7948+ }
7949+
c1c6733f
AM
7950+ return 0;
7951+}
7952+
7953+static int write_and_state_lists_empty(void)
7954+{
7955+ int status;
7956+
7957+ spin_lock_bh(&write_sockets_lock);
7958+ status = list_empty(&write_sockets);
7959+ spin_unlock_bh(&write_sockets_lock);
7960+
7961+ spin_lock_bh(&state_sockets_lock);
7962+ if (list_empty(&state_sockets) == 0)
7963+ status = 0;
7964+ spin_unlock_bh(&state_sockets_lock);
7965+
7966+ return status;
7967+}
7968+
7969+/* DLM Transport send daemon */
7970+static int dlm_sendd(void *data)
7971+{
c1c6733f
AM
7972+ init_waitqueue_head(&lowcomms_send_waitq);
7973+ init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7974+ add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7975+
c783755a
AM
7976+ while (!kthread_should_stop()) {
7977+ set_current_state(TASK_INTERRUPTIBLE);
c1c6733f
AM
7978+ if (write_and_state_lists_empty())
7979+ schedule();
c783755a 7980+ set_current_state(TASK_RUNNING);
c1c6733f
AM
7981+
7982+ process_state_queue();
7983+ process_output_queue();
7984+ }
7985+
c1c6733f
AM
7986+ return 0;
7987+}
7988+
7989+static void daemons_stop(void)
7990+{
c783755a
AM
7991+ kthread_stop(recv_task);
7992+ kthread_stop(send_task);
c1c6733f
AM
7993+}
7994+
7995+static int daemons_start(void)
7996+{
c783755a 7997+ struct task_struct *p;
c1c6733f
AM
7998+ int error;
7999+
d3b4771f 8000+ p = kthread_run(dlm_recvd, NULL, 0, "dlm_recvd");
c783755a
AM
8001+ if (IS_ERR(p)) {
8002+ error = PTR_ERR(p);
8003+ log_print("can't start dlm_recvd %d", error);
8004+ return error;
c1c6733f 8005+ }
c783755a 8006+ recv_task = p;
c1c6733f 8007+
d3b4771f 8008+ p = kthread_run(dlm_sendd, NULL, 0, "dlm_sendd");
c783755a
AM
8009+ if (IS_ERR(p)) {
8010+ error = PTR_ERR(p);
8011+ log_print("can't start dlm_sendd %d", error);
8012+ kthread_stop(recv_task);
8013+ return error;
c1c6733f 8014+ }
c783755a 8015+ send_task = p;
c1c6733f 8016+
c783755a 8017+ return 0;
c1c6733f
AM
8018+}
8019+
8020+/*
8021+ * Return the largest buffer size we can cope with.
8022+ */
8023+int lowcomms_max_buffer_size(void)
8024+{
8025+ return PAGE_CACHE_SIZE;
8026+}
8027+
8028+void lowcomms_stop(void)
8029+{
8030+ int i;
8031+ struct connection *temp;
8032+ struct connection *lcon;
8033+
8034+ atomic_set(&accepting, 0);
8035+
8036+ /* Set all the activity flags to prevent any
8037+ socket activity.
8038+ */
8039+ for (i = 0; i < conn_array_size; i++) {
c783755a
AM
8040+ if (connections[i])
8041+ connections[i]->flags = 0x7;
c1c6733f
AM
8042+ }
8043+ daemons_stop();
8044+ clean_writequeues();
8045+
8046+ for (i = 0; i < conn_array_size; i++) {
c783755a
AM
8047+ if (connections[i]) {
8048+ close_connection(connections[i], TRUE);
8049+ if (connections[i]->othercon)
8050+ kmem_cache_free(con_cache, connections[i]->othercon);
8051+ kmem_cache_free(con_cache, connections[i]);
8052+ }
c1c6733f
AM
8053+ }
8054+
8055+ kfree(connections);
8056+ connections = NULL;
8057+
8058+ /* Free up any dynamically allocated listening sockets */
8059+ list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
8060+ sock_release(lcon->sock);
c783755a 8061+ kmem_cache_free(con_cache, lcon);
c1c6733f
AM
8062+ }
8063+
c783755a 8064+ kmem_cache_destroy(con_cache);
c1c6733f
AM
8065+ kcl_releaseref_cluster();
8066+}
8067+
8068+/* This is quite likely to sleep... */
8069+int lowcomms_start(void)
8070+{
8071+ int error = 0;
bb1d8b11
AM
8072+ struct connection *temp;
8073+ struct connection *lcon;
c1c6733f
AM
8074+
8075+ INIT_LIST_HEAD(&read_sockets);
8076+ INIT_LIST_HEAD(&write_sockets);
8077+ INIT_LIST_HEAD(&state_sockets);
8078+ INIT_LIST_HEAD(&listen_sockets);
8079+
8080+ spin_lock_init(&read_sockets_lock);
8081+ spin_lock_init(&write_sockets_lock);
8082+ spin_lock_init(&state_sockets_lock);
c783755a 8083+ init_rwsem(&connections_lock);
c1c6733f
AM
8084+
8085+ error = -ENOTCONN;
8086+ if (kcl_addref_cluster())
8087+ goto out;
8088+
8089+ /*
8090+ * Temporarily initialise the waitq head so that lowcomms_send_message
8091+ * doesn't crash if it gets called before the thread is fully
8092+ * initialised
8093+ */
8094+ init_waitqueue_head(&lowcomms_send_waitq);
8095+
8096+ error = -ENOMEM;
c783755a
AM
8097+ connections = kmalloc(sizeof(struct connection *) *
8098+ dlm_config.conn_increment, GFP_KERNEL);
c1c6733f
AM
8099+ if (!connections)
8100+ goto out;
8101+
8102+ memset(connections, 0,
c783755a
AM
8103+ sizeof(struct connection *) * dlm_config.conn_increment);
8104+
8105+ conn_array_size = dlm_config.conn_increment;
8106+
8107+ con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
8108+ __alignof__(struct connection), 0, NULL, NULL);
8109+ if (!con_cache)
8110+ goto fail_free_conn;
8111+
c1c6733f
AM
8112+
8113+ /* Start listening */
8114+ error = listen_for_all();
8115+ if (error)
bb1d8b11 8116+ goto fail_unlisten;
c1c6733f
AM
8117+
8118+ error = daemons_start();
8119+ if (error)
bb1d8b11 8120+ goto fail_unlisten;
c1c6733f
AM
8121+
8122+ atomic_set(&accepting, 1);
8123+
8124+ return 0;
8125+
bb1d8b11
AM
8126+ fail_unlisten:
8127+ close_connection(connections[0], 0);
8128+ kmem_cache_free(con_cache, connections[0]);
8129+ list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
8130+ sock_release(lcon->sock);
8131+ kmem_cache_free(con_cache, lcon);
8132+ }
8133+
c783755a
AM
8134+ kmem_cache_destroy(con_cache);
8135+
c1c6733f 8136+ fail_free_conn:
c783755a 8137+ kcl_releaseref_cluster();
c1c6733f
AM
8138+ kfree(connections);
8139+
8140+ out:
8141+ return error;
8142+}
8143+
8144+/* Don't accept any more outgoing work */
8145+void lowcomms_stop_accept(void)
8146+{
8147+ atomic_set(&accepting, 0);
8148+}
8149+
8150+/* Cluster Manager interface functions for looking up
8151+ nodeids and IP addresses by each other
8152+*/
8153+
8154+/* Return the IP address of a node given its NODEID */
8155+static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
8156+{
8157+ struct list_head *addrs;
8158+ struct cluster_node_addr *node_addr;
8159+ struct cluster_node_addr *current_addr = NULL;
8160+ struct sockaddr_in6 *saddr;
8161+ int interface;
8162+ int i;
8163+
8164+ addrs = kcl_get_node_addresses(nodeid);
8165+ if (!addrs)
8166+ return -1;
8167+
8168+ interface = kcl_get_current_interface();
8169+
8170+ /* Look for address number <interface> */
8171+ i=0; /* i/f numbers start at 1 */
8172+ list_for_each_entry(node_addr, addrs, list) {
8173+ if (interface == ++i) {
8174+ current_addr = node_addr;
8175+ break;
8176+ }
8177+ }
8178+
8179+ /* If that failed then just use the first one */
8180+ if (!current_addr)
8181+ current_addr = (struct cluster_node_addr *)addrs->next;
8182+
8183+ saddr = (struct sockaddr_in6 *)current_addr->addr;
8184+
8185+ /* Extract the IP address */
b7b72b66 8186+ if (local_addr.sin6_family == AF_INET) {
c1c6733f
AM
8187+ struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
8188+ struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
8189+ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
8190+ }
8191+ else {
8192+ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
8193+ memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
8194+ }
8195+
8196+ return 0;
8197+}
8198+
8199+/* Return the NODEID for a node given its sockaddr */
8200+static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
8201+{
8202+ struct kcl_cluster_node node;
8203+ struct sockaddr_in6 ipv6_addr;
8204+ struct sockaddr_in ipv4_addr;
8205+
b7b72b66 8206+ if (local_addr.sin6_family == AF_INET) {
c1c6733f
AM
8207+ struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
8208+ memcpy(&ipv4_addr, &local_addr, addr_len);
8209+ memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
8210+
8211+ addr = (struct sockaddr *)&ipv4_addr;
8212+ }
8213+ else {
8214+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
8215+ memcpy(&ipv6_addr, &local_addr, addr_len);
8216+ memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
8217+
8218+ addr = (struct sockaddr *)&ipv6_addr;
8219+ }
8220+
8221+ if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
8222+ return node.node_id;
8223+ else
8224+ return 0;
8225+}
8226+
8227+int lowcomms_our_nodeid(void)
8228+{
8229+ struct kcl_cluster_node node;
8230+ struct list_head *addrs;
8231+ struct cluster_node_addr *first_addr;
8232+ static int our_nodeid = 0;
8233+
8234+ if (our_nodeid)
8235+ return our_nodeid;
8236+
8237+ if (kcl_get_node_by_nodeid(0, &node) == -1)
8238+ return 0;
8239+
8240+ our_nodeid = node.node_id;
8241+
8242+ /* Fill in the "template" structure */
8243+ addrs = kcl_get_node_addresses(our_nodeid);
8244+ if (!addrs)
8245+ return 0;
8246+
8247+ first_addr = (struct cluster_node_addr *) addrs->next;
8248+ memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
8249+
8250+ return node.node_id;
8251+}
8252+/*
8253+ * Overrides for Emacs so that we follow Linus's tabbing style.
8254+ * Emacs will notice this stuff at the end of the file and automatically
8255+ * adjust the settings for this buffer only. This must remain at the end
8256+ * of the file.
8257+ * ---------------------------------------------------------------------------
8258+ * Local variables:
8259+ * c-file-style: "linux"
8260+ * End:
8261+ */
8262diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
8263--- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 8264+++ linux-patched/cluster/dlm/lowcomms.h 2004-11-03 11:31:56.000000000 +0800
c1c6733f
AM
8265@@ -0,0 +1,34 @@
8266+/******************************************************************************
8267+*******************************************************************************
8268+**
8269+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8270+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8271+**
8272+** This copyrighted material is made available to anyone wishing to use,
8273+** modify, copy, or redistribute it subject to the terms and conditions
8274+** of the GNU General Public License v.2.
8275+**
8276+*******************************************************************************
8277+******************************************************************************/
8278+
8279+#ifndef __LOWCOMMS_DOT_H__
8280+#define __LOWCOMMS_DOT_H__
8281+
8282+/* The old interface */
8283+int lowcomms_send_message(int csid, char *buf, int len, int allocation);
8284+
8285+/* The new interface */
8286+struct writequeue_entry;
8287+extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
8288+ int allocation, char **ppc);
8289+extern void lowcomms_commit_buffer(struct writequeue_entry *e);
8290+
8291+int lowcomms_start(void);
8292+void lowcomms_stop(void);
8293+void lowcomms_stop_accept(void);
8294+int lowcomms_close(int nodeid);
8295+int lowcomms_max_buffer_size(void);
8296+
8297+int lowcomms_our_nodeid(void);
8298+
8299+#endif /* __LOWCOMMS_DOT_H__ */
8300diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
8301--- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 8302+++ linux-patched/cluster/dlm/main.c 2004-11-03 11:31:56.000000000 +0800
c783755a 8303@@ -0,0 +1,93 @@
c1c6733f
AM
8304+/******************************************************************************
8305+*******************************************************************************
8306+**
8307+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8308+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8309+**
8310+** This copyrighted material is made available to anyone wishing to use,
8311+** modify, copy, or redistribute it subject to the terms and conditions
8312+** of the GNU General Public License v.2.
8313+**
8314+*******************************************************************************
8315+******************************************************************************/
8316+
8317+#define EXPORT_SYMTAB
8318+
8319+#include <linux/init.h>
8320+#include <linux/proc_fs.h>
8321+#include <linux/ctype.h>
c1c6733f
AM
8322+#include <linux/module.h>
8323+#include <net/sock.h>
8324+
8325+#include <cluster/cnxman.h>
8326+
8327+#include "dlm_internal.h"
8328+#include "lockspace.h"
c1c6733f
AM
8329+#include "ast.h"
8330+#include "lkb.h"
8331+#include "nodes.h"
8332+#include "locking.h"
8333+#include "config.h"
8334+#include "memory.h"
8335+#include "recover.h"
8336+#include "lowcomms.h"
8337+
8338+int dlm_device_init(void);
8339+void dlm_device_exit(void);
8340+void dlm_proc_init(void);
8341+void dlm_proc_exit(void);
8342+
8343+
8344+/* Cluster manager callbacks: we want to know if a node dies.
8345+ N.B. this is independent of lockspace-specific event callbacks from SM */
8346+
8347+static void cman_callback(kcl_callback_reason reason, long arg)
8348+{
c1c6733f
AM
8349+ /* This is unconditional, so do what we can to tidy up */
8350+ if (reason == LEAVING) {
8351+ dlm_emergency_shutdown();
8352+ }
8353+}
8354+
8355+int __init init_dlm(void)
8356+{
8357+ dlm_proc_init();
8358+ dlm_lockspace_init();
c1c6733f
AM
8359+ dlm_nodes_init();
8360+ dlm_device_init();
8361+ dlm_memory_init();
8362+ dlm_config_init();
8363+
8364+ kcl_add_callback(cman_callback);
8365+
8366+ printk("DLM %s (built %s %s) installed\n",
8367+ DLM_RELEASE_NAME, __DATE__, __TIME__);
8368+
8369+ return 0;
8370+}
8371+
8372+void __exit exit_dlm(void)
8373+{
8374+ kcl_remove_callback(cman_callback);
8375+
8376+ dlm_device_exit();
8377+ dlm_memory_exit();
8378+ dlm_config_exit();
8379+ dlm_proc_exit();
8380+}
8381+
8382+MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
8383+MODULE_AUTHOR("Red Hat, Inc.");
8384+MODULE_LICENSE("GPL");
8385+
8386+module_init(init_dlm);
8387+module_exit(exit_dlm);
8388+
8389+EXPORT_SYMBOL(dlm_init);
8390+EXPORT_SYMBOL(dlm_release);
8391+EXPORT_SYMBOL(dlm_new_lockspace);
8392+EXPORT_SYMBOL(dlm_release_lockspace);
8393+EXPORT_SYMBOL(dlm_lock);
8394+EXPORT_SYMBOL(dlm_unlock);
b7b72b66
AM
8395+EXPORT_SYMBOL(dlm_debug_dump);
8396+EXPORT_SYMBOL(dlm_locks_dump);
c1c6733f
AM
8397diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
8398--- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 8399+++ linux-patched/cluster/dlm/memory.c 2004-11-03 11:31:56.000000000 +0800
c1c6733f
AM
8400@@ -0,0 +1,238 @@
8401+/******************************************************************************
8402+*******************************************************************************
8403+**
8404+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8405+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8406+**
8407+** This copyrighted material is made available to anyone wishing to use,
8408+** modify, copy, or redistribute it subject to the terms and conditions
8409+** of the GNU General Public License v.2.
8410+**
8411+*******************************************************************************
8412+******************************************************************************/
8413+
8414+/* memory.c
8415+ *
8416+ * memory allocation routines
8417+ *
8418+ */
8419+
8420+#include "dlm_internal.h"
8421+#include "memory.h"
8422+#include "config.h"
8423+
8424+/* as the man says...Shouldn't this be in a header file somewhere? */
8425+#define BYTES_PER_WORD sizeof(void *)
8426+
8427+static kmem_cache_t *rsb_cache_small;
8428+static kmem_cache_t *rsb_cache_large;
8429+static kmem_cache_t *lkb_cache;
8430+static kmem_cache_t *lvb_cache;
8431+static kmem_cache_t *resdir_cache_large;
8432+static kmem_cache_t *resdir_cache_small;
8433+
b7b72b66 8434+/* The thresholds above which we allocate large RSBs/direntries rather than small
c1c6733f
AM
8435+ * ones. This must make the resultant structure end on a word boundary */
8436+#define LARGE_RSB_NAME 28
8437+#define LARGE_RES_NAME 28
8438+
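dlm_memory_init() below rounds the small-cache object sizes up to a word boundary with the usual mask trick. A worked example, assuming a 64-bit machine (BYTES_PER_WORD == 8) and a hypothetical sizeof(struct dlm_rsb) of 68:

	/* (size + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1) rounds up to 8:
	 *   68 + LARGE_RSB_NAME = 96; (96 + 7) & ~7 = 96 (already aligned)
	 *   a 97-byte object would round up to (97 + 7) & ~7 = 104 */
	/* Names with namelen >= 28 go to the large caches, shorter names
	 * to the word-rounded small ones (see allocate_rsb() below). */
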
8439+int dlm_memory_init(void)
8440+{
8441+ int ret = -ENOMEM;
8442+
8443+
8444+ rsb_cache_small =
8445+ kmem_cache_create("dlm_rsb(small)",
b7b72b66
AM
8446+ (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
8447+ __alignof__(struct dlm_rsb), 0, NULL, NULL);
c1c6733f
AM
8448+ if (!rsb_cache_small)
8449+ goto out;
8450+
8451+ rsb_cache_large =
8452+ kmem_cache_create("dlm_rsb(large)",
b7b72b66
AM
8453+ sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN,
8454+ __alignof__(struct dlm_rsb), 0, NULL, NULL);
c1c6733f
AM
8455+ if (!rsb_cache_large)
8456+ goto out_free_rsbs;
8457+
b7b72b66
AM
8458+ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
8459+ __alignof__(struct dlm_lkb), 0, NULL, NULL);
c1c6733f
AM
8460+ if (!lkb_cache)
8461+ goto out_free_rsbl;
8462+
8463+ resdir_cache_large =
8464+ kmem_cache_create("dlm_resdir(l)",
b7b72b66
AM
8465+ sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN,
8466+ __alignof__(struct dlm_direntry), 0, NULL, NULL);
c1c6733f
AM
8467+ if (!resdir_cache_large)
8468+ goto out_free_lkb;
8469+
8470+ resdir_cache_small =
8471+ kmem_cache_create("dlm_resdir(s)",
b7b72b66
AM
8472+ (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
8473+ __alignof__(struct dlm_direntry), 0, NULL, NULL);
c1c6733f
AM
8474+ if (!resdir_cache_small)
8475+ goto out_free_resl;
8476+
8477+ /* LVB cache also holds ranges, so should be 64bit aligned */
8478+ lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
8479+ __alignof__(uint64_t), 0, NULL, NULL);
8480+ if (!lvb_cache)
8481+ goto out_free_ress;
8482+
8483+ ret = 0;
8484+ goto out;
8485+
8486+ out_free_ress:
8487+ kmem_cache_destroy(resdir_cache_small);
8488+
8489+ out_free_resl:
8490+ kmem_cache_destroy(resdir_cache_large);
8491+
8492+ out_free_lkb:
8493+ kmem_cache_destroy(lkb_cache);
8494+
8495+ out_free_rsbl:
8496+ kmem_cache_destroy(rsb_cache_large);
8497+
8498+ out_free_rsbs:
8499+ kmem_cache_destroy(rsb_cache_small);
8500+
8501+ out:
8502+ return ret;
8503+}
8504+
8505+void dlm_memory_exit(void)
8506+{
8507+ kmem_cache_destroy(rsb_cache_large);
8508+ kmem_cache_destroy(rsb_cache_small);
8509+ kmem_cache_destroy(lkb_cache);
8510+ kmem_cache_destroy(resdir_cache_small);
8511+ kmem_cache_destroy(resdir_cache_large);
8512+ kmem_cache_destroy(lvb_cache);
8513+}
8514+
b7b72b66 8515+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
c1c6733f 8516+{
b7b72b66 8517+ struct dlm_rsb *r;
c1c6733f 8518+
b7b72b66 8519+ DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
c1c6733f
AM
8520+
8521+ if (namelen >= LARGE_RSB_NAME)
8522+ r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
8523+ else
8524+ r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
8525+
8526+ if (r)
b7b72b66 8527+ memset(r, 0, sizeof(struct dlm_rsb) + namelen);
c1c6733f
AM
8528+
8529+ return r;
8530+}
8531+
b7b72b66 8532+void free_rsb(struct dlm_rsb *r)
c1c6733f
AM
8533+{
8534+ int length = r->res_length;
8535+
8536+#ifdef POISON
b7b72b66 8537+ memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length);
c1c6733f
AM
8538+#endif
8539+
8540+ if (length >= LARGE_RSB_NAME)
8541+ kmem_cache_free(rsb_cache_large, r);
8542+ else
8543+ kmem_cache_free(rsb_cache_small, r);
8544+}
8545+
b7b72b66 8546+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
c1c6733f 8547+{
b7b72b66 8548+ struct dlm_lkb *l;
c1c6733f
AM
8549+
8550+ l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
8551+ if (l)
b7b72b66 8552+ memset(l, 0, sizeof(struct dlm_lkb));
c1c6733f
AM
8553+
8554+ return l;
8555+}
8556+
b7b72b66 8557+void free_lkb(struct dlm_lkb *l)
c1c6733f
AM
8558+{
8559+#ifdef POISON
b7b72b66 8560+ memset(l, 0xAA, sizeof(struct dlm_lkb));
c1c6733f
AM
8561+#endif
8562+ kmem_cache_free(lkb_cache, l);
8563+}
8564+
b7b72b66 8565+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
c1c6733f 8566+{
b7b72b66 8567+ struct dlm_direntry *rd;
c1c6733f 8568+
b7b72b66 8569+ DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
c1c6733f
AM
8570+
8571+ if (namelen >= LARGE_RES_NAME)
8572+ rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
8573+ else
8574+ rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
8575+
8576+ if (rd)
b7b72b66 8577+ memset(rd, 0, sizeof(struct dlm_direntry));
c1c6733f
AM
8578+
8579+ return rd;
8580+}
8581+
b7b72b66 8582+void free_direntry(struct dlm_direntry *de)
c1c6733f 8583+{
b7b72b66
AM
8584+ if (de->length >= LARGE_RES_NAME)
8585+ kmem_cache_free(resdir_cache_large, de);
c1c6733f 8586+ else
b7b72b66 8587+ kmem_cache_free(resdir_cache_small, de);
c1c6733f
AM
8588+}
8589+
b7b72b66 8590+char *allocate_lvb(struct dlm_ls *ls)
c1c6733f
AM
8591+{
8592+ char *l;
8593+
8594+ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8595+ if (l)
8596+ memset(l, 0, DLM_LVB_LEN);
8597+
8598+ return l;
8599+}
8600+
8601+void free_lvb(char *l)
8602+{
8603+ kmem_cache_free(lvb_cache, l);
8604+}
8605+
8606+/* Ranges are allocated from the LVB cache as they are the same size (4x64
8607+ * bits) */
b7b72b66 8608+uint64_t *allocate_range(struct dlm_ls * ls)
c1c6733f
AM
8609+{
8610+ uint64_t *l;
8611+
8612+ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8613+ if (l)
8614+ memset(l, 0, DLM_LVB_LEN);
8615+
8616+ return l;
8617+}
8618+
8619+void free_range(uint64_t *l)
8620+{
8621+ kmem_cache_free(lvb_cache, l);
8622+}
8623+
b7b72b66 8624+struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls)
c1c6733f 8625+{
b7b72b66 8626+ struct dlm_rcom *rc;
c1c6733f
AM
8627+
8628+ rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
8629+ if (rc)
8630+ memset(rc, 0, dlm_config.buffer_size);
8631+
8632+ return rc;
8633+}
8634+
b7b72b66 8635+void free_rcom_buffer(struct dlm_rcom *rc)
c1c6733f
AM
8636+{
8637+ kfree(rc);
8638+}
8639diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
8640--- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 8641+++ linux-patched/cluster/dlm/memory.h 2004-11-03 11:31:56.000000000 +0800
c1c6733f
AM
8642@@ -0,0 +1,32 @@
8643+/******************************************************************************
8644+*******************************************************************************
8645+**
8646+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8647+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8648+**
8649+** This copyrighted material is made available to anyone wishing to use,
8650+** modify, copy, or redistribute it subject to the terms and conditions
8651+** of the GNU General Public License v.2.
8652+**
8653+*******************************************************************************
8654+******************************************************************************/
8655+
8656+#ifndef __MEMORY_DOT_H__
8657+#define __MEMORY_DOT_H__
8658+
8659+int dlm_memory_init(void);
8660+void dlm_memory_exit(void);
b7b72b66
AM
8661+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
8662+void free_rsb(struct dlm_rsb *r);
8663+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
8664+void free_lkb(struct dlm_lkb *l);
8665+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
8666+void free_direntry(struct dlm_direntry *de);
8667+char *allocate_lvb(struct dlm_ls *ls);
c1c6733f 8668+void free_lvb(char *l);
b7b72b66
AM
8669+struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls);
8670+void free_rcom_buffer(struct dlm_rcom *rc);
8671+uint64_t *allocate_range(struct dlm_ls *ls);
8672+void free_range(uint64_t *l);
c1c6733f
AM
8673+
8674+#endif /* __MEMORY_DOT_H__ */
8675diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8676--- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 8677+++ linux-patched/cluster/dlm/midcomms.c 2004-11-03 11:31:56.000000000 +0800
c783755a 8678@@ -0,0 +1,355 @@
c1c6733f
AM
8679+/******************************************************************************
8680+*******************************************************************************
8681+**
8682+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8683+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8684+**
8685+** This copyrighted material is made available to anyone wishing to use,
8686+** modify, copy, or redistribute it subject to the terms and conditions
8687+** of the GNU General Public License v.2.
8688+**
8689+*******************************************************************************
8690+******************************************************************************/
8691+
8692+/*
8693+ * midcomms.c
8694+ *
8695+ * This is the appallingly named "mid-level" comms layer.
8696+ *
8697+ * Its purpose is to take buffers from the "real" comms layer,
8698+ * split them up into messages and pass them to the interested
8699+ * part of the locking mechanism.
8700+ *
8701+ * It also takes messages from the locking layer, formats them
8702+ * into packets and sends them to the comms layer.
8703+ *
8704+ * It knows the format of the mid-level messages and the nodeids
8705+ * but it does not know how to resolve a nodeid into an IP address
8706+ * or any of the comms channel details.
8707+ *
8708+ */
8709+
8710+#include "dlm_internal.h"
8711+#include "lowcomms.h"
8712+#include "midcomms.h"
8713+#include "lockqueue.h"
8714+#include "nodes.h"
8715+#include "reccomms.h"
8716+#include "config.h"
8717+
8718+/* Byteorder routines */
8719+
8720+static void host_to_network(void *msg)
8721+{
b7b72b66
AM
8722+ struct dlm_header *head = msg;
8723+ struct dlm_request *req = msg;
8724+ struct dlm_reply *rep = msg;
8725+ struct dlm_query_request *qreq = msg;
8726+ struct dlm_query_reply *qrep= msg;
8727+ struct dlm_rcom *rc = msg;
c1c6733f
AM
8728+
8729+ /* Force into the on-wire byte order (little-endian) */
8730+
8731+ /*
8732+ * Do the common header first
8733+ */
8734+
8735+ head->rh_length = cpu_to_le16(head->rh_length);
8736+ head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8737+ /* Leave the lkid alone as it is transparent at the remote end */
8738+
8739+ /*
8740+ * Do the fields in the remlockrequest or remlockreply structs
8741+ */
8742+
8743+ switch (req->rr_header.rh_cmd) {
8744+
8745+ case GDLM_REMCMD_LOCKREQUEST:
8746+ case GDLM_REMCMD_CONVREQUEST:
8747+ req->rr_range_start = cpu_to_le64(req->rr_range_start);
8748+ req->rr_range_end = cpu_to_le64(req->rr_range_end);
8749+ /* Deliberate fall through */
8750+ case GDLM_REMCMD_UNLOCKREQUEST:
8751+ case GDLM_REMCMD_LOOKUP:
8752+ case GDLM_REMCMD_LOCKGRANT:
8753+ case GDLM_REMCMD_SENDBAST:
8754+ case GDLM_REMCMD_SENDCAST:
8755+ case GDLM_REMCMD_REM_RESDATA:
8756+ req->rr_flags = cpu_to_le32(req->rr_flags);
8757+ req->rr_status = cpu_to_le32(req->rr_status);
8758+ break;
8759+
8760+ case GDLM_REMCMD_LOCKREPLY:
b7b72b66
AM
8761+ rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate);
8762+ rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid);
8763+ rep->rl_status = cpu_to_le32(rep->rl_status);
c1c6733f
AM
8764+ break;
8765+
8766+ case GDLM_REMCMD_RECOVERMESSAGE:
8767+ case GDLM_REMCMD_RECOVERREPLY:
8768+ rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8769+ rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8770+ break;
8771+
8772+ case GDLM_REMCMD_QUERY:
b7b72b66
AM
8773+ qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid);
8774+ qreq->rq_query = cpu_to_le32(qreq->rq_query);
8775+ qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks);
c1c6733f
AM
8776+ break;
8777+
8778+ case GDLM_REMCMD_QUERYREPLY:
b7b72b66
AM
8779+ qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks);
8780+ qrep->rq_status = cpu_to_le32(qrep->rq_status);
8781+ qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount);
8782+ qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount);
8783+ qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount);
c1c6733f
AM
8784+ break;
8785+
8786+ default:
8787+ printk("dlm: warning, unknown REMCMD type %u\n",
8788+ req->rr_header.rh_cmd);
8789+ }
8790+}
8791+
8792+static void network_to_host(void *msg)
8793+{
b7b72b66
AM
8794+ struct dlm_header *head = msg;
8795+ struct dlm_request *req = msg;
8796+ struct dlm_reply *rep = msg;
8797+ struct dlm_query_request *qreq = msg;
8798+ struct dlm_query_reply *qrep = msg;
8799+ struct dlm_rcom *rc = msg;
8800+
8801+ /* Force into host byte order */
8802+
8803+ /*
8804+ * Do the common header first
8805+ */
8806+
8807+ head->rh_length = le16_to_cpu(head->rh_length);
8808+ head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8809+ /* Leave the lkid alone as it is transparent at the remote end */
8810+
8811+ /*
8812+ * Do the fields in the remlockrequest or remlockreply structs
8813+ */
8814+
8815+ switch (req->rr_header.rh_cmd) {
8816+
8817+ case GDLM_REMCMD_LOCKREQUEST:
8818+ case GDLM_REMCMD_CONVREQUEST:
8819+ req->rr_range_start = le64_to_cpu(req->rr_range_start);
8820+ req->rr_range_end = le64_to_cpu(req->rr_range_end);
8820+ /* Deliberate fall through */
8821+ case GDLM_REMCMD_LOOKUP:
8822+ case GDLM_REMCMD_UNLOCKREQUEST:
8823+ case GDLM_REMCMD_LOCKGRANT:
8824+ case GDLM_REMCMD_SENDBAST:
8825+ case GDLM_REMCMD_SENDCAST:
8826+ case GDLM_REMCMD_REM_RESDATA:
8827+ /* Actually, not much to do here as the remote lock IDs are
8828+ * transparent too */
8829+ req->rr_flags = le32_to_cpu(req->rr_flags);
8830+ req->rr_status = le32_to_cpu(req->rr_status);
8831+ break;
8832+
8833+ case GDLM_REMCMD_LOCKREPLY:
8834+ rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate);
8835+ rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid);
8836+ rep->rl_status = le32_to_cpu(rep->rl_status);
8837+ break;
8838+
8839+ case GDLM_REMCMD_RECOVERMESSAGE:
8840+ case GDLM_REMCMD_RECOVERREPLY:
8841+ rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8842+ rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8843+ break;
8844+
8845+
8846+ case GDLM_REMCMD_QUERY:
8847+ qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid);
8848+ qreq->rq_query = le32_to_cpu(qreq->rq_query);
8849+ qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks);
8850+ break;
8851+
8852+ case GDLM_REMCMD_QUERYREPLY:
8853+ qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks);
8854+ qrep->rq_status = le32_to_cpu(qrep->rq_status);
8855+ qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount);
8856+ qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount);
8857+ qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount);
8858+ break;
8859+
8860+ default:
8861+ printk("dlm: warning, unknown REMCMD type %u\n",
8862+ req->rr_header.rh_cmd);
8863+ }
8864+}
8865+
8866+static void copy_from_cb(void *dst, const void *base, unsigned offset,
8867+ unsigned len, unsigned limit)
8868+{
8869+ unsigned copy = len;
8870+
8871+ if ((copy + offset) > limit)
8872+ copy = limit - offset;
8873+ memcpy(dst, base + offset, copy);
8874+ len -= copy;
8875+ if (len)
8876+ memcpy(dst + copy, base, len);
8877+}
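copy_from_cb() reads out of a circular receive buffer: offset is where the data starts, limit is the buffer size, and a read that runs off the end wraps back to the front. A worked example with assumed numbers, for a 4096-byte ring:

	char dst[24];
	/* 24 bytes starting 8 bytes short of the end of the ring: */
	copy_from_cb(dst, base, 4088, 24, 4096);
	/* copies base[4088..4095] (8 bytes), then wraps around and
	   copies base[0..15] (the remaining 16 bytes) */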
8878+
8879+static void khexdump(const unsigned char *c, int len)
8880+{
8881+ while (len > 16) {
8882+ printk(KERN_INFO
8883+ "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8884+ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8885+ c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8886+ len -= 16;
c783755a 8887+ c += 16;
8888+ }
8889+ while (len > 4) {
8890+ printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8891+ c[3]);
8892+ len -= 4;
c783755a 8893+ c += 4;
8894+ }
8895+ while (len > 0) {
8896+ printk(KERN_INFO "%02x\n", c[0]);
8897+ len--;
c783755a 8898+ c++;
8899+ }
8900+}
8901+
8902+/*
8903+ * Called from the low-level comms layer to process a buffer of
8904+ * commands.
8905+ *
8906+ * Only complete messages are processed here, any "spare" bytes from
8907+ * the end of a buffer are saved and tacked onto the front of the next
8908+ * message that comes in. I doubt this will happen very often but we
8909+ * need to be able to cope with it and I don't want the task to be waiting
8910+ * for packets to come in when there is useful work to be done.
8911+ *
8912+ */
8913+int midcomms_process_incoming_buffer(int nodeid, const void *base,
8914+ unsigned offset, unsigned len,
8915+ unsigned limit)
8916+{
8917+ unsigned char __tmp[sizeof(struct dlm_header) + 64];
8918+ struct dlm_header *msg = (struct dlm_header *) __tmp;
8919+ int ret = 0;
8920+ int err = 0;
8921+ unsigned msglen;
8922+ __u32 id, space;
8923+
b7b72b66 8924+ while (len > sizeof(struct dlm_header)) {
c1c6733f 8925+ /* Get message header and check it over */
b7b72b66 8926+ copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
8927+ limit);
8928+ msglen = le16_to_cpu(msg->rh_length);
8929+ id = msg->rh_lkid;
8930+ space = msg->rh_lockspace;
8931+
8932+ /* Check message size */
8933+ err = -EINVAL;
b7b72b66 8934+ if (msglen < sizeof(struct dlm_header))
8935+ break;
8936+ err = -E2BIG;
8937+ if (msglen > dlm_config.buffer_size) {
8938+ printk("dlm: message size from node %d too big %d (pkt len=%d)\n", nodeid, msglen, len);
8939+ khexdump((const unsigned char *) msg, len);
8940+ break;
8941+ }
8942+ err = 0;
8943+
8944+ /* Not enough in buffer yet? wait for some more */
8945+ if (msglen > len)
8946+ break;
8947+
8948+ /* Make sure our temp buffer is large enough */
8949+ if (msglen > sizeof(__tmp) &&
b7b72b66 8950+ msg == (struct dlm_header *) __tmp) {
8951+ msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8952+ if (msg == NULL)
8953+ return ret;
8954+ }
8955+
8956+ copy_from_cb(msg, base, offset, msglen, limit);
8957+ BUG_ON(id != msg->rh_lkid);
8958+ BUG_ON(space != msg->rh_lockspace);
8959+ ret += msglen;
8960+ offset += msglen;
8961+ offset &= (limit - 1);
8962+ len -= msglen;
8963+ network_to_host(msg);
8964+
8965+ if ((msg->rh_cmd > 32) ||
8966+ (msg->rh_cmd == 0) ||
b7b72b66 8967+ (msg->rh_length < sizeof(struct dlm_header)) ||
8968+ (msg->rh_length > dlm_config.buffer_size)) {
8969+
8970+ printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8971+ "lkid=%u, lockspace=%u\n",
8972+ msg->rh_cmd, msg->rh_flags, msg->rh_length,
8973+ msg->rh_lkid, msg->rh_lockspace);
8974+
8975+ printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8976+ "ret=%u, limit=%08x newbuf=%d\n",
8977+ base, offset, len, ret, limit,
b7b72b66 8978+ ((struct dlm_header *) __tmp == msg));
8979+
8980+ khexdump((const unsigned char *) msg, msg->rh_length);
8981+
8982+ return -EBADMSG;
8983+ }
8984+
8985+ switch (msg->rh_cmd) {
8986+ case GDLM_REMCMD_RECOVERMESSAGE:
8987+ case GDLM_REMCMD_RECOVERREPLY:
8988+ process_recovery_comm(nodeid, msg);
8989+ break;
8990+ default:
8991+ process_cluster_request(nodeid, msg, FALSE);
8992+ }
8993+ }
8994+
b7b72b66 8995+ if (msg != (struct dlm_header *) __tmp)
8996+ kfree(msg);
8997+
8998+ return err ? err : ret;
8999+}
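The return value is the count of whole-message bytes consumed; a trailing partial message is deliberately left behind for the caller to offer again once more data has arrived. A minimal sketch of the calling pattern, with hypothetical ring-buffer bookkeeping (the real caller lives in lowcomms.c):

	int consumed = midcomms_process_incoming_buffer(nodeid, ring_base,
							ring_start, ring_len,
							ring_size);
	if (consumed > 0) {
		/* ring_size must be a power of two, as the offset masking
		   above (offset &= limit - 1) assumes */
		ring_start = (ring_start + consumed) & (ring_size - 1);
		ring_len -= consumed;	/* leftovers wait for the next read */
	}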
9000+
9001+/*
9002+ * Send a lowcomms buffer
9003+ */
9004+
b7b72b66 9005+void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e)
9006+{
9007+ host_to_network(msg);
9008+ lowcomms_commit_buffer(e);
9009+}
9010+
9011+/*
9012+ * Make the message into network byte order and send it
9013+ */
9014+
b7b72b66 9015+int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg,
9016+ int allocation)
9017+{
9018+ int len = msg->rh_length;
9019+
9020+ host_to_network(msg);
9021+
9022+ /*
9023+ * Loopback. In fact, the locking code pretty much prevents this from
9024+ * being needed but it can happen when the directory node is also the
9025+ * local node.
9026+ */
9027+
9028+ if (nodeid == our_nodeid())
9029+ return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
9030+ len, len);
9031+
9032+ return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
9033+}
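Because of the loopback branch, callers never need to special-case the local node. A hedged usage sketch, eliding how the request itself is built:

	/* 'req' is a fully formatted request, still in host byte order */
	error = midcomms_send_message(master_nodeid, &req->rr_header,
				      GFP_KERNEL);
	/* the call is identical whether master_nodeid is remote or
	   happens to equal our_nodeid() */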
9034diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
9035--- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 9036+++ linux-patched/cluster/dlm/midcomms.h 2004-11-03 11:31:56.000000000 +0800
9037@@ -0,0 +1,24 @@
9038+/******************************************************************************
9039+*******************************************************************************
9040+**
9041+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9042+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9043+**
9044+** This copyrighted material is made available to anyone wishing to use,
9045+** modify, copy, or redistribute it subject to the terms and conditions
9046+** of the GNU General Public License v.2.
9047+**
9048+*******************************************************************************
9049+******************************************************************************/
9050+
9051+#ifndef __MIDCOMMS_DOT_H__
9052+#define __MIDCOMMS_DOT_H__
9053+
b7b72b66 9054+int midcomms_send_message(uint32_t csid, struct dlm_header *msg,
9055+ int allocation);
9056+int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
9057+ unsigned len, unsigned limit);
b7b72b66 9058+void midcomms_send_buffer(struct dlm_header *msg,
9059+ struct writequeue_entry *e);
9060+
9061+#endif /* __MIDCOMMS_DOT_H__ */
9062diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
9063--- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
9064+++ linux-patched/cluster/dlm/nodes.c 2004-11-03 11:31:56.000000000 +0800
9065@@ -0,0 +1,347 @@
9066+/******************************************************************************
9067+*******************************************************************************
9068+**
9069+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9070+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9071+**
9072+** This copyrighted material is made available to anyone wishing to use,
9073+** modify, copy, or redistribute it subject to the terms and conditions
9074+** of the GNU General Public License v.2.
9075+**
9076+*******************************************************************************
9077+******************************************************************************/
9078+
9079+#include <net/sock.h>
9080+#include <cluster/cnxman.h>
9081+
9082+#include "dlm_internal.h"
9083+#include "lowcomms.h"
9084+#include "nodes.h"
9085+#include "recover.h"
9086+#include "reccomms.h"
9087+#include "util.h"
9088+
9089+static struct list_head cluster_nodes;
9090+static spinlock_t node_lock;
9091+
9092+
9093+void dlm_nodes_init(void)
9094+{
9095+ INIT_LIST_HEAD(&cluster_nodes);
9096+ spin_lock_init(&node_lock);
9097+}
9098+
b7b72b66 9099+static struct dlm_node *search_node(uint32_t nodeid)
c1c6733f 9100+{
b7b72b66 9101+ struct dlm_node *node;
c1c6733f 9102+
9103+ list_for_each_entry(node, &cluster_nodes, list) {
9104+ if (node->nodeid == nodeid)
9105+ goto out;
9106+ }
9107+ node = NULL;
c783755a 9108+ out:
9109+ return node;
9110+}
9111+
b7b72b66 9112+static void put_node(struct dlm_node *node)
9113+{
9114+ spin_lock(&node_lock);
9115+ if (atomic_dec_and_test(&node->refcount)) {
9116+ lowcomms_close(node->nodeid);
b7b72b66 9117+ list_del(&node->list);
9118+ spin_unlock(&node_lock);
9119+ kfree(node);
9120+ return;
9121+ }
9122+ spin_unlock(&node_lock);
9123+}
9124+
b7b72b66 9125+static int get_node(uint32_t nodeid, struct dlm_node **ndp)
c1c6733f 9126+{
b7b72b66 9127+ struct dlm_node *node, *node2;
9128+ int error = -ENOMEM;
9129+
9130+ spin_lock(&node_lock);
9131+ node = search_node(nodeid);
9132+ if (node)
c783755a 9133+ atomic_inc(&node->refcount);
9134+ spin_unlock(&node_lock);
9135+
9136+ if (node)
9137+ goto out;
9138+
b7b72b66 9139+ node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL);
9140+ if (!node)
9141+ goto fail;
9142+
9143+ memset(node, 0, sizeof(struct dlm_node));
9144+ node->nodeid = nodeid;
9145+
9146+ spin_lock(&node_lock);
9147+ node2 = search_node(nodeid);
9148+ if (node2) {
c783755a 9149+ atomic_inc(&node2->refcount);
9150+ spin_unlock(&node_lock);
9151+ kfree(node);
9152+ node = node2;
9153+ goto out;
9154+ }
9155+
c783755a 9156+ atomic_set(&node->refcount, 1);
b7b72b66 9157+ list_add_tail(&node->list, &cluster_nodes);
9158+ spin_unlock(&node_lock);
9159+
c783755a 9160+ out:
9161+ *ndp = node;
9162+ return 0;
c783755a 9163+ fail:
9164+ return error;
9165+}
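get_node() is the usual optimistic-allocation idiom: search under the spinlock, allocate outside it (kmalloc with GFP_KERNEL may sleep, so the lock cannot be held), then search again in case another task created the node in the meantime. The same shape in a self-contained user-space sketch, for readers unfamiliar with the pattern:

	#include <pthread.h>
	#include <stdlib.h>

	struct obj { int key; int refcount; struct obj *next; };

	static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct obj *table;

	static struct obj *search(int key)	/* caller holds tbl_lock */
	{
		struct obj *o;
		for (o = table; o; o = o->next)
			if (o->key == key)
				return o;
		return NULL;
	}

	static struct obj *get_obj(int key)
	{
		struct obj *o, *fresh;

		pthread_mutex_lock(&tbl_lock);
		o = search(key);
		if (o)
			o->refcount++;
		pthread_mutex_unlock(&tbl_lock);
		if (o)
			return o;

		fresh = calloc(1, sizeof(*fresh));	/* allocated unlocked */
		if (!fresh)
			return NULL;
		fresh->key = key;

		pthread_mutex_lock(&tbl_lock);
		o = search(key);	/* re-check: did we lose a race? */
		if (o) {
			o->refcount++;
			free(fresh);	/* someone else added it first */
		} else {
			fresh->refcount = 1;
			fresh->next = table;
			table = o = fresh;
		}
		pthread_mutex_unlock(&tbl_lock);
		return o;
	}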
9166+
b7b72b66 9167+int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb)
c1c6733f 9168+{
9169+ struct dlm_csb *csb;
9170+ struct dlm_node *node;
9171+ int error = -ENOMEM;
9172+
b7b72b66 9173+ csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL);
9174+ if (!csb)
9175+ goto fail;
9176+
b7b72b66 9177+ memset(csb, 0, sizeof(struct dlm_csb));
9178+
9179+ error = get_node(nodeid, &node);
9180+ if (error)
9181+ goto fail_free;
9182+
b7b72b66 9183+ csb->node = node;
9184+ *ret_csb = csb;
9185+ return 0;
9186+
c783755a 9187+ fail_free:
c1c6733f 9188+ kfree(csb);
c783755a 9189+ fail:
9190+ return error;
9191+}
9192+
b7b72b66 9193+void release_csb(struct dlm_csb *csb)
c1c6733f 9194+{
b7b72b66 9195+ put_node(csb->node);
9196+ kfree(csb);
9197+}
9198+
9199+uint32_t our_nodeid(void)
9200+{
9201+ return lowcomms_our_nodeid();
9202+}
9203+
9204+static void make_node_array(struct dlm_ls *ls)
9205+{
9206+ struct dlm_csb *csb;
9207+ uint32_t *array;
9208+ int i = 0;
9209+
9210+ if (ls->ls_node_array) {
9211+ kfree(ls->ls_node_array);
9212+ ls->ls_node_array = NULL;
9213+ }
9214+
9215+ array = kmalloc(sizeof(uint32_t) * ls->ls_num_nodes, GFP_KERNEL);
9216+ if (!array)
9217+ return;
9218+
9219+ list_for_each_entry(csb, &ls->ls_nodes, list)
9220+ array[i++] = csb->node->nodeid;
9221+
9222+ ls->ls_node_array = array;
9223+}
9224+
b7b72b66 9225+int nodes_reconfig_wait(struct dlm_ls *ls)
9226+{
9227+ int error;
9228+
9229+ if (ls->ls_low_nodeid == our_nodeid()) {
b7b72b66 9230+ error = dlm_wait_status_all(ls, NODES_VALID);
9231+ if (!error)
9232+ set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
9233+
9234+ /* Experimental: this delay should allow any final messages
9235+ * from the previous node to be received before beginning
9236+ * recovery. */
9237+
9238+ if (ls->ls_num_nodes == 1) {
9239+ set_current_state(TASK_UNINTERRUPTIBLE);
9240+ schedule_timeout(2 * HZ);
9241+ }
9242+
9243+ } else
b7b72b66 9244+ error = dlm_wait_status_low(ls, NODES_ALL_VALID);
9245+
9246+ return error;
9247+}
9248+
b7b72b66 9249+static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new)
c1c6733f 9250+{
b7b72b66 9251+ struct dlm_csb *csb = NULL;
c1c6733f 9252+ struct list_head *tmp;
b7b72b66 9253+ struct list_head *newlist = &new->list;
9254+ struct list_head *head = &ls->ls_nodes;
9255+
9256+ list_for_each(tmp, head) {
b7b72b66 9257+ csb = list_entry(tmp, struct dlm_csb, list);
c1c6733f 9258+
b7b72b66 9259+ if (new->node->nodeid < csb->node->nodeid)
9260+ break;
9261+ }
9262+
9263+ if (!csb)
9264+ list_add_tail(newlist, head);
9265+ else {
9266+ /* FIXME: can use list macro here */
9267+ newlist->prev = tmp->prev;
9268+ newlist->next = tmp;
9269+ tmp->prev->next = newlist;
9270+ tmp->prev = newlist;
9271+ }
9272+}
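On the FIXME above: the four open-coded pointer assignments are exactly what the standard list helper does, since list_add_tail(new, pos) links new immediately in front of pos. The else branch could therefore read simply:

	} else
		list_add_tail(newlist, tmp);	/* insert before 'tmp' */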
9273+
b7b72b66 9274+int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
c1c6733f 9275+{
b7b72b66 9276+ struct dlm_csb *csb, *safe;
9277+ int error, i, found, pos = 0, neg = 0;
9278+ uint32_t low = (uint32_t) (-1);
9279+
9280+ /*
9281+ * Remove (and save) departed nodes from lockspace's nodes list
9282+ */
9283+
b7b72b66 9284+ list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) {
c1c6733f 9285+ found = FALSE;
9286+ for (i = 0; i < rv->node_count; i++) {
9287+ if (csb->node->nodeid == rv->nodeids[i]) {
9288+ found = TRUE;
9289+ break;
9290+ }
9291+ }
9292+
9293+ if (!found) {
9294+ neg++;
9295+ csb->gone_event = rv->event_id;
9296+ list_del(&csb->list);
9297+ list_add_tail(&csb->list, &ls->ls_nodes_gone);
c1c6733f 9298+ ls->ls_num_nodes--;
b7b72b66 9299+ log_all(ls, "remove node %u", csb->node->nodeid);
9300+ }
9301+ }
9302+
9303+ /*
9304+ * Add new nodes to lockspace's nodes list
9305+ */
9306+
b7b72b66 9307+ for (i = 0; i < rv->node_count; i++) {
c1c6733f 9308+ found = FALSE;
9309+ list_for_each_entry(csb, &ls->ls_nodes, list) {
9310+ if (csb->node->nodeid == rv->nodeids[i]) {
9311+ found = TRUE;
9312+ break;
9313+ }
9314+ }
9315+
9316+ if (!found) {
9317+ pos++;
9318+
9319+ error = init_new_csb(rv->nodeids[i], &csb);
9320+ DLM_ASSERT(!error,);
9321+
9322+ add_ordered_node(ls, csb);
9323+ ls->ls_num_nodes++;
b7b72b66 9324+ log_all(ls, "add node %u", csb->node->nodeid);
9325+ }
9326+ }
9327+
9328+ list_for_each_entry(csb, &ls->ls_nodes, list) {
9329+ if (csb->node->nodeid < low)
9330+ low = csb->node->nodeid;
9331+ }
9332+
c1c6733f 9333+ ls->ls_low_nodeid = low;
9334+ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
9335+ *neg_out = neg;
bb1d8b11 9336+ make_node_array(ls);
9337+
9338+ error = nodes_reconfig_wait(ls);
9339+
9340+ log_all(ls, "total nodes %d", ls->ls_num_nodes);
9341+
9342+ return error;
9343+}
9344+
9345+static void nodes_clear(struct list_head *head)
9346+{
9347+ struct dlm_csb *csb;
9348+
9349+ while (!list_empty(head)) {
9350+ csb = list_entry(head->next, struct dlm_csb, list);
9351+ list_del(&csb->list);
9352+ release_csb(csb);
9353+ }
9354+}
9355+
9356+void ls_nodes_clear(struct dlm_ls *ls)
9357+{
9358+ nodes_clear(&ls->ls_nodes);
9359+ ls->ls_num_nodes = 0;
9360+}
9361+
9362+void ls_nodes_gone_clear(struct dlm_ls *ls)
9363+{
9364+ nodes_clear(&ls->ls_nodes_gone);
9365+}
9366+
b7b72b66 9367+int ls_nodes_init(struct dlm_ls *ls, struct dlm_recover *rv)
c1c6733f 9368+{
b7b72b66 9369+ struct dlm_csb *csb;
9370+ int i, error;
9371+ uint32_t low = (uint32_t) (-1);
9372+
9373+ /* nodes may be left from a previous failed start */
9374+ ls_nodes_clear(ls);
9375+
9376+ log_all(ls, "add nodes");
9377+
9378+ for (i = 0; i < rv->node_count; i++) {
9379+ error = init_new_csb(rv->nodeids[i], &csb);
9380+ if (error)
9381+ goto fail;
9382+
9383+ add_ordered_node(ls, csb);
9384+ ls->ls_num_nodes++;
9385+
9386+ if (csb->node->nodeid < low)
9387+ low = csb->node->nodeid;
9388+ }
9389+
9390+ ls->ls_low_nodeid = low;
c1c6733f 9391+ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
bb1d8b11 9392+ make_node_array(ls);
9393+
9394+ error = nodes_reconfig_wait(ls);
9395+
9396+ log_all(ls, "total nodes %d", ls->ls_num_nodes);
c1c6733f 9397+ return error;
9398+ fail:
9399+ ls_nodes_clear(ls);
9400+ return error;
9401+}
9402+
b7b72b66 9403+int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid)
c1c6733f 9404+{
b7b72b66 9405+ struct dlm_csb *csb;
c1c6733f 9406+
9407+ list_for_each_entry(csb, &ls->ls_nodes_gone, list) {
9408+ if (csb->node->nodeid == nodeid)
9409+ return TRUE;
9410+ }
9411+ return FALSE;
9412+}
9413diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
9414--- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 9415+++ linux-patched/cluster/dlm/nodes.h 2004-11-03 11:31:56.000000000 +0800
c783755a 9416@@ -0,0 +1,27 @@
9417+/******************************************************************************
9418+*******************************************************************************
9419+**
9420+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9421+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9422+**
9423+** This copyrighted material is made available to anyone wishing to use,
9424+** modify, copy, or redistribute it subject to the terms and conditions
9425+** of the GNU General Public License v.2.
9426+**
9427+*******************************************************************************
9428+******************************************************************************/
9429+
9430+#ifndef __NODES_DOT_H__
9431+#define __NODES_DOT_H__
9432+
9433+void dlm_nodes_init(void);
9434+int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb);
9435+void release_csb(struct dlm_csb * csb);
c1c6733f 9436+uint32_t our_nodeid(void);
9437+int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg);
9438+int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr);
9439+int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid);
9440+void ls_nodes_clear(struct dlm_ls *ls);
9441+void ls_nodes_gone_clear(struct dlm_ls *ls);
9442+
9443+#endif /* __NODES_DOT_H__ */
9444diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
9445--- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
9446+++ linux-patched/cluster/dlm/proc.c 2004-11-03 11:31:56.000000000 +0800
9447@@ -0,0 +1,652 @@
9448+/******************************************************************************
9449+*******************************************************************************
9450+**
9451+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9452+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9453+**
9454+** This copyrighted material is made available to anyone wishing to use,
9455+** modify, copy, or redistribute it subject to the terms and conditions
9456+** of the GNU General Public License v.2.
9457+**
9458+*******************************************************************************
9459+******************************************************************************/
9460+
9461+#include <linux/init.h>
9462+#include <linux/proc_fs.h>
9463+#include <linux/ctype.h>
9464+#include <linux/seq_file.h>
9465+#include <linux/module.h>
9466+
9467+#include "dlm_internal.h"
9468+#include "lockspace.h"
9469+
9470+#if defined(DLM_DEBUG)
9471+#define DLM_DEBUG_SIZE (1024)
9472+#define MAX_DEBUG_MSG_LEN (64)
9473+#else
9474+#define DLM_DEBUG_SIZE (0)
9475+#define MAX_DEBUG_MSG_LEN (0)
9476+#endif
9477+
9478+static char * debug_buf;
9479+static unsigned int debug_size;
9480+static unsigned int debug_point;
9481+static int debug_wrap;
9482+static spinlock_t debug_lock;
9483+static struct proc_dir_entry * debug_proc_entry = NULL;
9484+static char proc_ls_name[255] = "";
9485+
9486+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9487+static struct proc_dir_entry * locks_proc_entry = NULL;
9488+static struct seq_operations locks_info_op;
9489+static struct proc_dir_entry * dir_proc_entry = NULL;
9490+static struct seq_operations dir_info_op;
9491+
9492+
9493+/*
9494+ * /proc/cluster/dlm_locks - dump resources and locks
9495+ */
9496+
9497+static int locks_open(struct inode *inode, struct file *file)
9498+{
9499+ return seq_open(file, &locks_info_op);
9500+}
9501+
9502+/* Write simply sets the lockspace to use */
9503+static ssize_t locks_write(struct file *file, const char *buf,
9504+ size_t count, loff_t * ppos)
9505+{
9506+ if (count && count < sizeof(proc_ls_name)) {
9507+ if (copy_from_user(proc_ls_name, buf, count))
9507+ return -EFAULT;
9508+ proc_ls_name[count] = '\0';
9509+
9510+ /* Remove any trailing LF so that lazy users
9511+ can just echo "lsname" > /proc/cluster/dlm_locks */
9512+ if (proc_ls_name[count - 1] == '\n')
9513+ proc_ls_name[count - 1] = '\0';
9514+
9515+ return count;
9516+ }
9517+ return 0;
9518+}
9519+
9520+static struct file_operations locks_fops = {
9521+ .open = locks_open,
9522+ .write = locks_write,
9523+ .read = seq_read,
9524+ .llseek = seq_lseek,
9525+ .release = seq_release,
9526+};
9527+
9528+struct ls_dumpinfo {
9529+ int entry;
9530+ struct list_head *next;
9531+ struct dlm_ls *ls;
9532+ struct dlm_rsb *rsb;
9533+ struct dlm_direntry *de;
9534+};
9535+
b7b72b66 9536+static int print_resource(struct dlm_rsb * res, struct seq_file *s);
9537+
9538+static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
9539+{
9540+ int i;
9541+
9542+ if (!di->next) {
9543+ /* Find the next non-empty hash bucket */
9544+ for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) {
9545+ read_lock(&di->ls->ls_rsbtbl[i].lock);
9546+ if (!list_empty(&di->ls->ls_rsbtbl[i].list)) {
9547+ di->next = di->ls->ls_rsbtbl[i].list.next;
9548+ read_unlock(&di->ls->ls_rsbtbl[i].lock);
9549+ break;
9550+ }
9551+ read_unlock(&di->ls->ls_rsbtbl[i].lock);
c1c6733f 9552+ }
b7b72b66 9553+ di->entry = i;
c1c6733f 9554+
9555+ if (di->entry >= di->ls->ls_rsbtbl_size)
9556+ return NULL; /* End of hash list */
c1c6733f 9557+ } else { /* Find the next entry in the list */
9558+ i = di->entry;
9559+ read_lock(&di->ls->ls_rsbtbl[i].lock);
c1c6733f 9560+ di->next = di->next->next;
b7b72b66 9561+ if (di->next->next == di->ls->ls_rsbtbl[i].list.next) {
9562+ /* End of list - move to next bucket */
9563+ di->next = NULL;
9564+ di->entry++;
b7b72b66 9565+ read_unlock(&di->ls->ls_rsbtbl[i].lock);
9566+ return next_rsb(di); /* do the top half of this conditional */
9567+ }
b7b72b66 9568+ read_unlock(&di->ls->ls_rsbtbl[i].lock);
c1c6733f 9569+ }
b7b72b66 9570+ di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain);
9571+
9572+ return di;
9573+}
9574+
b7b72b66 9575+static void *s_start(struct seq_file *m, loff_t *pos)
9576+{
9577+ struct ls_dumpinfo *di;
b7b72b66 9578+ struct dlm_ls *ls;
9579+ int i;
9580+
9581+ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9582+ if (!ls)
9583+ return NULL;
9584+
9585+ di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9586+ if (!di)
9587+ return NULL;
9588+
9589+ if (*pos == 0)
9590+ seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9591+
9592+ di->entry = 0;
9593+ di->next = NULL;
9594+ di->ls = ls;
b7b72b66 9595+ di->de = NULL;
9596+
9597+ for (i = 0; i < *pos; i++)
9598+ if (next_rsb(di) == NULL) {
9598+ kfree(di);
9599+ return NULL;
9599+ }
9600+
9601+ return next_rsb(di);
9602+}
9603+
b7b72b66 9604+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
9605+{
9606+ struct ls_dumpinfo *di = p;
9607+
9608+ *pos += 1;
9609+
9610+ return next_rsb(di);
9611+}
9612+
9613+static int s_show(struct seq_file *m, void *p)
9614+{
9615+ struct ls_dumpinfo *di = p;
9616+ return print_resource(di->rsb, m);
9617+}
9618+
9619+static void s_stop(struct seq_file *m, void *p)
9620+{
9621+ kfree(p);
9622+}
9623+
9624+static struct seq_operations locks_info_op = {
9625+ .start = s_start,
9626+ .next = s_next,
9627+ .stop = s_stop,
9628+ .show = s_show,
9629+};
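For readers new to seq_file: a read(2) on the proc file is driven by the kernel as a start/show/next/.../stop cycle over these operations. Schematically (not literal kernel source):

	p = start(m, &pos);		/* allocates the ls_dumpinfo cursor */
	while (p != NULL) {
		show(m, p);		/* print_resource() for one rsb */
		p = next(m, p, &pos);	/* advance to the next rsb */
	}
	stop(m, p);			/* kfree() the cursor */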
9630+
9631+static char *print_lockmode(int mode)
9632+{
9633+ switch (mode) {
9634+ case DLM_LOCK_IV:
9635+ return "--";
9636+ case DLM_LOCK_NL:
9637+ return "NL";
9638+ case DLM_LOCK_CR:
9639+ return "CR";
9640+ case DLM_LOCK_CW:
9641+ return "CW";
9642+ case DLM_LOCK_PR:
9643+ return "PR";
9644+ case DLM_LOCK_PW:
9645+ return "PW";
9646+ case DLM_LOCK_EX:
9647+ return "EX";
9648+ default:
9649+ return "??";
9650+ }
9651+}
9652+
9653+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
9654+ struct dlm_rsb *res)
9655+{
9656+
9657+ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
9658+
9659+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9660+ || lkb->lkb_status == GDLM_LKSTS_WAITING)
9661+ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
9662+
9663+ if (lkb->lkb_range) {
9664+ /* This warns on Alpha. Tough. Only I see it */
9665+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9666+ || lkb->lkb_status == GDLM_LKSTS_GRANTED)
9667+ seq_printf(s, " %" PRIx64 "-%" PRIx64,
9668+ lkb->lkb_range[GR_RANGE_START],
9669+ lkb->lkb_range[GR_RANGE_END]);
9670+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9671+ || lkb->lkb_status == GDLM_LKSTS_WAITING)
9672+ seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
9673+ lkb->lkb_range[RQ_RANGE_START],
9674+ lkb->lkb_range[RQ_RANGE_END]);
9675+ }
9676+
9677+ if (lkb->lkb_nodeid) {
9678+ if (lkb->lkb_nodeid != res->res_nodeid)
9679+ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
9680+ lkb->lkb_remid);
9681+ else
9682+ seq_printf(s, " Master: %08x", lkb->lkb_remid);
9683+ }
9684+
9685+ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9686+ seq_printf(s, " LQ: %d,0x%x", lkb->lkb_lockqueue_state,
9687+ lkb->lkb_lockqueue_flags);
9688+
9689+ seq_printf(s, "\n");
9690+}
9691+
b7b72b66 9692+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
9693+{
9694+ int i;
9695+ struct list_head *locklist;
9696+
9697+ seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9698+ res->res_parent, res->res_length);
9699+ for (i = 0; i < res->res_length; i++) {
9700+ if (isprint(res->res_name[i]))
9701+ seq_printf(s, "%c", res->res_name[i]);
9702+ else
9703+ seq_printf(s, "%c", '.');
9704+ }
9705+ if (res->res_nodeid)
9706+ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
9707+ res->res_nodeid);
9708+ else
9709+ seq_printf(s, "\" \nMaster Copy\n");
9710+
9711+ /* Print the LVB: */
9712+ if (res->res_lvbptr) {
9713+ seq_printf(s, "LVB: ");
9714+ for (i = 0; i < DLM_LVB_LEN; i++) {
9715+ if (i == DLM_LVB_LEN / 2)
9716+ seq_printf(s, "\n ");
9717+ seq_printf(s, "%02x ",
9718+ (unsigned char) res->res_lvbptr[i]);
9719+ }
9720+ seq_printf(s, "\n");
9721+ }
9722+
9723+ /* Print the locks attached to this resource */
9724+ seq_printf(s, "Granted Queue\n");
9725+ list_for_each(locklist, &res->res_grantqueue) {
9726+ struct dlm_lkb *this_lkb =
9727+ list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9728+ print_lock(s, this_lkb, res);
9729+ }
9730+
9731+ seq_printf(s, "Conversion Queue\n");
9732+ list_for_each(locklist, &res->res_convertqueue) {
9733+ struct dlm_lkb *this_lkb =
9734+ list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9735+ print_lock(s, this_lkb, res);
9736+ }
9737+
9738+ seq_printf(s, "Waiting Queue\n");
9739+ list_for_each(locklist, &res->res_waitqueue) {
9740+ struct dlm_lkb *this_lkb =
9741+ list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9742+ print_lock(s, this_lkb, res);
9743+ }
9744+
9745+ return 0;
9746+}
9747+
9748+
9749+/*
9750+ * /proc/cluster/dlm_dir - dump resource directory
9751+ */
9752+
9753+static int print_de(struct dlm_direntry *de, struct seq_file *s)
9754+{
9755+ char strname[DLM_RESNAME_MAXLEN+1];
9756+
9757+ memset(strname, 0, DLM_RESNAME_MAXLEN+1);
9758+ memcpy(strname, de->name, de->length);
9759+
9760+ seq_printf(s, "%s %u\n", strname, de->master_nodeid);
9761+ return 0;
9762+}
9763+
9764+static int dir_open(struct inode *inode, struct file *file)
9765+{
9766+ return seq_open(file, &dir_info_op);
9767+}
9768+
9769+static ssize_t dir_write(struct file *file, const char *buf,
9770+ size_t count, loff_t *ppos)
9771+{
9772+ return locks_write(file, buf, count, ppos);
9773+}
9774+
9775+static struct file_operations dir_fops = {
9776+ .open = dir_open,
9777+ .write = dir_write,
9778+ .read = seq_read,
9779+ .llseek = seq_lseek,
9780+ .release = seq_release,
9781+ .owner = THIS_MODULE,
9782+};
9783+
9784+static struct ls_dumpinfo *next_de(struct ls_dumpinfo *di)
9785+{
9786+ int i;
9787+
9788+ if (!di->next) {
9789+ /* Find the next non-empty hash bucket */
9790+ for (i = di->entry; i < di->ls->ls_dirtbl_size; i++) {
9791+ read_lock(&di->ls->ls_dirtbl[i].lock);
9792+ if (!list_empty(&di->ls->ls_dirtbl[i].list)) {
9793+ di->next = di->ls->ls_dirtbl[i].list.next;
9794+ read_unlock(&di->ls->ls_dirtbl[i].lock);
9795+ break;
9796+ }
9797+ read_unlock(&di->ls->ls_dirtbl[i].lock);
9798+ }
9799+ di->entry = i;
9800+
9801+ if (di->entry >= di->ls->ls_dirtbl_size)
9802+ return NULL; /* End of hash list */
9803+ } else { /* Find the next entry in the list */
9804+ i = di->entry;
9805+ read_lock(&di->ls->ls_dirtbl[i].lock);
9806+ di->next = di->next->next;
9807+ if (di->next->next == di->ls->ls_dirtbl[i].list.next) {
9808+ /* End of list - move to next bucket */
9809+ di->next = NULL;
9810+ di->entry++;
9811+ read_unlock(&di->ls->ls_dirtbl[i].lock);
9812+ return next_de(di); /* do the top half of this conditional */
9813+ }
9814+ read_unlock(&di->ls->ls_dirtbl[i].lock);
9815+ }
9816+ di->de = list_entry(di->next, struct dlm_direntry, list);
9817+
9818+ return di;
9819+}
9820+
9821+static void *dir_start(struct seq_file *m, loff_t *pos)
9822+{
9823+ struct ls_dumpinfo *di;
9824+ struct dlm_ls *ls;
9825+ int i;
9826+
9827+ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9828+ if (!ls)
9829+ return NULL;
9830+
9831+ di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9832+ if (!di)
9833+ return NULL;
9834+
9835+ if (*pos == 0)
9836+ seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9837+
9838+ di->entry = 0;
9839+ di->next = NULL;
9840+ di->ls = ls;
9841+
9842+ for (i = 0; i < *pos; i++)
9843+ if (next_de(di) == NULL) {
9843+ kfree(di);
9844+ return NULL;
9844+ }
9845+
9846+ return next_de(di);
9847+}
9848+
9849+static void *dir_next(struct seq_file *m, void *p, loff_t *pos)
9850+{
9851+ struct ls_dumpinfo *di = p;
9852+
9853+ *pos += 1;
9854+
9855+ return next_de(di);
9856+}
9857+
9858+static int dir_show(struct seq_file *m, void *p)
9859+{
9860+ struct ls_dumpinfo *di = p;
9861+ return print_de(di->de, m);
9862+}
9863+
9864+static void dir_stop(struct seq_file *m, void *p)
9865+{
9866+ kfree(p);
9867+}
9868+
9869+static struct seq_operations dir_info_op = {
9870+ .start = dir_start,
9871+ .next = dir_next,
9872+ .stop = dir_stop,
9873+ .show = dir_show,
b7b72b66 9874+};
9875+#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9876+
b7b72b66 9877+void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...)
9878+{
9879+ va_list va;
9880+ int i, n, size, len;
9881+ char buf[MAX_DEBUG_MSG_LEN+1];
9882+
9883+ spin_lock(&debug_lock);
9884+
9885+ if (!debug_buf)
9886+ goto out;
9887+
9888+ size = MAX_DEBUG_MSG_LEN;
9889+ memset(buf, 0, size+1);
9890+
9891+ n = snprintf(buf, size, "%s ", ls->ls_name);
9892+ size -= n;
9893+
9894+ va_start(va, fmt);
9895+ vsnprintf(buf+n, size, fmt, va);
9896+ va_end(va);
9897+
9898+ len = strlen(buf);
9899+ if (len > MAX_DEBUG_MSG_LEN-1)
9900+ len = MAX_DEBUG_MSG_LEN-1;
9901+ buf[len] = '\n';
9902+ buf[len+1] = '\0';
9903+
9904+ for (i = 0; i < strlen(buf); i++) {
9905+ debug_buf[debug_point++] = buf[i];
9906+
9907+ if (debug_point == debug_size) {
9908+ debug_point = 0;
9909+ debug_wrap = 1;
9910+ }
9911+ }
9912+ out:
9913+ spin_unlock(&debug_lock);
9914+}
9915+
9916+void dlm_debug_dump(void)
9917+{
9918+ int i;
9919+
9920+ spin_lock(&debug_lock);
9921+ if (debug_wrap) {
9922+ for (i = debug_point; i < debug_size; i++)
9923+ printk("%c", debug_buf[i]);
9924+ }
9925+ for (i = 0; i < debug_point; i++)
9926+ printk("%c", debug_buf[i]);
9927+ spin_unlock(&debug_lock);
9928+}
9929+
9930+void dlm_debug_setup(int size)
9931+{
9932+ char *b = NULL;
9933+
9934+ if (size > PAGE_SIZE)
9935+ size = PAGE_SIZE;
9936+ if (size)
9937+ b = kmalloc(size, GFP_KERNEL);
9938+
9939+ spin_lock(&debug_lock);
9940+ kfree(debug_buf);
9941+ debug_buf = NULL;
9941+ debug_size = debug_point = debug_wrap = 0;
9942+ if (!size || !b)
9943+ goto out;
9944+ debug_size = size;
9945+ debug_point = 0;
9946+ debug_wrap = 0;
9947+ debug_buf = b;
9948+ memset(debug_buf, 0, debug_size);
9949+ out:
9950+ spin_unlock(&debug_lock);
9951+}
9952+
9953+static void dlm_debug_init(void)
9954+{
9955+ debug_buf = NULL;
9956+ debug_size = 0;
9957+ debug_point = 0;
9958+ debug_wrap = 0;
9959+ spin_lock_init(&debug_lock);
9960+
9961+ dlm_debug_setup(DLM_DEBUG_SIZE);
9962+}
9963+
9964+#ifdef CONFIG_PROC_FS
9965+int dlm_debug_info(char *b, char **start, off_t offset, int length)
9966+{
9967+ int i, n = 0;
9968+
9969+ spin_lock(&debug_lock);
9970+
9971+ if (debug_wrap) {
9972+ for (i = debug_point; i < debug_size; i++)
9973+ n += sprintf(b + n, "%c", debug_buf[i]);
9974+ }
9975+ for (i = 0; i < debug_point; i++)
9976+ n += sprintf(b + n, "%c", debug_buf[i]);
9977+
9978+ spin_unlock(&debug_lock);
9979+
9980+ return n;
9981+}
bb1d8b11 9982+#endif
c1c6733f 9983+
9984+#ifdef CONFIG_DLM_STATS
9985+struct dlm_statinfo dlm_stats;
9986+static struct proc_dir_entry *stats_proc_entry = NULL;
9987+static int dlm_stats_info(char *b, char **start, off_t offset, int length)
9988+{
9989+ int n=0;
9990+ int i;
9991+ long lq_locks = 0;
9992+ unsigned long lq_time = 0;
9993+
9994+ n += sprintf(b+n, "DLM stats (HZ=%d)\n\n", HZ);
9995+ n += sprintf(b+n, "Lock operations: %7d\n", dlm_stats.lockops);
9996+ n += sprintf(b+n, "Unlock operations: %7d\n", dlm_stats.unlockops);
9997+ n += sprintf(b+n, "Convert operations: %7d\n", dlm_stats.convertops);
9998+ n += sprintf(b+n, "Completion ASTs: %7d\n", dlm_stats.cast);
9999+ n += sprintf(b+n, "Blocking ASTs: %7d\n", dlm_stats.bast);
10000+ n += sprintf(b+n, "\n");
10001+ n += sprintf(b+n, "Lockqueue num waittime ave\n");
10002+ for (i=1; i<=4 ; i++) {
10003+ char *lq_reason="???";
10004+ switch (i){
10005+ case 1: lq_reason = "WAIT_RSB ";
10006+ break;
10007+ case 2: lq_reason = "WAIT_CONV ";
10008+ break;
10009+ case 3: lq_reason = "WAIT_GRANT ";
10010+ break;
10011+ case 4: lq_reason = "WAIT_UNLOCK";
10012+ break;
10013+ }
10014+ if (dlm_stats.lockqueue_locks[i])
10015+ n += sprintf(b+n, "%s %6lu %7lu %3lu\n",
10016+ lq_reason,
10017+ dlm_stats.lockqueue_locks[i],
10018+ dlm_stats.lockqueue_time[i],
10019+ dlm_stats.lockqueue_time[i]/
10020+ dlm_stats.lockqueue_locks[i]);
10021+
10022+ lq_locks += dlm_stats.lockqueue_locks[i];
10023+ lq_time += dlm_stats.lockqueue_time[i];
10024+ }
10025+ if (lq_locks)
10026+ n += sprintf(b+n, "Total %6lu %7lu %3lu\n",
10027+ lq_locks, lq_time, lq_time/lq_locks);
10028+ return n;
10029+}
10030+
10031+static int dlm_stats_clear(struct file *file, const char __user *buffer,
10032+ unsigned long count, void *data)
10033+{
10034+ memset(&dlm_stats, 0, sizeof(dlm_stats));
10035+ return count;
10036+}
bb1d8b11 10037+#endif /* CONFIG_DLM_STATS */
10038+
10039+void dlm_proc_init(void)
10040+{
10041+#ifdef CONFIG_PROC_FS
10042+ debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
10043+ NULL);
10044+ if (!debug_proc_entry)
10045+ return;
10046+
10047+ debug_proc_entry->get_info = &dlm_debug_info;
c1c6733f 10048+#endif
bb1d8b11 10049+
c783755a 10050+#ifdef CONFIG_DLM_STATS
10051+ stats_proc_entry = create_proc_entry("cluster/dlm_stats",
10052+ S_IRUSR | S_IWUSR, NULL);
10053+ if (!stats_proc_entry)
10054+ return;
10055+
10056+ stats_proc_entry->get_info = &dlm_stats_info;
10057+ stats_proc_entry->write_proc = &dlm_stats_clear;
10058+#endif
10059+
10060+ dlm_debug_init();
10061+
10062+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
10063+ locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
10064+ S_IFREG | 0400,
10065+ NULL, NULL, NULL);
10066+ if (!locks_proc_entry)
10067+ return;
10068+ locks_proc_entry->proc_fops = &locks_fops;
10069+
10070+ dir_proc_entry = create_proc_read_entry("cluster/dlm_dir",
10071+ S_IFREG | 0400,
10072+ NULL, NULL, NULL);
10073+ if (!dir_proc_entry)
10074+ return;
10075+ dir_proc_entry->proc_fops = &dir_fops;
10076+#endif
10077+}
10078+
10079+void dlm_proc_exit(void)
10080+{
10081+#ifdef CONFIG_PROC_FS
10082+ if (debug_proc_entry) {
10083+ remove_proc_entry("cluster/dlm_debug", NULL);
10084+ dlm_debug_setup(0);
10085+ }
c1c6733f 10086+#endif
bb1d8b11 10087+
10088+#ifdef CONFIG_DLM_STATS
10089+ if (stats_proc_entry)
10090+ remove_proc_entry("cluster/dlm_stats", NULL);
10091+#endif
10092+
10093+#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
10094+ if (locks_proc_entry)
10095+ remove_proc_entry("cluster/dlm_locks", NULL);
10096+ if (dir_proc_entry)
10097+ remove_proc_entry("cluster/dlm_dir", NULL);
10098+#endif
10099+}
10100diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
10101--- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
10102+++ linux-patched/cluster/dlm/queries.c 2004-11-03 11:31:56.000000000 +0800
10103@@ -0,0 +1,713 @@
10104+/******************************************************************************
10105+*******************************************************************************
10106+**
10107+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10108+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10109+**
10110+** This copyrighted material is made available to anyone wishing to use,
10111+** modify, copy, or redistribute it subject to the terms and conditions
10112+** of the GNU General Public License v.2.
10113+**
10114+*******************************************************************************
10115+******************************************************************************/
10116+
10117+/*
10118+ * queries.c
10119+ *
10120+ * This file provides the kernel query interface to the DLM.
10121+ *
10122+ */
10123+
10124+#define EXPORT_SYMTAB
10125+#include <linux/module.h>
10126+
10127+#include "dlm_internal.h"
b7b72b66 10128+#include "lockspace.h"
10129+#include "lockqueue.h"
10130+#include "locking.h"
10131+#include "lkb.h"
10132+#include "nodes.h"
10133+#include "dir.h"
10134+#include "ast.h"
10135+#include "memory.h"
10136+#include "lowcomms.h"
10137+#include "midcomms.h"
10138+#include "rsb.h"
10139+
10140+static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo);
10141+static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo);
c1c6733f
AM
10142+
10143+/*
10144+ * API entry point.
10145+ */
10146+int dlm_query(void *lockspace,
10147+ struct dlm_lksb *lksb,
10148+ int query,
10149+ struct dlm_queryinfo *qinfo,
10150+ void (ast_routine(void *)),
10151+ void *astarg)
10152+{
10153+ int status = -EINVAL;
10154+ struct dlm_lkb *target_lkb;
10155+ struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */
10156+ struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
c1c6733f 10157+
10158+ if (!ls)
10159+ return -EINVAL;
10160+ if (!qinfo)
10161+ goto out;
10162+ if (!ast_routine)
10163+ goto out;
10164+ if (!lksb)
10165+ goto out;
10166+
10167+ if (!qinfo->gqi_lockinfo)
10168+ qinfo->gqi_locksize = 0;
10169+
10170+ /* Find the lkid */
10171+ target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
10172+ if (!target_lkb)
10173+ goto out;
10174+
10175+ /* If the user wants a list of locks that are blocking or
10176+ not blocking this lock, then it must be waiting
10177+ for something
10178+ */
10179+ if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
10180+ (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
10181+ target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
b7b72b66 10182+ goto out;
10183+
10184+ /* We now allocate an LKB for our own use (so we can hang
10185+ * things like the AST routine and the lksb from it) */
10186+ lksb->sb_status = -EBUSY;
10187+ query_lkb = create_lkb(ls);
10188+ if (!query_lkb) {
10189+ status = -ENOMEM;
10190+ goto out;
10191+ }
10192+ query_lkb->lkb_astaddr = ast_routine;
10193+ query_lkb->lkb_astparam = (long)astarg;
10194+ query_lkb->lkb_resource = target_lkb->lkb_resource;
10195+ query_lkb->lkb_lksb = lksb;
10196+
10197+ /* Don't free the resource while we are querying it. This ref
10198+ * will be dropped when the LKB is freed */
10199+ hold_rsb(query_lkb->lkb_resource);
10200+
10201+ /* Fill in the stuff that's always local */
10202+ if (qinfo->gqi_resinfo) {
10203+ if (target_lkb->lkb_resource->res_nodeid)
10204+ qinfo->gqi_resinfo->rsi_masternode =
10205+ target_lkb->lkb_resource->res_nodeid;
10206+ else
10207+ qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
10208+ qinfo->gqi_resinfo->rsi_length =
10209+ target_lkb->lkb_resource->res_length;
10210+ memcpy(qinfo->gqi_resinfo->rsi_name,
10211+ target_lkb->lkb_resource->res_name,
10212+ qinfo->gqi_resinfo->rsi_length);
10213+ }
10214+
10215+ /* If the master is local (or the user doesn't want the overhead of a
10216+ * remote call) - fill in the details here */
10217+ if (target_lkb->lkb_resource->res_nodeid == 0 ||
10218+ (query & DLM_QUERY_LOCAL)) {
10219+
10220+ status = 0;
10221+ /* Resource info */
10222+ if (qinfo->gqi_resinfo) {
10223+ query_resource(target_lkb->lkb_resource,
10224+ qinfo->gqi_resinfo);
10225+ }
10226+
10227+ /* Lock lists */
10228+ if (qinfo->gqi_lockinfo) {
10229+ status = query_locks(query, target_lkb, qinfo);
10230+ }
10231+
10232+ query_lkb->lkb_retstatus = status;
b7b72b66 10233+ queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
10234+ wake_astd();
10235+
10236+ /* An AST will be delivered so we must return success here */
10237+ status = 0;
10238+ goto out;
10239+ }
10240+
10241+ /* Remote master */
10242+ if (target_lkb->lkb_resource->res_nodeid != 0)
10243+ {
b7b72b66 10244+ struct dlm_query_request *remquery;
10245+ struct writequeue_entry *e;
10246+
10247+ /* Clear this cos the receiving end adds to it with
10248+ each incoming packet */
10249+ qinfo->gqi_lockcount = 0;
10250+
10251+ /* Squirrel a pointer to the query info struct
10252+ somewhere illegal */
b7b72b66 10253+ query_lkb->lkb_request = (struct dlm_request *) qinfo;
10254+
10255+ e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
b7b72b66 10256+ sizeof(struct dlm_query_request),
10257+ ls->ls_allocation,
10258+ (char **) &remquery);
10259+ if (!e) {
10260+ status = -ENOBUFS;
10261+ goto out;
10262+ }
10263+
10264+ /* Build remote packet */
b7b72b66 10265+ memset(remquery, 0, sizeof(struct dlm_query_request));
10266+
10267+ remquery->rq_maxlocks = qinfo->gqi_locksize;
10268+ remquery->rq_query = query;
10269+ remquery->rq_mstlkid = target_lkb->lkb_remid;
10270+ if (qinfo->gqi_lockinfo)
10271+ remquery->rq_maxlocks = qinfo->gqi_locksize;
10272+
10273+ remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
10274+ remquery->rq_header.rh_flags = 0;
b7b72b66 10275+ remquery->rq_header.rh_length = sizeof(struct dlm_query_request);
10276+ remquery->rq_header.rh_lkid = query_lkb->lkb_id;
10277+ remquery->rq_header.rh_lockspace = ls->ls_global_id;
10278+
10279+ midcomms_send_buffer(&remquery->rq_header, e);
10280+ status = 0;
10281+ }
10282+
10283+ out:
b7b72b66 10284+ put_lockspace(ls);
10285+ return status;
10286+}
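A hedged usage sketch of the interface above. The lockspace handle and the granted lock behind lksb.sb_lkid are assumed to exist already (their creation is not shown), and the struct field names follow this file:

	static struct dlm_resinfo ri;
	static struct dlm_lockinfo li[16];
	static struct dlm_queryinfo qi = {
		.gqi_resinfo = &ri,
		.gqi_lockinfo = li,
		.gqi_locksize = 16,
	};

	static void query_done(void *astarg)
	{
		/* runs from the DLM ast daemon once qi has been filled in;
		   lksb.sb_status holds the final status */
	}

	/* ... in process context ... */
	error = dlm_query(lockspace, &lksb,
			  DLM_QUERY_LOCKS_ALL | DLM_QUERY_LOCAL,
			  &qi, query_done, NULL);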
10287+
10288+/* A range is "valid" (worth transmitting) only if it differs from the
10288+ default whole range of [0, 0xFFFFFFFFFFFFFFFF] */
10288+static inline int valid_range(struct dlm_range *r)
10289+{
10290+ if (r->ra_start != 0ULL ||
10291+ r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
10292+ return 1;
10293+ else
10294+ return 0;
10295+}
10296+
10297+static void put_int(int x, char *buf, int *offp)
10298+{
10299+ x = cpu_to_le32(x);
10300+ memcpy(buf + *offp, &x, sizeof(int));
10301+ *offp += sizeof(int);
10302+}
10303+
10304+static void put_int64(uint64_t x, char *buf, int *offp)
10305+{
10306+ x = cpu_to_le64(x);
10307+ memcpy(buf + *offp, &x, sizeof(uint64_t));
10308+ *offp += sizeof(uint64_t);
10309+}
10310+
10311+static int get_int(char *buf, int *offp)
10312+{
10313+ int value;
10314+ memcpy(&value, buf + *offp, sizeof(int));
10315+ *offp += sizeof(int);
10316+ return le32_to_cpu(value);
10317+}
10318+
10319+static uint64_t get_int64(char *buf, int *offp)
10320+{
10321+ uint64_t value;
10322+
10323+ memcpy(&value, buf + *offp, sizeof(uint64_t));
10324+ *offp += sizeof(uint64_t);
10325+ return le64_to_cpu(value);
10326+}
10327+
10328+/* Fixed wire size of one lock entry: 3 mode/state bytes, 5 le32 fields
10328+ (lkid, mstlkid, parent, node, ownpid) and 1 range-present flag byte;
10328+ the 4 le64 range words are accounted for separately when present */
10328+#define LOCK_LEN (sizeof(int)*5 + sizeof(uint8_t)*4)
10329+
10330+/* Called from recvd to get lock info for a remote node */
b7b72b66 10331+int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
c1c6733f 10332+{
10333+ struct dlm_query_request *query = (struct dlm_query_request *) msg;
10334+ struct dlm_query_reply *reply;
c1c6733f
AM
10335+ struct dlm_resinfo resinfo;
10336+ struct dlm_queryinfo qinfo;
10337+ struct writequeue_entry *e;
10338+ char *buf;
b7b72b66 10339+ struct dlm_lkb *lkb;
10340+ int status = 0;
10341+ int bufidx;
10342+ int finished = 0;
10343+ int cur_lock = 0;
10344+ int start_lock = 0;
10345+
10346+ lkb = find_lock_by_id(ls, query->rq_mstlkid);
10347+ if (!lkb) {
10348+ status = -EINVAL;
10349+ goto send_error;
10350+ }
10351+
10352+ qinfo.gqi_resinfo = &resinfo;
10353+ qinfo.gqi_locksize = query->rq_maxlocks;
10354+
10355+ /* Get the resource bits */
10356+ query_resource(lkb->lkb_resource, &resinfo);
10357+
10358+ /* Now get the locks if wanted */
10359+ if (query->rq_maxlocks) {
10360+ qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
10361+ GFP_KERNEL);
10362+ if (!qinfo.gqi_lockinfo) {
10363+ status = -ENOMEM;
10364+ goto send_error;
10365+ }
10366+
10367+ status = query_locks(query->rq_query, lkb, &qinfo);
10368+ if (status && status != -E2BIG) {
10369+ kfree(qinfo.gqi_lockinfo);
10370+ goto send_error;
10371+ }
10372+ }
10373+ else {
10374+ qinfo.gqi_lockinfo = NULL;
10375+ qinfo.gqi_lockcount = 0;
10376+ }
10377+
10378+ /* Send as many blocks as needed for all the locks */
10379+ do {
10380+ int i;
b7b72b66 10381+ int msg_len = sizeof(struct dlm_query_reply);
10382+ int last_msg_len = msg_len; /* keeps compiler quiet */
10383+ int last_lock;
10384+
10385+ /* First work out how many locks we can fit into a block */
10386+ for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
10387+
10388+ last_msg_len = msg_len;
10389+
10390+ msg_len += LOCK_LEN;
10391+ if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
10392+ valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
10393+
10394+ msg_len += sizeof(uint64_t) * 4;
10395+ }
10396+ }
10397+
10398+ /* There must be a neater way of doing this... */
10399+ if (msg_len > PAGE_SIZE) {
10400+ last_lock = i-1;
10401+ msg_len = last_msg_len;
10402+ }
10403+ else {
10404+ last_lock = i;
10405+ }
10406+
10407+ e = lowcomms_get_buffer(nodeid,
10408+ msg_len,
10409+ ls->ls_allocation,
10410+ (char **) &reply);
10411+ if (!e) {
10412+ kfree(qinfo.gqi_lockinfo);
10413+ status = -ENOBUFS;
10414+ goto out;
10415+ }
10416+
10417+ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
10418+ reply->rq_header.rh_length = msg_len;
10419+ reply->rq_header.rh_lkid = msg->rh_lkid;
10420+ reply->rq_header.rh_lockspace = msg->rh_lockspace;
10421+
10422+ reply->rq_status = status;
10423+ reply->rq_startlock = cur_lock;
10424+ reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
10425+ reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
10426+ reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
10427+ memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
10428+
10429+ buf = (char *)reply;
b7b72b66 10430+ bufidx = sizeof(struct dlm_query_reply);
10431+
10432+ for (; cur_lock < last_lock; cur_lock++) {
10433+
10434+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
10435+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
10436+ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
10437+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
10438+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
10439+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
10440+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
b7b72b66 10441+ put_int(qinfo.gqi_lockinfo[cur_lock].lki_ownpid, buf, &bufidx);
10442+
10443+ if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
10444+ valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
10445+
10446+ buf[bufidx++] = 1;
10447+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
10448+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
10449+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
10450+ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
10451+ }
10452+ else {
10453+ buf[bufidx++] = 0;
10454+ }
10455+ }
10456+
10457+ if (cur_lock == qinfo.gqi_lockcount) {
10458+ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
10459+ finished = 1;
10460+ }
10461+ else {
10462+ reply->rq_header.rh_flags = 0;
10463+ }
10464+
10465+ reply->rq_numlocks = cur_lock - start_lock;
10466+ start_lock = cur_lock;
10467+
10468+ midcomms_send_buffer(&reply->rq_header, e);
10469+ } while (!finished);
10470+
10471+ kfree(qinfo.gqi_lockinfo);
10472+ out:
10473+ return status;
10474+
10475+ send_error:
10476+ e = lowcomms_get_buffer(nodeid,
b7b72b66 10477+ sizeof(struct dlm_query_reply),
10478+ ls->ls_allocation,
10479+ (char **) &reply);
10480+ if (!e) {
10481+ status = -ENOBUFS;
10482+ goto out;
10483+ }
10484+ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
10485+ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
10486+ reply->rq_header.rh_length = sizeof(struct dlm_query_reply);
10487+ reply->rq_header.rh_lkid = msg->rh_lkid;
10488+ reply->rq_header.rh_lockspace = msg->rh_lockspace;
10489+ reply->rq_status = status;
10490+ reply->rq_numlocks = 0;
10491+ reply->rq_startlock = 0;
10492+ reply->rq_grantcount = 0;
10493+ reply->rq_convcount = 0;
10494+ reply->rq_waitcount = 0;
10495+
10496+ midcomms_send_buffer(&reply->rq_header, e);
10497+
10498+ return status;
10499+}
10500+
10501+/* Reply to a remote query */
b7b72b66 10502+int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
c1c6733f 10503+{
b7b72b66 10504+ struct dlm_lkb *query_lkb;
c1c6733f 10505+ struct dlm_queryinfo *qinfo;
b7b72b66 10506+ struct dlm_query_reply *reply;
10507+ char *buf;
10508+ int i;
10509+ int bufidx;
10510+
10511+ query_lkb = find_lock_by_id(ls, msg->rh_lkid);
10512+ if (!query_lkb)
10513+ return -EINVAL;
10514+
10515+ qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
b7b72b66 10516+ reply = (struct dlm_query_reply *) msg;
10517+
10518+ /* Copy the easy bits first */
10519+ qinfo->gqi_lockcount += reply->rq_numlocks;
10520+ if (qinfo->gqi_resinfo) {
10521+ qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
10522+ qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
10523+ qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
10524+ memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
10525+ DLM_LVB_LEN);
10526+ }
10527+
10528+ /* Now unpack the locks */
b7b72b66 10529+ bufidx = sizeof(struct dlm_query_reply);
10530+ buf = (char *) msg;
10531+
b7b72b66 10532+ DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
10533+ printk("start = %d, num + %d. Max= %d\n",
10534+ reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
10535+
10536+ for (i = reply->rq_startlock;
10537+ i < reply->rq_startlock + reply->rq_numlocks; i++) {
10538+ qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
10539+ qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
10540+ qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
10541+ qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
10542+ qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
10543+ qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
10544+ qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
b7b72b66 10545+ qinfo->gqi_lockinfo[i].lki_ownpid = get_int(buf, &bufidx);
10546+ if (buf[bufidx++]) {
10547+ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
10548+ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
10549+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
10550+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
10551+ }
10552+ else {
10553+ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
10554+ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
10555+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
10556+ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
10557+ }
10558+ }
10559+
10560+ /* If this was the last block then now tell the user */
10561+ if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
10562+ query_lkb->lkb_retstatus = reply->rq_status;
b7b72b66 10563+ queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
10564+ wake_astd();
10565+ }
10566+
10567+ return 0;
10568+}
10569+
10570+/* Aggregate resource information */
b7b72b66 10571+static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo)
10572+{
10573+ struct list_head *tmp;
10574+
10575+ if (rsb->res_lvbptr)
10576+ memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
10577+
bb1d8b11 10578+ down_read(&rsb->res_lock);
10579+ resinfo->rsi_grantcount = 0;
10580+ list_for_each(tmp, &rsb->res_grantqueue) {
10581+ resinfo->rsi_grantcount++;
10582+ }
10583+
10584+ resinfo->rsi_waitcount = 0;
10585+ list_for_each(tmp, &rsb->res_waitqueue) {
10586+ resinfo->rsi_waitcount++;
10587+ }
10588+
10589+ resinfo->rsi_convcount = 0;
10590+ list_for_each(tmp, &rsb->res_convertqueue) {
10591+ resinfo->rsi_convcount++;
10592+ }
bb1d8b11 10593+ up_read(&rsb->res_lock);
10594+
10595+ return 0;
10596+}
10597+
b7b72b66 10598+static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10599+{
10600+ int entry;
10601+
10602+ /* Don't fill it in if the buffer is full */
10603+ if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
10604+ return -E2BIG;
10605+
10606+ /* gqi_lockcount contains the number of locks we have returned */
10607+ entry = qinfo->gqi_lockcount++;
10608+
10609+ /* Fun with master copies: report the owning node's lkid as lki_lkid and the master copy's id as lki_mstlkid */
10610+ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
10611+ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
10612+ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
10613+ }
10614+ else {
10615+ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
10616+ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
10617+ }
10618+
10619+ /* Also make sure we always have a valid nodeid in there, the
10620+ calling end may not know which node "0" is */
10621+ if (lkb->lkb_nodeid)
10622+ qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
10623+ else
10624+ qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
10625+
10626+ if (lkb->lkb_parent)
10627+ qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
10628+ else
10629+ qinfo->gqi_lockinfo[entry].lki_parent = 0;
10630+
10631+ qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
10632+ qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
10633+ qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
b7b72b66 10634+ qinfo->gqi_lockinfo[entry].lki_ownpid = lkb->lkb_ownpid;
10635+
10636+ if (lkb->lkb_range) {
10637+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
10638+ lkb->lkb_range[GR_RANGE_START];
10639+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
10640+ lkb->lkb_range[GR_RANGE_END];
10641+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
10642+ lkb->lkb_range[RQ_RANGE_START];
10643+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
10644+ lkb->lkb_range[RQ_RANGE_END];
10645+ } else {
10646+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
10647+ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
10648+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
10649+ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
10650+ }
10651+ return 0;
10652+}
10653+
10654+static int query_lkb_queue(struct dlm_rsb *rsb,
10655+ struct list_head *queue, int query,
10656+ struct dlm_queryinfo *qinfo)
10657+{
10658+ struct list_head *tmp;
10659+ int status = 0;
10660+ int mode = query & DLM_QUERY_MODE_MASK;
10661+
bb1d8b11 10662+ down_read(&rsb->res_lock);
c1c6733f 10663+ list_for_each(tmp, queue) {
b7b72b66 10664+ struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10665+ int lkmode;
10666+
10667+ if (query & DLM_QUERY_RQMODE)
10668+ lkmode = lkb->lkb_rqmode;
10669+ else
10670+ lkmode = lkb->lkb_grmode;
10671+
10672+ /* Add the LKB info to the list if it matches the criteria in
10673+ * the query bitmap */
10674+ switch (query & DLM_QUERY_MASK) {
10675+ case DLM_QUERY_LOCKS_ALL:
10676+ status = add_lock(lkb, qinfo);
10677+ break;
10678+
10679+ case DLM_QUERY_LOCKS_HIGHER:
10680+ if (lkmode > mode)
10681+ status = add_lock(lkb, qinfo);
10682+ break;
10683+
10684+ case DLM_QUERY_LOCKS_EQUAL:
10685+ if (lkmode == mode)
10686+ status = add_lock(lkb, qinfo);
10687+ break;
10688+
10689+ case DLM_QUERY_LOCKS_LOWER:
10690+ if (lkmode < mode)
10691+ status = add_lock(lkb, qinfo);
10692+ break;
10693+ case DLM_QUERY_LOCKS_ORPHAN:
10694+ if (lkb->lkb_flags & GDLM_LKFLG_ORPHAN)
10695+ status = add_lock(lkb, qinfo);
10696+ break;
10697+ }
10698+ }
bb1d8b11 10699+ up_read(&rsb->res_lock);
10700+ return status;
10701+}
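/* A hedged example of composing the query bitmap dispatched above: one lock
 * mode, one DLM_QUERY_LOCKS_* selector and optional queue/RQMODE bits are
 * OR'd into a single int (flag names from this patch; the combination is
 * illustrative only):
 *
 *	int query = DLM_QUERY_QUEUE_GRANT | DLM_QUERY_LOCKS_HIGHER |
 *		    DLM_QUERY_RQMODE | DLM_LOCK_PR;
 */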
10702+
10703+/*
10704+ * Return 1 if the locks' ranges overlap
10705+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
10706+ */
b7b72b66 10707+static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
10708+{
10709+ if (!lkb1->lkb_range || !lkb2->lkb_range)
10710+ return 1;
10711+
10712+ if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
10713+ lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
10714+ return 0;
10715+
10716+ return 1;
10717+}
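/* Worked example (hypothetical values): a requested range [0,100] tested
 * against a granted range [50,150] overlaps (returns 1), while [0,50] against
 * [50,150] returns 0, since the <=/>= comparisons above treat a range's end
 * as exclusive for overlap purposes. */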
10718+extern const int __dlm_compat_matrix[8][8];
10719+
10720+
b7b72b66 10721+static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
10722+{
10723+ struct list_head *tmp;
10724+ int status = 0;
10725+
bb1d8b11 10726+ down_read(&qlkb->lkb_resource->res_lock);
c1c6733f 10727+ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
b7b72b66 10728+ struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10729+
10730+ if (ranges_overlap(lkb, qlkb) &&
10731+ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
10732+ status = add_lock(lkb, qinfo);
10733+ }
bb1d8b11 10734+ up_read(&qlkb->lkb_resource->res_lock);
10735+
10736+ return status;
10737+}
10738+
b7b72b66 10739+static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
10740+{
10741+ struct list_head *tmp;
10742+ int status = 0;
10743+
bb1d8b11 10744+ down_read(&qlkb->lkb_resource->res_lock);
c1c6733f 10745+ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
b7b72b66 10746+ struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10747+
10748+ if (!(ranges_overlap(lkb, qlkb) &&
10749+ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
10750+ status = add_lock(lkb, qinfo);
10751+ }
bb1d8b11 10752+ up_read(&qlkb->lkb_resource->res_lock);
10753+
10754+ return status;
10755+}
10756+
10757+/* Gather a list of appropriate locks */
b7b72b66 10758+static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10759+{
10760+ int status = 0;
10761+
10762+
10763+ /* Mask in the actual granted/requested mode of the lock if LOCK_THIS
10764+ * was requested as the mode
10765+ */
10766+ if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
10767+ query &= ~DLM_QUERY_MODE_MASK;
10768+ if (query & DLM_QUERY_RQMODE)
10769+ query |= lkb->lkb_rqmode;
10770+ else
10771+ query |= lkb->lkb_grmode;
10772+ }
10773+
10774+ qinfo->gqi_lockcount = 0;
10775+
10776+ /* BLOCKING/NOTBLOCK only look at the granted queue */
10777+ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
10778+ return get_blocking_locks(lkb, qinfo);
10779+
10780+ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
10781+ return get_nonblocking_locks(lkb, qinfo);
10782+
10783+ /* Do the lock queues that were requested */
10784+ if (query & DLM_QUERY_QUEUE_GRANT) {
10785+ status = query_lkb_queue(lkb->lkb_resource,
10786+ &lkb->lkb_resource->res_grantqueue,
10787+ query, qinfo);
10788+ }
10789+
10790+ if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
10791+ status = query_lkb_queue(lkb->lkb_resource,
10792+ &lkb->lkb_resource->res_convertqueue,
10793+ query, qinfo);
10794+ }
10795+
10796+ if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
10797+ status = query_lkb_queue(lkb->lkb_resource,
10798+ &lkb->lkb_resource->res_waitqueue,
10799+ query, qinfo);
10800+ }
10801+
10802+
10803+ return status;
10804+}
10805+
10806+EXPORT_SYMBOL(dlm_query);
10807+/*
10808+ * Overrides for Emacs so that we follow Linus's tabbing style.
10809+ * Emacs will notice this stuff at the end of the file and automatically
10810+ * adjust the settings for this buffer only. This must remain at the end
10811+ * of the file.
10812+ * ---------------------------------------------------------------------------
10813+ * Local variables:
10814+ * c-file-style: "linux"
10815+ * End:
10816+ */
10817diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
10818--- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 10819+++ linux-patched/cluster/dlm/queries.h 2004-11-03 11:31:56.000000000 +0800
10820@@ -0,0 +1,20 @@
10821+/******************************************************************************
10822+*******************************************************************************
10823+**
10824+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10825+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10826+**
10827+** This copyrighted material is made available to anyone wishing to use,
10828+** modify, copy, or redistribute it subject to the terms and conditions
10829+** of the GNU General Public License v.2.
10830+**
10831+*******************************************************************************
10832+******************************************************************************/
10833+
10834+#ifndef __QUERIES_DOT_H__
10835+#define __QUERIES_DOT_H__
10836+
10837+extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10838+extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10839+
10840+#endif /* __QUERIES_DOT_H__ */
10841diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
10842--- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 10843+++ linux-patched/cluster/dlm/rebuild.c 2004-11-03 11:31:56.000000000 +0800
b7b72b66 10844@@ -0,0 +1,1280 @@
10845+/******************************************************************************
10846+*******************************************************************************
10847+**
10848+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10849+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
b7b72b66 10850+**
10851+** This copyrighted material is made available to anyone wishing to use,
10852+** modify, copy, or redistribute it subject to the terms and conditions
10853+** of the GNU General Public License v.2.
10854+**
10855+*******************************************************************************
10856+******************************************************************************/
10857+
b7b72b66 10858+/*
10859+ * Rebuild RSB's on new masters. Functions for transferring locks and
10860+ * subresources to new RSB masters during recovery.
10861+ */
10862+
10863+#include "dlm_internal.h"
10864+#include "reccomms.h"
10865+#include "lkb.h"
10866+#include "rsb.h"
10867+#include "nodes.h"
10868+#include "config.h"
10869+#include "memory.h"
10870+#include "recover.h"
10871+
10872+
10873+/* Types of entity serialised in remastering messages */
10874+#define REMASTER_ROOTRSB 1
10875+#define REMASTER_RSB 2
10876+#define REMASTER_LKB 3
10877+
10878+struct rcom_fill {
10879+ char * outbuf; /* Beginning of data */
10880+ int offset; /* Current offset into outbuf */
10881+ int maxlen; /* Max value of offset */
10882+ int remasterid;
10883+ int count;
10884+ struct dlm_rsb * rsb;
10885+ struct dlm_rsb * subrsb;
10886+ struct dlm_lkb * lkb;
10887+ struct list_head * lkbqueue;
10888+ char more;
10889+};
10890+typedef struct rcom_fill rcom_fill_t;
10891+
10892+
10893+struct rebuild_node {
10894+ struct list_head list;
10895+ int nodeid;
b7b72b66 10896+ struct dlm_rsb * rootrsb;
10897+};
10898+typedef struct rebuild_node rebuild_node_t;
10899+
10900+
b7b72b66 10901+/*
10902+ * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10903+ * master. The rsb will be "done" with recovery when the new master has
10904+ * replied with all the new remote lockid's for this rsb's lkb's.
10905+ */
10906+
b7b72b66 10907+void expect_new_lkids(struct dlm_rsb *rsb)
10908+{
10909+ rsb->res_newlkid_expect = 0;
10910+ recover_list_add(rsb);
10911+}
10912+
b7b72b66 10913+/*
10914+ * This function is called on root rsb or subrsb when another lkb is being sent
10915+ * to the new master for which we expect to receive a corresponding remote lkid
10916+ */
10917+
b7b72b66 10918+void need_new_lkid(struct dlm_rsb *rsb)
c1c6733f 10919+{
b7b72b66 10920+ struct dlm_rsb *root = rsb;
10921+
10922+ if (rsb->res_parent)
10923+ root = rsb->res_root;
10924+
10925+ if (!root->res_newlkid_expect)
10926+ recover_list_add(root);
10927+ else
b7b72b66 10928+ DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10929+
10930+ root->res_newlkid_expect++;
10931+}
10932+
b7b72b66 10933+/*
10934+ * This function is called for each lkb for which a new remote lkid is
10935+ * received. Decrement the expected number of remote lkids expected for the
10936+ * root rsb.
10937+ */
10938+
b7b72b66 10939+void have_new_lkid(struct dlm_lkb *lkb)
c1c6733f 10940+{
b7b72b66 10941+ struct dlm_rsb *root = lkb->lkb_resource;
10942+
10943+ if (root->res_parent)
10944+ root = root->res_root;
10945+
10946+ down_write(&root->res_lock);
10947+
10948+ DLM_ASSERT(root->res_newlkid_expect,
10949+ printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10950+
10951+ root->res_newlkid_expect--;
10952+
10953+ if (!root->res_newlkid_expect) {
10954+ clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10955+ recover_list_del(root);
10956+ }
10957+ up_write(&root->res_lock);
10958+}
10959+
b7b72b66 10960+/*
10961+ * Return the rebuild struct for a node - will create an entry on the rootrsb
10962+ * list if necessary.
10963+ *
b7b72b66 10964+ * Currently no locking is needed here as it all happens in the dlm_recvd
10965+ * thread
10966+ */
10967+
b7b72b66 10968+static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid)
10969+{
10970+ rebuild_node_t *node = NULL;
10971+
10972+ list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10973+ if (node->nodeid == nodeid)
10974+ return node;
10975+ }
10976+
10977+ /* Not found, add one */
10978+ node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10979+ if (!node)
10980+ return NULL;
10981+
10982+ node->nodeid = nodeid;
10983+ node->rootrsb = NULL;
10984+ list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10985+
10986+ return node;
10987+}
10988+
b7b72b66 10989+/*
10990+ * Tidy up after a rebuild run. Called when all recovery has finished
10991+ */
10992+
b7b72b66 10993+void rebuild_freemem(struct dlm_ls *ls)
10994+{
10995+ rebuild_node_t *node = NULL, *s;
10996+
10997+ list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10998+ list_del(&node->list);
10999+ kfree(node);
11000+ }
11001+}
11002+
11003+static void put_int(int x, char *buf, int *offp)
11004+{
11005+ x = cpu_to_le32(x);
11006+ memcpy(buf + *offp, &x, sizeof(int));
11007+ *offp += sizeof(int);
11008+}
11009+
11010+static void put_int64(uint64_t x, char *buf, int *offp)
11011+{
11012+ x = cpu_to_le64(x);
11013+ memcpy(buf + *offp, &x, sizeof(uint64_t));
11014+ *offp += sizeof(uint64_t);
11015+}
11016+
11017+static void put_bytes(char *x, int len, char *buf, int *offp)
11018+{
11019+ put_int(len, buf, offp);
11020+ memcpy(buf + *offp, x, len);
11021+ *offp += len;
11022+}
11023+
11024+static void put_char(char x, char *buf, int *offp)
11025+{
11026+ buf[*offp] = x;
11027+ *offp += 1;
11028+}
11029+
11030+static int get_int(char *buf, int *offp)
11031+{
11032+ int value;
11033+ memcpy(&value, buf + *offp, sizeof(int));
11034+ *offp += sizeof(int);
11035+ return le32_to_cpu(value);
11036+}
11037+
11038+static uint64_t get_int64(char *buf, int *offp)
11039+{
11040+ uint64_t value;
11041+
11042+ memcpy(&value, buf + *offp, sizeof(uint64_t));
11043+ *offp += sizeof(uint64_t);
11044+ return le64_to_cpu(value);
11045+}
11046+
11047+static char get_char(char *buf, int *offp)
11048+{
11049+ char x = buf[*offp];
11050+
11051+ *offp += 1;
11052+ return x;
11053+}
11054+
11055+static void get_bytes(char *bytes, int *len, char *buf, int *offp)
11056+{
11057+ *len = get_int(buf, offp);
11058+ memcpy(bytes, buf + *offp, *len);
11059+ *offp += *len;
11060+}
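/* A minimal round-trip sketch of the helpers above (values hypothetical);
 * every put_* advances the write offset that the matching get_* consumes. */
static void serialise_helpers_sketch(void)
{
	char buf[64], name[16];
	int woff = 0, roff = 0, len;

	put_int(42, buf, &woff);
	put_int64(0x1122334455667788ULL, buf, &woff);
	put_bytes((char *) "res1", 4, buf, &woff);

	get_int(buf, &roff);			/* 42 */
	get_int64(buf, &roff);			/* 0x1122334455667788 */
	get_bytes(name, &len, buf, &roff);	/* len == 4, name holds "res1" */
}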
11061+
b7b72b66 11062+static int lkb_length(struct dlm_lkb *lkb)
11063+{
11064+ int len = 0;
11065+
11066+ len += sizeof(int); /* lkb_id */
11067+ len += sizeof(int); /* lkb_resource->res_remasterid */
11068+ len += sizeof(int); /* lkb_flags */
11069+ len += sizeof(int); /* lkb_status */
11070+ len += sizeof(char); /* lkb_rqmode */
11071+ len += sizeof(char); /* lkb_grmode */
11072+ len += sizeof(int); /* lkb_childcnt */
11073+ len += sizeof(int); /* lkb_parent->lkb_id */
11074+ len += sizeof(int); /* lkb_bastaddr */
b7b72b66 11075+ len += sizeof(int); /* lkb_ownpid */
11076+
11077+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11078+ len += sizeof(int); /* number of lvb bytes */
11079+ len += DLM_LVB_LEN;
11080+ }
11081+
11082+ if (lkb->lkb_range) {
11083+ len += sizeof(uint64_t);
11084+ len += sizeof(uint64_t);
11085+ if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
11086+ len += sizeof(uint64_t);
11087+ len += sizeof(uint64_t);
11088+ }
11089+ }
11090+
11091+ return len;
11092+}
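/* Hypothetical sizing helper mirroring the bound test used by pack_one_lkb()
 * below: one type byte plus lkb_length(lkb) must fit in the space left. */
static inline int lkb_fits_sketch(struct dlm_lkb *lkb, rcom_fill_t *fill)
{
	return fill->offset + 1 + lkb_length(lkb) <= fill->maxlen;
}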
11093+
b7b72b66 11094+/*
11095+ * It's up to the caller to be sure there's enough space in the buffer.
11096+ */
11097+
b7b72b66 11098+static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp)
11099+{
11100+ int flags;
11101+
11102+ /* Need to tell the remote end if we have a range */
11103+ flags = lkb->lkb_flags;
11104+ if (lkb->lkb_range)
11105+ flags |= GDLM_LKFLG_RANGE;
11106+
b7b72b66 11107+ /*
11108+ * See lkb_length()
11109+ * Total: 34 (no lvb) or 70 (with lvb) bytes
11110+ */
11111+
11112+ put_int(lkb->lkb_id, buf, offp);
11113+ put_int(lkb->lkb_resource->res_remasterid, buf, offp);
11114+ put_int(flags, buf, offp);
11115+ put_int(lkb->lkb_status, buf, offp);
11116+ put_char(lkb->lkb_rqmode, buf, offp);
11117+ put_char(lkb->lkb_grmode, buf, offp);
11118+ put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
11119+
11120+ if (lkb->lkb_parent)
11121+ put_int(lkb->lkb_parent->lkb_id, buf, offp);
11122+ else
11123+ put_int(0, buf, offp);
11124+
11125+ if (lkb->lkb_bastaddr)
11126+ put_int(1, buf, offp);
11127+ else
11128+ put_int(0, buf, offp);
b7b72b66 11129+ put_int(lkb->lkb_ownpid, buf, offp);
11130+
11131+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
b7b72b66 11132+ DLM_ASSERT(lkb->lkb_lvbptr,);
11133+ put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
11134+ }
11135+
11136+ /* Only send the range we actually need */
11137+ if (lkb->lkb_range) {
11138+ switch (lkb->lkb_status) {
11139+ case GDLM_LKSTS_CONVERT:
11140+ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
11141+ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
11142+ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
11143+ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
11144+ break;
11145+ case GDLM_LKSTS_WAITING:
11146+ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
11147+ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
11148+ break;
11149+ case GDLM_LKSTS_GRANTED:
11150+ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
11151+ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
11152+ break;
11153+ default:
b7b72b66 11154+ DLM_ASSERT(0,);
11155+ }
11156+ }
11157+}
11158+
b7b72b66 11159+static int rsb_length(struct dlm_rsb *rsb)
11160+{
11161+ int len = 0;
11162+
11163+ len += sizeof(int); /* number of res_name bytes */
11164+ len += rsb->res_length; /* res_name */
11165+ len += sizeof(int); /* res_remasterid */
11166+ len += sizeof(int); /* res_parent->res_remasterid */
11167+
11168+ return len;
11169+}
11170+
b7b72b66 11171+static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb)
11172+{
11173+ struct list_head *tmp;
b7b72b66 11174+ struct dlm_rsb *r;
11175+
11176+ tmp = subrsb->res_subreslist.next;
b7b72b66 11177+ r = list_entry(tmp, struct dlm_rsb, res_subreslist);
11178+
11179+ return r;
11180+}
11181+
b7b72b66 11182+static inline int last_in_list(struct dlm_rsb *r, struct list_head *head)
c1c6733f 11183+{
11184+ struct dlm_rsb *last;
11185+ last = list_entry(head->prev, struct dlm_rsb, res_subreslist);
11186+ if (last == r)
11187+ return 1;
11188+ return 0;
11189+}
11190+
11191+static int lkbs_to_remaster_list(struct list_head *head)
11192+{
11193+ struct dlm_lkb *lkb;
11194+
11195+ list_for_each_entry(lkb, head, lkb_statequeue) {
11196+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
11197+ continue;
11198+ return TRUE;
11199+ }
11200+ return FALSE;
11201+}
11202+
11203+/*
c1c6733f 11204+ * Used to decide if an rsb should be rebuilt on a new master. An rsb only
11205+ * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's are not
11206+ * rebuilt.
11207+ */
11208+
b7b72b66 11209+static int lkbs_to_remaster(struct dlm_rsb *r)
c1c6733f 11210+{
b7b72b66 11211+ struct dlm_rsb *sub;
c1c6733f 11212+
b7b72b66 11213+ if (lkbs_to_remaster_list(&r->res_grantqueue))
c1c6733f 11214+ return TRUE;
11215+ if (lkbs_to_remaster_list(&r->res_convertqueue))
11216+ return TRUE;
11217+ if (lkbs_to_remaster_list(&r->res_waitqueue))
c1c6733f 11218+ return TRUE;
11219+
11220+ list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
b7b72b66 11221+ if (lkbs_to_remaster_list(&sub->res_grantqueue))
c1c6733f 11222+ return TRUE;
11223+ if (lkbs_to_remaster_list(&sub->res_convertqueue))
11224+ return TRUE;
11225+ if (lkbs_to_remaster_list(&sub->res_waitqueue))
c1c6733f 11226+ return TRUE;
11227+ }
11228+
11229+ return FALSE;
11230+}
11231+
b7b72b66 11232+static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp)
c1c6733f 11233+{
b7b72b66 11234+ /*
11235+ * See rsb_length()
11236+ * Total: 12 + res_length bytes (4 + name + 4 + 4), e.g. 36 for a 24-byte name
11237+ */
11238+
11239+ put_bytes(rsb->res_name, rsb->res_length, buf, offp);
11240+ put_int(rsb->res_remasterid, buf, offp);
11241+
11242+ if (rsb->res_parent)
11243+ put_int(rsb->res_parent->res_remasterid, buf, offp);
11244+ else
11245+ put_int(0, buf, offp);
11246+
b7b72b66 11247+ DLM_ASSERT(!rsb->res_lvbptr,);
11248+}
11249+
b7b72b66 11250+/*
11251+ * Flatten an LKB into a buffer for sending to the new RSB master. As a
11252+ * side-effect the nodeid of the lock is set to the nodeid of the new RSB
11253+ * master.
11254+ */
11255+
11256+static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb,
11257+ rcom_fill_t *fill)
11258+{
11259+ if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
11260+ goto nospace;
11261+
11262+ lkb->lkb_nodeid = r->res_nodeid;
11263+
11264+ put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
11265+ serialise_lkb(lkb, fill->outbuf, &fill->offset);
11266+
11267+ fill->count++;
11268+ need_new_lkid(r);
11269+ return 0;
11270+
11271+ nospace:
11272+ return -ENOSPC;
11273+}
11274+
b7b72b66 11275+/*
11276+ * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
11277+ */
11278+
b7b72b66 11279+static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue,
11280+ rcom_fill_t *fill)
11281+{
b7b72b66 11282+ struct dlm_lkb *lkb;
11283+ int error;
11284+
11285+ list_for_each_entry(lkb, queue, lkb_statequeue) {
11286+ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
11287+ continue;
11288+
11289+ error = pack_one_lkb(r, lkb, fill);
11290+ if (error)
11291+ goto nospace;
11292+ }
11293+
11294+ return 0;
11295+
11296+ nospace:
11297+ fill->lkb = lkb;
11298+ fill->lkbqueue = queue;
11299+
11300+ return error;
11301+}
11302+
b7b72b66 11303+static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill)
11304+{
11305+ int error;
11306+
11307+ error = pack_lkb_queue(r, &r->res_grantqueue, fill);
11308+ if (error)
11309+ goto nospace;
11310+
11311+ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
11312+ if (error)
11313+ goto nospace;
11314+
11315+ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
11316+
11317+ nospace:
11318+ return error;
11319+}
11320+
b7b72b66 11321+/*
11322+ * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
11323+ * queue and full lkb queues.
11324+ */
11325+
b7b72b66 11326+static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill)
11327+{
11328+ struct list_head *tmp, *start, *end;
b7b72b66 11329+ struct dlm_lkb *lkb;
11330+ int error;
11331+
b7b72b66 11332+ /*
11333+ * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
11334+ */
11335+
11336+ error = pack_one_lkb(r, fill->lkb, fill);
11337+ if (error)
11338+ goto out;
11339+
11340+ start = fill->lkb->lkb_statequeue.next;
11341+ end = fill->lkbqueue;
11342+
11343+ for (tmp = start; tmp != end; tmp = tmp->next) {
b7b72b66 11344+ lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
11345+
11346+ error = pack_one_lkb(r, lkb, fill);
11347+ if (error) {
11348+ fill->lkb = lkb;
11349+ goto out;
11350+ }
11351+ }
11352+
b7b72b66 11353+ /*
11354+ * Pack all lkb's on r's queues following fill->lkbqueue.
11355+ */
11356+
11357+ if (fill->lkbqueue == &r->res_waitqueue)
11358+ goto out;
11359+ if (fill->lkbqueue == &r->res_convertqueue)
11360+ goto skip;
11361+
b7b72b66 11362+ DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
11363+
11364+ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
11365+ if (error)
11366+ goto out;
11367+ skip:
11368+ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
11369+
11370+ out:
11371+ return error;
11372+}
11373+
11374+static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb,
11375+ rcom_fill_t *fill)
11376+{
11377+ int error;
11378+
11379+ down_write(&subrsb->res_lock);
11380+
11381+ if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
11382+ goto nospace;
11383+
11384+ subrsb->res_nodeid = rsb->res_nodeid;
11385+ subrsb->res_remasterid = ++fill->remasterid;
11386+
11387+ put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
11388+ serialise_rsb(subrsb, fill->outbuf, &fill->offset);
11389+
11390+ error = pack_lkb_queues(subrsb, fill);
11391+ if (error)
11392+ goto nospace;
11393+
11394+ up_write(&subrsb->res_lock);
11395+
11396+ return 0;
11397+
11398+ nospace:
11399+ up_write(&subrsb->res_lock);
11400+ fill->subrsb = subrsb;
11401+
11402+ return -ENOSPC;
11403+}
11404+
11405+static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb,
11406+ rcom_fill_t *fill)
c1c6733f 11407+{
b7b72b66 11408+ struct dlm_rsb *subrsb;
11409+ int error = 0;
11410+
b7b72b66 11411+ /*
11412+ * When an initial subrsb is given, we know it needs to be packed.
11413+ * When no initial subrsb is given, begin with the first (if any exist).
11414+ */
11415+
11416+ if (!in_subrsb) {
11417+ if (list_empty(&rsb->res_subreslist))
11418+ goto out;
11419+
b7b72b66 11420+ subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb,
11421+ res_subreslist);
11422+ } else
11423+ subrsb = in_subrsb;
11424+
11425+ for (;;) {
11426+ error = pack_one_subrsb(rsb, subrsb, fill);
11427+ if (error)
11428+ goto out;
11429+
11430+ if (last_in_list(subrsb, &rsb->res_subreslist))
11431+ break;
11432+
11433+ subrsb = next_subrsb(subrsb);
11434+ }
11435+
11436+ out:
11437+ return error;
11438+}
11439+
b7b72b66 11440+/*
11441+ * Finish packing whatever is left in an rsb tree. If space runs out while
11442+ * finishing, save subrsb/lkb and this will be called again for the same rsb.
11443+ *
11444+ * !subrsb && lkb, we left off part way through root rsb's lkbs.
11445+ * subrsb && !lkb, we left off just before starting a new subrsb.
11446+ * subrsb && lkb, we left off part way through a subrsb's lkbs.
11447+ * !subrsb && !lkb, we shouldn't be in this function, but starting
11448+ * a new rsb in pack_rsb_tree().
11449+ */
11450+
b7b72b66 11451+static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb,
11452+ rcom_fill_t *fill)
11453+{
b7b72b66 11454+ struct dlm_rsb *subrsb = NULL;
11455+ int error = 0;
11456+
11457+ if (!fill->subrsb && fill->lkb) {
11458+ error = pack_lkb_remaining(rsb, fill);
11459+ if (error)
11460+ goto out;
11461+
11462+ error = pack_subrsbs(rsb, NULL, fill);
11463+ if (error)
11464+ goto out;
11465+ }
11466+
11467+ else if (fill->subrsb && !fill->lkb) {
11468+ error = pack_subrsbs(rsb, fill->subrsb, fill);
11469+ if (error)
11470+ goto out;
11471+ }
11472+
11473+ else if (fill->subrsb && fill->lkb) {
11474+ error = pack_lkb_remaining(fill->subrsb, fill);
11475+ if (error)
11476+ goto out;
11477+
11478+ if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
11479+ goto out;
11480+
11481+ subrsb = next_subrsb(fill->subrsb);
11482+
11483+ error = pack_subrsbs(rsb, subrsb, fill);
11484+ if (error)
11485+ goto out;
11486+ }
11487+
11488+ fill->subrsb = NULL;
11489+ fill->lkb = NULL;
11490+
11491+ out:
11492+ return error;
11493+}
11494+
b7b72b66 11495+/*
11496+ * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
11497+ * buffer. When the buffer runs out of space, save the place to restart (the
11498+ * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
11499+ */
11500+
11501+static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb,
11502+ rcom_fill_t *fill)
11503+{
11504+ int error = -ENOSPC;
11505+
11506+ fill->remasterid = 0;
11507+
b7b72b66 11508+ /*
11509+ * Pack the root rsb itself. A 1 byte type precedes the serialised
11510+ * rsb. Then pack the lkb's for the root rsb.
11511+ */
11512+
11513+ down_write(&rsb->res_lock);
11514+
11515+ if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
11516+ goto out;
11517+
11518+ rsb->res_remasterid = ++fill->remasterid;
11519+ put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
11520+ serialise_rsb(rsb, fill->outbuf, &fill->offset);
11521+
11522+ error = pack_lkb_queues(rsb, fill);
11523+ if (error)
11524+ goto out;
11525+
11526+ up_write(&rsb->res_lock);
11527+
b7b72b66 11528+ /*
11529+ * Pack subrsb/lkb's under the root rsb.
11530+ */
11531+
11532+ error = pack_subrsbs(rsb, NULL, fill);
11533+
11534+ return error;
11535+
11536+ out:
11537+ up_write(&rsb->res_lock);
11538+ return error;
11539+}
11540+
b7b72b66 11541+/*
11542+ * Given an RSB, return the next RSB that should be sent to a new master.
11543+ */
11544+
11545+static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls,
11546+ struct dlm_rsb *rsb)
11547+{
11548+ struct list_head *tmp, *start, *end;
b7b72b66 11549+ struct dlm_rsb *r;
11550+
11551+ if (!rsb)
11552+ start = ls->ls_rootres.next;
11553+ else
11554+ start = rsb->res_rootlist.next;
11555+
11556+ end = &ls->ls_rootres;
11557+
11558+ for (tmp = start; tmp != end; tmp = tmp->next) {
b7b72b66 11559+ r = list_entry(tmp, struct dlm_rsb, res_rootlist);
11560+
11561+ if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
11562+ if (r->res_nodeid && lkbs_to_remaster(r)) {
11563+ expect_new_lkids(r);
11564+ return r;
11565+ } else
11566+ clear_bit(RESFL_NEW_MASTER, &r->res_flags);
11567+ }
11568+ }
11569+
11570+ return NULL;
11571+}
11572+
b7b72b66 11573+/*
11574+ * Given an rcom buffer, fill it with RSB's that need to be sent to a single
11575+ * new master node. In the case where all the data to send to one node
11576+ * requires multiple messages, this function needs to resume filling each
11577+ * successive buffer from the point where it left off when the previous buffer
11578+ * filled up.
11579+ */
11580+
11581+static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill,
11582+ uint32_t *nodeid)
c1c6733f 11583+{
b7b72b66 11584+ struct dlm_rsb *rsb, *prev_rsb = fill->rsb;
11585+ int error;
11586+
11587+ fill->offset = 0;
11588+
11589+ if (!prev_rsb) {
11590+
b7b72b66 11591+ /*
11592+ * The first time this function is called.
11593+ */
11594+
11595+ rsb = next_remastered_rsb(ls, NULL);
11596+ if (!rsb)
11597+ goto no_more;
11598+
11599+ } else if (fill->subrsb || fill->lkb) {
11600+
b7b72b66 11601+ /*
11602+ * Continue packing an rsb tree that was partially packed last
11603+ * time (fill->subrsb/lkb indicates where packing of last block
11604+ * left off)
11605+ */
11606+
11607+ rsb = prev_rsb;
11608+ *nodeid = rsb->res_nodeid;
11609+
11610+ error = pack_rsb_tree_remaining(ls, rsb, fill);
11611+ if (error == -ENOSPC)
11612+ goto more;
11613+
11614+ rsb = next_remastered_rsb(ls, prev_rsb);
11615+ if (!rsb)
11616+ goto no_more;
11617+
11618+ if (rsb->res_nodeid != prev_rsb->res_nodeid)
11619+ goto more;
11620+ } else {
11621+ rsb = prev_rsb;
11622+ }
11623+
b7b72b66 11624+ /*
11625+ * Pack rsb trees into the buffer until we run out of space, run out of
11626+ * new rsb's or hit a new nodeid.
11627+ */
11628+
11629+ *nodeid = rsb->res_nodeid;
11630+
11631+ for (;;) {
11632+ error = pack_rsb_tree(ls, rsb, fill);
11633+ if (error == -ENOSPC)
11634+ goto more;
11635+
11636+ prev_rsb = rsb;
11637+
11638+ rsb = next_remastered_rsb(ls, prev_rsb);
11639+ if (!rsb)
11640+ goto no_more;
11641+
11642+ if (rsb->res_nodeid != prev_rsb->res_nodeid)
11643+ goto more;
11644+ }
11645+
11646+ more:
11647+ fill->more = 1;
11648+ fill->rsb = rsb;
11649+ return;
11650+
11651+ no_more:
11652+ fill->more = 0;
11653+}
11654+
b7b72b66 11655+/*
11656+ * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
11657+ */
11658+
b7b72b66 11659+int rebuild_rsbs_send(struct dlm_ls *ls)
c1c6733f 11660+{
b7b72b66 11661+ struct dlm_rcom *rc;
11662+ rcom_fill_t fill;
11663+ uint32_t nodeid;
11664+ int error;
11665+
b7b72b66 11666+ DLM_ASSERT(recover_list_empty(ls),);
11667+
11668+ log_all(ls, "rebuild locks");
11669+
11670+ error = -ENOMEM;
11671+ rc = allocate_rcom_buffer(ls);
11672+ if (!rc)
11673+ goto ret;
11674+
11675+ down_read(&ls->ls_root_lock);
11676+
11677+ error = 0;
11678+ memset(&fill, 0, sizeof(rcom_fill_t));
11679+ fill.outbuf = rc->rc_buf;
b7b72b66 11680+ fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
11681+
11682+ do {
11683+ fill_rcom_buffer(ls, &fill, &nodeid);
11684+ if (!fill.offset)
11685+ break;
11686+
11687+ rc->rc_datalen = fill.offset;
11688+ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
11689+ if (error) {
11690+ up_read(&ls->ls_root_lock);
c1c6733f 11691+ goto out;
b7b72b66 11692+ }
11693+
11694+ schedule();
11695+ error = dlm_recovery_stopped(ls);
11696+ if (error) {
11697+ up_read(&ls->ls_root_lock);
c1c6733f 11698+ goto out;
b7b72b66 11699+ }
11700+ }
11701+ while (fill.more);
11702+
11703+ up_read(&ls->ls_root_lock);
11704+
11705+ error = dlm_wait_function(ls, &recover_list_empty);
11706+
11707+ log_all(ls, "rebuilt %d locks", fill.count);
11708+
11709+ out:
11710+ free_rcom_buffer(rc);
11711+
11712+ ret:
11713+ return error;
11714+}
11715+
11716+static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid,
11717+ struct dlm_rsb *rootrsb)
c1c6733f 11718+{
b7b72b66 11719+ struct dlm_rsb *rsb;
c1c6733f 11720+
b7b72b66 11721+ DLM_ASSERT(rootrsb,);
11722+
11723+ if (rootrsb->res_remasterid == remasterid) {
11724+ rsb = rootrsb;
11725+ goto out;
11726+ }
11727+
11728+ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11729+ if (rsb->res_remasterid == remasterid)
11730+ goto out;
11731+ }
11732+ rsb = NULL;
11733+
11734+ out:
11735+ return rsb;
11736+}
11737+
b7b72b66 11738+/*
11739+ * Search a queue for the given remote lock id (remlkid).
11740+ */
11741+
11742+static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid,
11743+ int remid)
c1c6733f 11744+{
b7b72b66 11745+ struct dlm_lkb *lkb;
11746+
11747+ list_for_each_entry(lkb, statequeue, lkb_statequeue) {
11748+ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
11749+ return lkb;
11750+ }
11751+ }
11752+
11753+ return NULL;
11754+}
11755+
b7b72b66 11756+/*
11757+ * Given a remote lock ID (and a parent resource), return the local LKB for it.
11758+ * Hopefully we don't need to do this too often on deep lock trees. This is
11759+ * VERY suboptimal for anything but the smallest lock trees. It searches the
11760+ * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
11761+ * returns the LKB address. OPTIMISATION: we should keep a list of these while
11762+ * we are building up the remastered LKBs
11763+ */
11764+
11765+static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid,
11766+ int remid)
c1c6733f 11767+{
11768+ struct dlm_lkb *lkb;
11769+ struct dlm_rsb *rsb;
11770+
11771+ lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
11772+ if (lkb)
11773+ goto out;
11774+
11775+ lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
11776+ if (lkb)
11777+ goto out;
11778+
11779+ lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
11780+ if (lkb)
11781+ goto out;
11782+
11783+ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11784+ lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
11785+ if (lkb)
11786+ goto out;
11787+
11788+ lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
11789+ if (lkb)
11790+ goto out;
11791+
11792+ lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
11793+ if (lkb)
11794+ goto out;
11795+ }
11796+ lkb = NULL;
11797+
11798+ out:
11799+ return lkb;
11800+}
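/* The OPTIMISATION note above suggests remembering remid->lkb mappings while
 * the remastered tree is built. A purely hypothetical shape for such a cache
 * entry, not part of this patch:
 *
 *	struct remlkid_entry {
 *		struct list_head list;
 *		int nodeid;
 *		int remid;
 *		struct dlm_lkb *lkb;
 *	};
 */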
11801+
b7b72b66 11802+/*
11803+ * Unpack an LKB from a remaster operation
11804+ */
11805+
11806+static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid,
11807+ struct dlm_rsb *rootrsb, char *buf, int *ptr,
11808+ char *outbuf, int *outoffp)
c1c6733f 11809+{
11810+ struct dlm_lkb *lkb, *exist_lkb = NULL;
11811+ struct dlm_rsb *rsb;
11812+ int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
11813+
11814+ remote_lkid = get_int(buf, ptr);
11815+
11816+ rsb_rmid = get_int(buf, ptr);
11817+ rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
b7b72b66 11818+ DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
c1c6733f 11819+
b7b72b66 11820+ /*
c1c6733f 11821+ * We could have received this lkb already from a previous recovery
11822+ * that was interrupted. We still need to advance ptr so read in
11823+ * lkb and then release it. FIXME: verify this is valid.
c1c6733f
AM
11824+ */
11825+ lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
11826+ if (lkb) {
11827+ log_all(ls, "lkb %x exists %s", remote_lkid, rsb->res_name);
11828+ exist_lkb = lkb;
11829+ }
c1c6733f 11830+
b7b72b66 11831+ lkb = create_lkb(ls);
11832+ if (!lkb)
11833+ goto out;
11834+
11835+ lkb->lkb_remid = remote_lkid;
11836+ lkb->lkb_flags = get_int(buf, ptr);
11837+ status = get_int(buf, ptr);
11838+ lkb->lkb_rqmode = get_char(buf, ptr);
11839+ lkb->lkb_grmode = get_char(buf, ptr);
11840+ atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
11841+
11842+ parentid = get_int(buf, ptr);
11843+ lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
b7b72b66 11844+ lkb->lkb_ownpid = get_int(buf, ptr);
11845+
11846+ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11847+ lkb->lkb_lvbptr = allocate_lvb(ls);
11848+ if (!lkb->lkb_lvbptr)
11849+ goto out;
11850+ get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
11851+ }
11852+
11853+ if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
11854+ uint64_t start, end;
11855+
11856+ /* Don't need to keep the range flag, for comms use only */
11857+ lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
11858+ start = get_int64(buf, ptr);
11859+ end = get_int64(buf, ptr);
11860+
b7b72b66 11861+ lkb->lkb_range = allocate_range(ls);
11862+ if (!lkb->lkb_range)
11863+ goto out;
11864+
11865+ switch (status) {
11866+ case GDLM_LKSTS_CONVERT:
11867+ lkb->lkb_range[RQ_RANGE_START] = start;
11868+ lkb->lkb_range[RQ_RANGE_END] = end;
11869+ start = get_int64(buf, ptr);
11870+ end = get_int64(buf, ptr);
11871+ lkb->lkb_range[GR_RANGE_START] = start;
11872+ lkb->lkb_range[GR_RANGE_END] = end;
11873+
11874+ case GDLM_LKSTS_WAITING:
11875+ lkb->lkb_range[RQ_RANGE_START] = start;
11876+ lkb->lkb_range[RQ_RANGE_END] = end;
11877+ break;
11878+
11879+ case GDLM_LKSTS_GRANTED:
11880+ lkb->lkb_range[GR_RANGE_START] = start;
11881+ lkb->lkb_range[GR_RANGE_END] = end;
11882+ break;
11883+ default:
b7b72b66 11884+ DLM_ASSERT(0,);
11885+ }
11886+ }
11887+
11888+ if (exist_lkb) {
11889+ /* verify lkb and exist_lkb values match? */
11890+ release_lkb(ls, lkb);
11891+ lkb = exist_lkb;
11892+ goto put_lkid;
11893+ }
11894+
11895+ /* Resolve local lock LKB address from parent ID */
11896+ if (parentid)
11897+ lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
11898+ parentid);
11899+
11900+ atomic_inc(&rsb->res_ref);
11901+ lkb->lkb_resource = rsb;
11902+
11903+ lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11904+ lkb->lkb_nodeid = rem_nodeid;
11905+
b7b72b66 11906+ /*
11907+ * Put the lkb on an RSB queue. An lkb that's in the midst of a
11908+ * conversion request (on the requesting node's lockqueue and has
11909+ * LQCONVERT set) should be put on the granted queue. The convert
11910+ * request will be resent by the requesting node.
11911+ */
11912+
11913+ if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11914+ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
b7b72b66 11915+ DLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11916+ printk("status=%d\n", status););
11917+ lkb->lkb_rqmode = DLM_LOCK_IV;
11918+ status = GDLM_LKSTS_GRANTED;
11919+ }
11920+
11921+ lkb_enqueue(rsb, lkb, status);
11922+
b7b72b66 11923+ /*
11924+ * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11925+ */
11926+
11927+ if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11928+ && lkb->lkb_grmode > DLM_LOCK_NL) {
11929+ if (!rsb->res_lvbptr)
b7b72b66 11930+ rsb->res_lvbptr = allocate_lvb(ls);
11931+ if (!rsb->res_lvbptr)
11932+ goto out;
11933+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11934+ }
11935+
b7b72b66 11936+ /*
11937+ * Clear flags that may have been sent over that are only relevant in
11938+ * the context of the sender.
11939+ */
11940+
11941+ lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11942+ GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
11943+
11944+ put_lkid:
11945+ /* Return the new LKID to the caller's buffer */
11946+ put_int(lkb->lkb_id, outbuf, outoffp);
11947+ put_int(lkb->lkb_remid, outbuf, outoffp);
11948+ error = 0;
11949+
11950+ out:
11951+ return error;
11952+}
11953+
11954+static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid,
11955+ struct dlm_rsb *rootrsb, char *buf,
11956+ int *ptr)
11957+{
11958+ int length;
11959+ int remasterid;
11960+ int parent_remasterid;
11961+ char name[DLM_RESNAME_MAXLEN];
11962+ int error;
11963+ struct dlm_rsb *parent = NULL;
11964+ struct dlm_rsb *rsb;
11965+
11966+ get_bytes(name, &length, buf, ptr);
11967+ remasterid = get_int(buf, ptr);
11968+ parent_remasterid = get_int(buf, ptr);
11969+
11970+ if (parent_remasterid)
11971+ parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11972+
b7b72b66 11973+ /*
11974+ * The rsb reference from this find_or_create_rsb() will keep the rsb
11975+ * around while we add new lkb's to it from deserialise_lkb. Each of
11976+ * the lkb's will add an rsb reference. The reference added here is
11977+ * removed by release_rsb() after all lkb's are added.
11978+ */
11979+
11980+ error = find_rsb(ls, parent, name, length, CREATE, &rsb);
11981+ DLM_ASSERT(!error,);
11982+
11983+ set_bit(RESFL_MASTER, &rsb->res_flags);
11984+
11985+ /* There is a case where the above needs to create the RSB. */
11986+ if (rsb->res_nodeid == -1)
11987+ rsb->res_nodeid = our_nodeid();
11988+
11989+ rsb->res_remasterid = remasterid;
11990+
11991+ return rsb;
11992+}
11993+
b7b72b66 11994+/*
11995+ * Processing at the receiving end of a NEWLOCKS message from a node in
11996+ * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11997+ * node whose locks we are now mastering. For a reply we need to send back the
11998+ * new lockids of the remastered locks so that remote ops can find them.
11999+ */
12000+
b7b72b66 12001+int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
c1c6733f 12002+{
12003+ struct dlm_rcom *rc;
12004+ struct dlm_rsb *rsb = NULL;
12005+ rebuild_node_t *rnode;
12006+ char *outbuf;
12007+ int outptr, ptr = 0, error = -ENOMEM;
12008+
12009+ rnode = find_rebuild_root(ls, nodeid);
12010+ if (!rnode)
12011+ goto out;
12012+
b7b72b66 12013+ /*
12014+ * Allocate a buffer for the reply message which is a list of remote
12015+ * lock IDs and their (new) local lock ids. It will always be big
12016+ * enough to fit <n> ID pairs if it already fit <n> LKBs.
12017+ */
12018+
12019+ rc = allocate_rcom_buffer(ls);
12020+ if (!rc)
12021+ goto out;
12022+ outbuf = rc->rc_buf;
12023+ outptr = 0;
12024+
b7b72b66 12025+ /*
12026+ * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
12027+ * created. Each deserialise_rsb adds an rsb reference that must be
12028+ * removed with release_rsb once all new lkb's for an rsb have been
12029+ * added.
12030+ */
12031+
12032+ while (ptr < len) {
12033+ int type;
12034+
12035+ type = get_char(buf, &ptr);
12036+
12037+ switch (type) {
12038+ case REMASTER_ROOTRSB:
12039+ if (rsb)
12040+ release_rsb(rsb);
12041+ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
12042+ &ptr);
12043+ rnode->rootrsb = rsb;
12044+ break;
12045+
12046+ case REMASTER_RSB:
12047+ if (rsb)
12048+ release_rsb(rsb);
12049+ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
12050+ &ptr);
12051+ break;
12052+
12053+ case REMASTER_LKB:
12054+ deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
12055+ outbuf, &outptr);
12056+ break;
12057+
12058+ default:
b7b72b66 12059+ DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
12060+ "len=%d\n", type, nodeid, ptr,
12061+ len););
12062+ }
12063+ }
12064+
12065+ if (rsb)
12066+ release_rsb(rsb);
12067+
b7b72b66 12068+ /*
12069+ * Reply with the new lock IDs.
12070+ */
12071+
12072+ rc->rc_datalen = outptr;
12073+ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
12074+
12075+ free_rcom_buffer(rc);
12076+
12077+ out:
12078+ return error;
12079+}
12080+
b7b72b66 12081+/*
12082+ * Processing for a NEWLOCKIDS message. Called when we get the reply from the
12083+ * new master telling us what the new remote lock IDs are for the remastered
12084+ * locks
12085+ */
12086+
b7b72b66 12087+int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
12088+{
12089+ int offset = 0;
12090+
12091+ if (len == 1)
12092+ len = 0;
12093+
12094+ while (offset < len) {
12095+ int remote_id;
12096+ int local_id;
b7b72b66 12097+ struct dlm_lkb *lkb;
12098+
12099+ if (offset + 8 > len) {
12100+ log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
12101+ "length nodeid=%d offset=%d len=%d",
12102+ nodeid, offset, len);
12103+ break;
12104+ }
12105+
12106+ remote_id = get_int(buf, &offset);
12107+ local_id = get_int(buf, &offset);
12108+
12109+ lkb = find_lock_by_id(ls, local_id);
12110+ if (lkb) {
12111+ lkb->lkb_remid = remote_id;
12112+ have_new_lkid(lkb);
12113+ } else {
12114+ log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
12115+ "nodeid=%d id=%x remid=%x offset=%d len=%d",
12116+ nodeid, local_id, remote_id, offset, len);
12117+ }
12118+ }
12119+
12120+ if (recover_list_empty(ls))
12121+ wake_up(&ls->ls_wait_general);
12122+
12123+ return 0;
12124+}
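/* For reference, each 8-byte pair parsed above is written by the put_lkid
 * path of deserialise_lkb() on the new master:
 *
 *	put_int(lkb->lkb_id, outbuf, outoffp);     read back here as remote_id
 *	put_int(lkb->lkb_remid, outbuf, outoffp);  read back here as local_id
 */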
12125diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
12126--- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 12127+++ linux-patched/cluster/dlm/rebuild.h 2004-11-03 11:31:56.000000000 +0800
12128@@ -0,0 +1,22 @@
12129+/******************************************************************************
12130+*******************************************************************************
12131+**
12132+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12133+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12134+**
12135+** This copyrighted material is made available to anyone wishing to use,
12136+** modify, copy, or redistribute it subject to the terms and conditions
12137+** of the GNU General Public License v.2.
12138+**
12139+*******************************************************************************
12140+******************************************************************************/
12141+
12142+#ifndef __REBUILD_DOT_H__
12143+#define __REBUILD_DOT_H__
12144+
12145+int rebuild_rsbs_send(struct dlm_ls *ls);
12146+int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
12147+int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
12148+void rebuild_freemem(struct dlm_ls *ls);
12149+
12150+#endif /* __REBUILD_DOT_H__ */
12151diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
12152--- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
12153+++ linux-patched/cluster/dlm/reccomms.c 2004-11-03 11:31:56.000000000 +0800
12154@@ -0,0 +1,447 @@
12155+/******************************************************************************
12156+*******************************************************************************
12157+**
12158+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12159+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12160+**
12161+** This copyrighted material is made available to anyone wishing to use,
12162+** modify, copy, or redistribute it subject to the terms and conditions
12163+** of the GNU General Public License v.2.
12164+**
12165+*******************************************************************************
12166+******************************************************************************/
12167+
12168+#include "dlm_internal.h"
12169+#include "lowcomms.h"
12170+#include "midcomms.h"
12171+#include "reccomms.h"
12172+#include "nodes.h"
12173+#include "lockspace.h"
12174+#include "recover.h"
12175+#include "dir.h"
12176+#include "config.h"
12177+#include "rebuild.h"
12178+#include "memory.h"
12179+
12180+/* Running on the basis that only a single recovery communication will be done
12181+ * at a time per lockspace */
12182+
bb1d8b11 12183+static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc);
c1c6733f 12184+
b7b72b66 12185+static int rcom_response(struct dlm_ls *ls)
12186+{
12187+ return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12188+}
12189+
12190+/**
12191+ * rcom_send_message - send or request recovery data
12192+ * @ls: the lockspace
12193+ * @nodeid: node to which the message is sent
12194+ * @type: type of recovery message
12195+ * @rc: the rc buffer to send
12196+ * @need_reply: wait for reply if this is set
12197+ *
12198+ * Using this interface
12199+ * i) Allocate an rc buffer:
12200+ * rc = allocate_rcom_buffer(ls);
12201+ * ii) Copy data to send beginning at rc->rc_buf:
12202+ * memcpy(rc->rc_buf, mybuf, mylen);
12203+ * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
12204+ * rc->rc_datalen = mylen
12205+ * iv) Submit the rc to this function:
12206+ * rcom_send_message(ls, nodeid, type, rc, need_reply);
12207+ *
12208+ * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct
12209+ * dlm_rcom). If more data must be passed in one send, use
12210+ * rcom_expand_buffer() which incrementally increases the size of the rc buffer
12211+ * by dlm_config.buffer_size bytes.
12212+ *
12213+ * Any data returned for the message (when need_reply is set) will be saved in
12214+ * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
12215+ * number of bytes copied into rc->rc_buf.
12216+ *
12217+ * Returns: 0 on success, -EXXX on failure
12218+ */
12219+
12220+int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
12221+ struct dlm_rcom *rc, int need_reply)
12222+{
12223+ int error = 0;
12224+
12225+ if (!rc->rc_datalen)
12226+ rc->rc_datalen = 1;
12227+
12228+ /*
12229+ * Fill in the header.
12230+ */
12231+
12232+ rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
12233+ rc->rc_header.rh_lockspace = ls->ls_global_id;
b7b72b66 12234+ rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1;
12235+ rc->rc_subcmd = type;
12236+ rc->rc_msgid = ++ls->ls_rcom_msgid;
12237+
12238+ /*
12239+ * When a reply is received, the reply data goes back into this buffer.
12240+ * Synchronous rcom requests (need_reply=1) are serialised because of
12241+ * the single ls_rcom.
12242+ */
12243+
12244+ if (need_reply) {
12245+ down(&ls->ls_rcom_lock);
12246+ ls->ls_rcom = rc;
12247+ }
12248+
12249+ /*
12250+ * After sending the message we'll wait at the end of this function to
12251+ * get a reply. The READY flag will be set when the reply has been
12252+ * received and requested data has been copied into
12253+ * ls->ls_rcom->rc_buf;
12254+ */
12255+
b7b72b66 12256+ DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
12257+
12258+ /*
12259+ * The WAIT bit indicates that we're waiting for and willing to accept a
12260+ * reply. Any replies are ignored unless this bit is set.
12261+ */
12262+
12263+ set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
12264+
12265+ /*
12266+ * Process the message locally.
12267+ */
12268+
12269+ if (nodeid == our_nodeid()) {
12270+ rcom_process_message(ls, nodeid, rc);
12271+ goto out;
12272+ }
12273+
12274+ /*
12275+ * Send the message.
12276+ */
12277+
12278+ log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
12279+
b7b72b66 12280+ error = midcomms_send_message(nodeid, (struct dlm_header *) rc,
c1c6733f 12281+ GFP_KERNEL);
b7b72b66 12282+ DLM_ASSERT(error >= 0, printk("error = %d\n", error););
12283+ error = 0;
12284+
12285+ /*
12286+ * Wait for a reply. Once a reply is processed from midcomms, the
b7b72b66 12287+ * READY bit will be set and we'll be awoken (dlm_wait_function will
12288+ * return 0).
12289+ */
12290+
12291+ if (need_reply) {
b7b72b66 12292+ error = dlm_wait_function(ls, &rcom_response);
12293+ if (error)
12294+ log_debug(ls, "rcom wait error %d", error);
12295+ }
12296+
12297+ out:
12298+ clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
12299+ clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12300+
12301+ if (need_reply)
12302+ up(&ls->ls_rcom_lock);
12303+
12304+ return error;
12305+}
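/* A hedged usage sketch following steps (i)-(iv) of the comment above;
 * "mybuf"/"mylen" are hypothetical and error handling is trimmed. */
static int rcom_send_sketch(struct dlm_ls *ls, uint32_t nodeid, int type,
			    char *mybuf, int mylen)
{
	struct dlm_rcom *rc;
	int error;

	rc = allocate_rcom_buffer(ls);		/* (i) */
	if (!rc)
		return -ENOMEM;
	memcpy(rc->rc_buf, mybuf, mylen);	/* (ii) */
	rc->rc_datalen = mylen;			/* (iii) */
	error = rcom_send_message(ls, nodeid, type, rc, 1);	/* (iv) */
	free_rcom_buffer(rc);
	return error;
}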
12306+
12307+/*
12308+ * Runs in same context as midcomms.
12309+ */
12310+
b7b72b66 12311+static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc)
c1c6733f 12312+{
12313+ struct dlm_rcom rc_stack;
12314+ struct dlm_rcom *reply = NULL;
c1c6733f 12315+ int status, datalen, maxlen;
b7b72b66 12316+ uint32_t r_nodeid, be_nodeid;
12317+
12318+ if (!ls)
12319+ return;
12320+
b7b72b66 12321+ if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
12322+ log_error(ls, "ignoring recovery message %x from %u",
12323+ rc->rc_subcmd, nodeid);
12324+ return;
12325+ }
12326+
12327+ switch (rc->rc_subcmd) {
12328+
12329+ case RECCOMM_STATUS:
12330+
b7b72b66 12331+ memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12332+ reply = &rc_stack;
12333+
12334+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12335+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12336+ reply->rc_subcmd = rc->rc_subcmd;
12337+ reply->rc_msgid = rc->rc_msgid;
12338+ reply->rc_buf[0] = 0;
12339+
12340+ if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
12341+ reply->rc_buf[0] |= RESDIR_VALID;
12342+
12343+ if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
12344+ reply->rc_buf[0] |= RESDIR_ALL_VALID;
12345+
12346+ if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
12347+ reply->rc_buf[0] |= NODES_VALID;
12348+
12349+ if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
12350+ reply->rc_buf[0] |= NODES_ALL_VALID;
12351+
12352+ reply->rc_datalen = 1;
12353+ reply->rc_header.rh_length =
b7b72b66 12354+ sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12355+
12356+ log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
12357+ break;
12358+
12359+ case RECCOMM_RECOVERNAMES:
12360+
12361+ reply = allocate_rcom_buffer(ls);
12362+ DLM_ASSERT(reply,);
12363+ maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
12364+
12365+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12366+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12367+ reply->rc_subcmd = rc->rc_subcmd;
12368+ reply->rc_msgid = rc->rc_msgid;
12369+
12370+ /*
12371+ * The other node wants a bunch of resource names. The name of
12372+ * the resource to begin with is in rc->rc_buf.
12373+ */
12374+
12375+ datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
12376+ reply->rc_buf, maxlen, nodeid);
12377+
12378+ reply->rc_datalen = datalen;
12379+ reply->rc_header.rh_length =
b7b72b66 12380+ sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12381+
12382+ log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
12383+ reply->rc_msgid);
12384+ break;
12385+
12386+ case RECCOMM_GETMASTER:
12387+
12388+ reply = allocate_rcom_buffer(ls);
b7b72b66 12389+ DLM_ASSERT(reply,);
12390+
12391+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12392+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12393+ reply->rc_subcmd = rc->rc_subcmd;
12394+ reply->rc_msgid = rc->rc_msgid;
12395+
12396+ /*
12397+ * The other node wants to know the master of a named resource.
12398+ */
12399+
12400+ status = dlm_dir_lookup(ls, nodeid, rc->rc_buf, rc->rc_datalen,
12401+ &r_nodeid);
c1c6733f 12402+ if (status != 0) {
b7b72b66 12403+ log_all(ls, "rcom lookup error %d", status);
12404+ free_rcom_buffer(reply);
12405+ reply = NULL;
12406+ return;
12407+ }
b7b72b66 12408+ be_nodeid = cpu_to_be32(r_nodeid);
12409+ memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
12410+ reply->rc_datalen = sizeof(uint32_t);
12411+ reply->rc_header.rh_length =
b7b72b66 12412+ sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12413+ break;
12414+
12415+ case RECCOMM_BULKLOOKUP:
12416+
12417+ reply = allocate_rcom_buffer(ls);
b7b72b66 12418+ DLM_ASSERT(reply,);
12419+
12420+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12421+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12422+ reply->rc_subcmd = rc->rc_subcmd;
12423+ reply->rc_msgid = rc->rc_msgid;
12424+
12425+ /*
12426+ * This is a bulk version of the above and just returns a
12427+	 * buffer full of node ids to match the resources.
12428+ */
12429+
12430+ datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
12431+ rc->rc_datalen, reply->rc_buf);
12432+ if (datalen < 0) {
12433+ free_rcom_buffer(reply);
12434+ reply = NULL;
12435+ return;
12436+ }
12437+
12438+ reply->rc_datalen = datalen;
12439+ reply->rc_header.rh_length =
b7b72b66 12440+ sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12441+ break;
12442+
12443+ /*
12444+ * These RECCOMM messages don't need replies.
12445+ */
12446+
12447+ case RECCOMM_NEWLOCKS:
12448+ rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12449+ break;
12450+
12451+ case RECCOMM_NEWLOCKIDS:
12452+ rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12453+ break;
12454+
12455+ case RECCOMM_REMRESDATA:
b7b72b66 12456+ dlm_dir_remove(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12457+ break;
12458+
12459+ default:
b7b72b66 12460+ DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
12461+ }
12462+
12463+ if (reply) {
12464+ if (nodeid == our_nodeid()) {
b7b72b66 12465+ DLM_ASSERT(rc == ls->ls_rcom,);
12466+ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
12467+ rc->rc_datalen = reply->rc_datalen;
12468+ } else {
12469+ midcomms_send_message(nodeid,
b7b72b66 12470+ (struct dlm_header *) reply,
12471+ GFP_KERNEL);
12472+ }
12473+
12474+ if (reply != &rc_stack)
12475+ free_rcom_buffer(reply);
12476+ }
12477+}
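
The RECCOMM_STATUS case above packs the lockspace recovery flags into a single byte in rc_buf[0]. A small sketch of how a requester might decode it (flag names are from reccomms.h; resdir_ready() is a hypothetical helper):

	/* Sketch: both our own share and every other node's share of the
	 * resource directory must be valid before remastering proceeds. */
	static int resdir_ready(uint8_t status)
	{
		return (status & RESDIR_VALID) && (status & RESDIR_ALL_VALID);
	}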
12478+
12479+static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid,
12480+ struct dlm_rcom *reply)
c1c6733f 12481+{
b7b72b66 12482+ struct dlm_rcom *rc = ls->ls_rcom;
12483+
12484+ if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
12485+ log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
12486+ return;
12487+ }
12488+
12489+ if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
12490+ log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
12491+ reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
12492+ return;
12493+ }
12494+
12495+ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
12496+ rc->rc_datalen = reply->rc_datalen;
12497+
12498+ /*
12499+ * Tell the thread waiting in rcom_send_message() that it can go ahead.
12500+ */
12501+
12502+ set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12503+ wake_up(&ls->ls_wait_general);
12504+}
12505+
12506+static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid,
12507+ struct dlm_rcom *reply)
12508+{
12509+ restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
12510+ reply->rc_msgid);
12511+}
12512+
12513+/*
12514+ * Runs in same context as midcomms.
12515+ */
12516+
12517+static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid,
12518+ struct dlm_rcom *reply)
c1c6733f 12519+{
b7b72b66 12520+ if (dlm_recovery_stopped(ls)) {
12521+ log_error(ls, "ignoring recovery reply %x from %u",
12522+ reply->rc_subcmd, nodeid);
12523+ return;
12524+ }
12525+
12526+ switch (reply->rc_subcmd) {
12527+ case RECCOMM_GETMASTER:
12528+ process_reply_async(ls, nodeid, reply);
12529+ break;
12530+ case RECCOMM_STATUS:
12531+ case RECCOMM_NEWLOCKS:
12532+ case RECCOMM_NEWLOCKIDS:
12533+ case RECCOMM_RECOVERNAMES:
12534+ process_reply_sync(ls, nodeid, reply);
12535+ break;
12536+ default:
12537+ log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
12538+ reply->rc_subcmd, nodeid);
12539+ }
12540+}
12541+
12542+
b7b72b66 12543+static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header)
12544+{
12545+ struct writequeue_entry *wq;
12546+ struct dlm_rcom *rc = (struct dlm_rcom *) header;
12547+ struct dlm_rcom *reply;
c1c6733f 12548+
b7b72b66 12549+ wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL,
12550+ (char **)&reply);
12551+ if (!wq)
12552+ return -ENOMEM;
12553+
12554+ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12555+ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12556+ reply->rc_subcmd = rc->rc_subcmd;
12557+ reply->rc_msgid = rc->rc_msgid;
12558+ reply->rc_buf[0] = 0;
12559+
12560+ reply->rc_datalen = 1;
b7b72b66 12561+ reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
c1c6733f 12562+
b7b72b66 12563+ midcomms_send_buffer((struct dlm_header *)reply, wq);
12564+ return 0;
12565+}
12566+
12567+
12568+/*
12569+ * Runs in same context as midcomms. Both recovery requests and recovery
12570+ * replies come through this function.
12571+ */
12572+
b7b72b66 12573+void process_recovery_comm(uint32_t nodeid, struct dlm_header *header)
c1c6733f 12574+{
12575+ struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace);
12576+ struct dlm_rcom *rc = (struct dlm_rcom *) header;
12577+
12578+ /* If the lockspace doesn't exist then still send a status message
12579+ back; it's possible that it just doesn't have its global_id yet. */
12580+
12581+ if (!ls) {
12582+ send_ls_not_ready(nodeid, header);
12583+ return;
12584+ }
12585+
12586+ switch (header->rh_cmd) {
12587+ case GDLM_REMCMD_RECOVERMESSAGE:
c1c6733f 12588+ rcom_process_message(ls, nodeid, rc);
12589+ break;
12590+
12591+ case GDLM_REMCMD_RECOVERREPLY:
12592+ rcom_process_reply(ls, nodeid, rc);
12593+ break;
12594+
12595+ default:
b7b72b66 12596+ DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
c1c6733f 12597+ }
12598+
12599+ put_lockspace(ls);
12600+}
12601+
12602diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
12603--- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
12604+++ linux-patched/cluster/dlm/reccomms.h 2004-11-03 11:31:56.000000000 +0800
12605@@ -0,0 +1,36 @@
12606+/******************************************************************************
12607+*******************************************************************************
12608+**
12609+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12610+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12611+**
12612+** This copyrighted material is made available to anyone wishing to use,
12613+** modify, copy, or redistribute it subject to the terms and conditions
12614+** of the GNU General Public License v.2.
12615+**
12616+*******************************************************************************
12617+******************************************************************************/
12618+
12619+#ifndef __RECCOMMS_DOT_H__
12620+#define __RECCOMMS_DOT_H__
12621+
12622+/* Bit flags */
12623+
12624+#define RESDIR_VALID (1)
12625+#define RESDIR_ALL_VALID (2)
12626+#define NODES_VALID (4)
12627+#define NODES_ALL_VALID (8)
12628+
12629+#define RECCOMM_STATUS (1)
12630+#define RECCOMM_RECOVERNAMES (2)
12631+#define RECCOMM_GETMASTER (3)
12632+#define RECCOMM_BULKLOOKUP (4)
12633+#define RECCOMM_NEWLOCKS (5)
12634+#define RECCOMM_NEWLOCKIDS (6)
12635+#define RECCOMM_REMRESDATA (7)
12636+
12637+int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
12638+ struct dlm_rcom *rc, int need_reply);
12639+void process_recovery_comm(uint32_t nodeid, struct dlm_header *header);
12640+
12641+#endif
12642diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
12643--- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 12644+++ linux-patched/cluster/dlm/recover.c 2004-11-03 11:31:56.000000000 +0800
b7b72b66 12645@@ -0,0 +1,611 @@
12646+/******************************************************************************
12647+*******************************************************************************
12648+**
12649+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12650+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12651+**
12652+** This copyrighted material is made available to anyone wishing to use,
12653+** modify, copy, or redistribute it subject to the terms and conditions
12654+** of the GNU General Public License v.2.
12655+**
12656+*******************************************************************************
12657+******************************************************************************/
12658+
12659+#include "dlm_internal.h"
12660+#include "reccomms.h"
12661+#include "dir.h"
12662+#include "locking.h"
12663+#include "rsb.h"
12664+#include "lockspace.h"
12665+#include "lkb.h"
12666+#include "nodes.h"
12667+#include "config.h"
12668+#include "ast.h"
12669+#include "memory.h"
12670+
12671+/*
12672+ * Called in recovery routines to check whether the recovery process has been
12673+ * interrupted/stopped by another transition. A recovery in-process will abort
12674+ * if the lockspace is "stopped" so that a new recovery process can start from
12675+ * the beginning when the lockspace is "started" again.
12676+ */
12677+
b7b72b66 12678+int dlm_recovery_stopped(struct dlm_ls *ls)
12679+{
12680+ return test_bit(LSFL_LS_STOP, &ls->ls_flags);
12681+}
12682+
b7b72b66 12683+static void dlm_wait_timer_fn(unsigned long data)
c1c6733f 12684+{
b7b72b66 12685+ struct dlm_ls *ls = (struct dlm_ls *) data;
12686+
12687+ wake_up(&ls->ls_wait_general);
12688+}
12689+
12690+/*
12691+ * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
12692+ * set due to failure of a node in ls_nodes). When another function thinks it
12693+ * could have completed the waited-on task, it should wake up ls_wait_general
12694+ * to get an immediate response rather than waiting for the timer to detect the
12695+ * result. A timer wakes us up periodically while waiting to see if we should
12696+ * abort due to a node failure.
12697+ */
12698+
b7b72b66 12699+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
12700+{
12701+ struct timer_list timer;
12702+ int error = 0;
12703+
12704+ init_timer(&timer);
b7b72b66 12705+ timer.function = dlm_wait_timer_fn;
12706+ timer.data = (long) ls;
12707+
12708+ for (;;) {
b7b72b66 12709+ mod_timer(&timer, jiffies + (dlm_config.recover_timer * HZ));
12710+
12711+ wchan_cond_sleep_intr(ls->ls_wait_general,
12712+ !testfn(ls) &&
12713+ !test_bit(LSFL_LS_STOP, &ls->ls_flags));
12714+
12715+ if (timer_pending(&timer))
12716+ del_timer(&timer);
12717+
12718+ if (testfn(ls))
12719+ break;
12720+
12721+ if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
12722+ error = -1;
12723+ break;
12724+ }
12725+ }
12726+
12727+ return error;
12728+}
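
A caller supplies a predicate over the lockspace; dlm_wait_function() returns 0 once the predicate holds, or -1 if recovery was stopped first. A minimal sketch of the calling pattern (wait_names_done() is hypothetical; recover_list_empty() below is a real example of a testfn):

	/* Sketch: typical testfn usage, the same shape as
	 * restbl_rsb_update() waiting on recover_list_empty(). */
	static int wait_names_done(struct dlm_ls *ls)
	{
		int error = dlm_wait_function(ls, &recover_list_empty);
		if (error)
			log_debug(ls, "wait aborted by lockspace stop");
		return error;
	}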
12729+
b7b72b66 12730+int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status)
c1c6733f 12731+{
12732+ struct dlm_rcom rc_stack, *rc;
12733+ struct dlm_csb *csb;
12734+ int status;
12735+ int error = 0;
12736+
b7b72b66 12737+ memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12738+ rc = &rc_stack;
12739+ rc->rc_datalen = 0;
12740+
b7b72b66 12741+ list_for_each_entry(csb, &ls->ls_nodes, list) {
c1c6733f 12742+ for (;;) {
b7b72b66 12743+ error = dlm_recovery_stopped(ls);
12744+ if (error)
12745+ goto out;
12746+
b7b72b66 12747+ error = rcom_send_message(ls, csb->node->nodeid,
12748+ RECCOMM_STATUS, rc, 1);
12749+ if (error)
12750+ goto out;
12751+
12752+ status = rc->rc_buf[0];
12753+ if (status & wait_status)
12754+ break;
12755+ else {
12756+ set_current_state(TASK_INTERRUPTIBLE);
12757+ schedule_timeout(HZ >> 1);
12758+ }
12759+ }
12760+ }
12761+
12762+ out:
12763+ return error;
12764+}
12765+
b7b72b66 12766+int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status)
c1c6733f 12767+{
b7b72b66 12768+ struct dlm_rcom rc_stack, *rc;
12769+ uint32_t nodeid = ls->ls_low_nodeid;
12770+ int status;
12771+ int error = 0;
12772+
b7b72b66 12773+ memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12774+ rc = &rc_stack;
12775+ rc->rc_datalen = 0;
12776+
12777+ for (;;) {
b7b72b66 12778+ error = dlm_recovery_stopped(ls);
12779+ if (error)
12780+ goto out;
12781+
12782+ error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
12783+ if (error)
12784+ break;
12785+
12786+ status = rc->rc_buf[0];
12787+ if (status & wait_status)
12788+ break;
12789+ else {
12790+ set_current_state(TASK_INTERRUPTIBLE);
12791+ schedule_timeout(HZ >> 1);
12792+ }
12793+ }
12794+
12795+ out:
12796+ return error;
12797+}
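
Taken together, the two wait functions give recovery a simple barrier: poll every member for a status flag with dlm_wait_status_all(), or poll only the lowest-numbered node with dlm_wait_status_low(). A hedged sketch of how a two-phase barrier might look (flag names from reccomms.h; an illustration, not a literal excerpt):

	/* Sketch: wait until all nodes report NODES_VALID, then until
	 * the low node reports NODES_ALL_VALID. */
	error = dlm_wait_status_all(ls, NODES_VALID);
	if (!error)
		error = dlm_wait_status_low(ls, NODES_ALL_VALID);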
12798+
b7b72b66 12799+static int purge_queue(struct dlm_ls *ls, struct list_head *queue)
c1c6733f 12800+{
12801+ struct dlm_lkb *lkb, *safe;
12802+ struct dlm_rsb *rsb;
12803+ int count = 0;
12804+
12805+ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
12806+ if (!lkb->lkb_nodeid)
12807+ continue;
12808+
b7b72b66 12809+ DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
12810+
12811+ if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
12812+ list_del(&lkb->lkb_statequeue);
12813+
12814+ rsb = lkb->lkb_resource;
12815+			if (lkb->lkb_status == GDLM_LKSTS_CONVERT)
12816+				remove_from_deadlockqueue(lkb);
12817+
12818+			/* clear only after the status check above */
12819+			lkb->lkb_status = 0;
12820+
12821+ release_lkb(ls, lkb);
b7b72b66 12822+ release_rsb_locked(rsb);
12823+ count++;
12824+ }
12825+ }
12826+
12827+ return count;
12828+}
12829+
12830+/*
12831+ * Go through local restbl and for each rsb we're master of, clear out any
12832+ * lkb's held by departed nodes.
12833+ */
12834+
b7b72b66 12835+int restbl_lkb_purge(struct dlm_ls *ls)
12836+{
12837+ struct list_head *tmp2, *safe2;
12838+ int count = 0;
b7b72b66 12839+ struct dlm_rsb *rootrsb, *safe, *rsb;
12840+
12841+ log_all(ls, "purge locks of departed nodes");
b7b72b66 12842+ down_write(&ls->ls_root_lock);
12843+
12844+ list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
12845+
12846+ if (rootrsb->res_nodeid)
12847+ continue;
12848+
12849+ hold_rsb(rootrsb);
12850+ down_write(&rootrsb->res_lock);
12851+
12852+ /* This traverses the subreslist in reverse order so we purge
12853+ * the children before their parents. */
12854+
12855+ for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12856+ tmp2 != &rootrsb->res_subreslist;
12857+ tmp2 = safe2, safe2 = safe2->prev) {
b7b72b66 12858+ rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist);
12859+
12860+ hold_rsb(rsb);
12861+ purge_queue(ls, &rsb->res_grantqueue);
12862+ purge_queue(ls, &rsb->res_convertqueue);
12863+ purge_queue(ls, &rsb->res_waitqueue);
b7b72b66 12864+ release_rsb_locked(rsb);
12865+ }
12866+ count += purge_queue(ls, &rootrsb->res_grantqueue);
12867+ count += purge_queue(ls, &rootrsb->res_convertqueue);
12868+ count += purge_queue(ls, &rootrsb->res_waitqueue);
12869+
12870+ up_write(&rootrsb->res_lock);
b7b72b66 12871+ release_rsb_locked(rootrsb);
12872+ }
12873+
b7b72b66 12874+ up_write(&ls->ls_root_lock);
12875+ log_all(ls, "purged %d locks", count);
12876+
12877+ return 0;
12878+}
12879+
12880+/*
12881+ * Grant any locks that have become grantable after a purge
12882+ */
12883+
b7b72b66 12884+int restbl_grant_after_purge(struct dlm_ls *ls)
c1c6733f 12885+{
b7b72b66 12886+ struct dlm_rsb *root, *rsb, *safe;
12887+ int error = 0;
12888+
b7b72b66 12889+ down_read(&ls->ls_root_lock);
12890+
12891+ list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12892+ /* only the rsb master grants locks */
12893+ if (root->res_nodeid)
12894+ continue;
12895+
12896+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12897+ log_debug(ls, "restbl_grant_after_purge aborted");
12898+ error = -EINTR;
b7b72b66 12899+ up_read(&ls->ls_root_lock);
12900+ goto out;
12901+ }
12902+
12903+ down_write(&root->res_lock);
12904+ grant_pending_locks(root);
12905+ up_write(&root->res_lock);
12906+
12907+ list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12908+ down_write(&rsb->res_lock);
12909+ grant_pending_locks(rsb);
12910+ up_write(&rsb->res_lock);
12911+ }
12912+ }
b7b72b66 12913+ up_read(&ls->ls_root_lock);
12914+ wake_astd();
12915+ out:
12916+ return error;
12917+}
12918+
12919+/*
12920+ * Set the lock master for all LKBs in a lock queue
12921+ */
12922+
12923+static void set_lock_master(struct list_head *queue, int nodeid)
12924+{
b7b72b66 12925+ struct dlm_lkb *lkb;
c1c6733f
AM
12926+
12927+ list_for_each_entry(lkb, queue, lkb_statequeue) {
12928+		/* Don't muck around with pre-existing sublocks */
12929+ if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12930+ lkb->lkb_nodeid = nodeid;
12931+ }
12932+}
12933+
b7b72b66 12934+static void set_master_lkbs(struct dlm_rsb *rsb)
12935+{
12936+ set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12937+ set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12938+ set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12939+}
12940+
12941+/*
12942+ * This rsb struct is now the master so it is responsible for keeping the
12943+ * latest rsb. Find if any current lkb's have an up to date copy of the lvb to
12944+ * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
12945+ * this rsb in deserialise_lkb.
12946+ */
12947+
b7b72b66 12948+static void set_rsb_lvb(struct dlm_rsb *rsb)
c1c6733f 12949+{
b7b72b66 12950+ struct dlm_lkb *lkb;
12951+
12952+ list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12953+
12954+ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12955+ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12956+ (lkb->lkb_grmode > DLM_LOCK_NL))
12957+ {
12958+ if (!rsb->res_lvbptr)
12959+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12960+
12961+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12962+ return;
12963+ }
12964+ }
12965+
12966+ list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12967+
12968+ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12969+ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12970+ (lkb->lkb_grmode > DLM_LOCK_NL))
12971+ {
12972+ if (!rsb->res_lvbptr)
12973+ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12974+
12975+ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12976+ return;
12977+ }
12978+ }
12979+}
12980+
12981+/*
12982+ * Propagate the new master nodeid to locks, subrsbs, sublocks.
12983+ * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12984+ */
12985+
b7b72b66 12986+static void set_new_master(struct dlm_rsb *rsb, uint32_t nodeid)
c1c6733f 12987+{
b7b72b66 12988+ struct dlm_rsb *subrsb;
12989+
12990+ down_write(&rsb->res_lock);
12991+
12992+ if (nodeid == our_nodeid()) {
12993+ set_bit(RESFL_MASTER, &rsb->res_flags);
12994+ rsb->res_nodeid = 0;
12995+ set_rsb_lvb(rsb);
12996+ } else
12997+ rsb->res_nodeid = nodeid;
12998+
12999+ set_master_lkbs(rsb);
13000+
13001+ list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
13002+ subrsb->res_nodeid = rsb->res_nodeid;
13003+ set_master_lkbs(subrsb);
13004+ }
13005+
13006+ up_write(&rsb->res_lock);
13007+
13008+ set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
13009+}
13010+
13011+/*
13012+ * The recover_list contains all the rsb's for which we've requested the new
13013+ * master nodeid. As replies are returned from the resource directories the
13014+ * rsb's are removed from the list. When the list is empty we're done.
13015+ *
13016+ * The recover_list is later similarly used for all rsb's for which we've sent
13017+ * new lkb's and need to receive new corresponding lkid's.
13018+ */
13019+
b7b72b66 13020+int recover_list_empty(struct dlm_ls *ls)
13021+{
13022+ int empty;
13023+
13024+ spin_lock(&ls->ls_recover_list_lock);
13025+ empty = list_empty(&ls->ls_recover_list);
13026+ spin_unlock(&ls->ls_recover_list_lock);
13027+
13028+ return empty;
13029+}
13030+
b7b72b66 13031+int recover_list_count(struct dlm_ls *ls)
13032+{
13033+ int count;
13034+
13035+ spin_lock(&ls->ls_recover_list_lock);
13036+ count = ls->ls_recover_list_count;
13037+ spin_unlock(&ls->ls_recover_list_lock);
13038+
13039+ return count;
13040+}
13041+
b7b72b66 13042+void recover_list_add(struct dlm_rsb *rsb)
c1c6733f 13043+{
b7b72b66 13044+ struct dlm_ls *ls = rsb->res_ls;
13045+
13046+ spin_lock(&ls->ls_recover_list_lock);
13047+ if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
13048+ list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
13049+ ls->ls_recover_list_count++;
13050+ hold_rsb(rsb);
13051+ }
13052+ spin_unlock(&ls->ls_recover_list_lock);
13053+}
13054+
b7b72b66 13055+void recover_list_del(struct dlm_rsb *rsb)
c1c6733f 13056+{
b7b72b66 13057+ struct dlm_ls *ls = rsb->res_ls;
13058+
13059+ spin_lock(&ls->ls_recover_list_lock);
13060+ clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
13061+ list_del(&rsb->res_recover_list);
13062+ ls->ls_recover_list_count--;
13063+ spin_unlock(&ls->ls_recover_list_lock);
13064+
13065+ release_rsb(rsb);
13066+}
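
The comment above describes the lifecycle; a condensed sketch of how the three pieces fit together during remastering (it mirrors rsb_master_lookup() and restbl_rsb_update_recv() below, with error handling omitted):

	/* Sketch: the request side pins the rsb and sends the lookup ... */
	recover_list_add(rsb);
	rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER, rc, 0);

	/* ... the reply handler records the master and unpins it ... */
	set_new_master(rsb, be32_to_cpu(be_nodeid));
	recover_list_del(rsb);

	/* ... and the recovery thread waits for the list to drain. */
	error = dlm_wait_function(ls, &recover_list_empty);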
13067+
b7b72b66 13068+static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid)
c1c6733f 13069+{
b7b72b66 13070+ struct dlm_rsb *rsb = NULL;
13071+
13072+ spin_lock(&ls->ls_recover_list_lock);
13073+
13074+ list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
13075+ if (rsb->res_recover_msgid == msgid)
13076+ goto rec_found;
13077+ }
13078+ rsb = NULL;
13079+
13080+ rec_found:
13081+ spin_unlock(&ls->ls_recover_list_lock);
13082+ return rsb;
13083+}
13084+
b7b72b66 13085+static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc)
c1c6733f 13086+{
13087+ struct dlm_ls *ls = rsb->res_ls;
13088+ uint32_t dir_nodeid, r_nodeid;
13089+ int error;
13090+
13091+ dir_nodeid = get_directory_nodeid(rsb);
13092+
13093+ if (dir_nodeid == our_nodeid()) {
13094+ error = dlm_dir_lookup(ls, dir_nodeid, rsb->res_name,
13095+ rsb->res_length, &r_nodeid);
13096+ if (error == -EEXIST) {
13097+ log_all(ls, "rsb_master_lookup %u EEXIST %s",
13098+ r_nodeid, rsb->res_name);
13099+ } else if (error)
13100+ goto fail;
13101+
b7b72b66 13102+ set_new_master(rsb, r_nodeid);
13103+ } else {
13104+		/* As we are the only thread doing recovery this
13105+		   should be safe. If not, then we need to use a
13106+		   different ID somehow. We must set it in the RSB before
13107+		   rcom_send_message completes because we may get a reply
13108+		   quite quickly. */
13109+ rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
13110+
13111+ recover_list_add(rsb);
13112+
13113+ memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
13114+ rc->rc_datalen = rsb->res_length;
13115+
13116+ error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
13117+ rc, 0);
13118+ if (error)
13119+ goto fail;
13120+ }
13121+
b7b72b66 13122+ fail:
13123+ return error;
13124+}
13125+
13126+static int needs_update(struct dlm_ls *ls, struct dlm_rsb *r)
13127+{
13128+ if (!r->res_nodeid)
13129+ return FALSE;
13130+
13131+ if (r->res_nodeid == -1)
13132+ return FALSE;
13133+
13134+ if (in_nodes_gone(ls, r->res_nodeid))
13135+ return TRUE;
13136+
13137+ return FALSE;
13138+}
13139+
13140+/*
13141+ * Go through local root resources and for each rsb which has a master which
13142+ * has departed, get the new master nodeid from the resdir. The resdir will
13143+ * assign mastery to the first node to look up the new master. That means
13144+ * we'll discover in this lookup if we're the new master of any rsb's.
13145+ *
13146+ * We fire off all the resdir requests individually and asynchronously to the
13147+ * correct resdir node. The replies are processed in rsb_master_recv().
13148+ */
13149+
b7b72b66 13150+int restbl_rsb_update(struct dlm_ls *ls)
c1c6733f 13151+{
13152+ struct dlm_rsb *rsb, *safe;
13153+ struct dlm_rcom *rc;
13154+ int error = -ENOMEM;
13155+ int count = 0;
13156+
13157+ log_all(ls, "update remastered resources");
13158+
13159+ rc = allocate_rcom_buffer(ls);
13160+ if (!rc)
13161+ goto out;
13162+
b7b72b66 13163+ down_read(&ls->ls_root_lock);
c1c6733f 13164+
13165+ list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
13166+ error = dlm_recovery_stopped(ls);
13167+ if (error) {
13168+ up_read(&ls->ls_root_lock);
c1c6733f 13169+ goto out_free;
b7b72b66 13170+ }
c1c6733f 13171+
b7b72b66 13172+ if (needs_update(ls, rsb)) {
c1c6733f 13173+ error = rsb_master_lookup(rsb, rc);
13174+ if (error) {
13175+ up_read(&ls->ls_root_lock);
c1c6733f 13176+ goto out_free;
b7b72b66 13177+ }
13178+ count++;
13179+ }
13180+ }
b7b72b66 13181+ up_read(&ls->ls_root_lock);
c1c6733f 13182+
b7b72b66 13183+ error = dlm_wait_function(ls, &recover_list_empty);
13184+
13185+ log_all(ls, "updated %d resources", count);
b7b72b66 13186+ out_free:
c1c6733f 13187+ free_rcom_buffer(rc);
b7b72b66 13188+ out:
13189+ return error;
13190+}
13191+
13192+int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf,
13193+ int length, int msgid)
c1c6733f 13194+{
b7b72b66 13195+ struct dlm_rsb *rsb;
13196+ uint32_t be_nodeid;
13197+
13198+ rsb = recover_list_find(ls, msgid);
13199+ if (!rsb) {
13200+ log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
13201+ goto out;
13202+ }
13203+
13204+ memcpy(&be_nodeid, buf, sizeof(uint32_t));
b7b72b66 13205+ set_new_master(rsb, be32_to_cpu(be_nodeid));
13206+ recover_list_del(rsb);
13207+
13208+ if (recover_list_empty(ls))
13209+ wake_up(&ls->ls_wait_general);
13210+
b7b72b66 13211+ out:
13212+ return 0;
13213+}
13214+
13215+/*
13216+ * This function is no longer used.
13217+ */
13218+
b7b72b66 13219+int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
13220+ char *outbuf)
13221+{
13222+ char *inbufptr, *outbufptr;
13223+
13224+ /*
13225+ * The other node wants nodeids matching the resource names in inbuf.
13226+ * The resource names are packed into inbuf as
13227+ * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
13228+ * lenX bytes. Matching nodeids are packed into outbuf in order
13229+ * [nodeid1][nodeid2]...
13230+ */
13231+
13232+ inbufptr = inbuf;
13233+ outbufptr = outbuf;
13234+
13235+ while (inbufptr < inbuf + inlen) {
b7b72b66 13236+ uint32_t r_nodeid, be_nodeid;
13237+ int status;
13238+
13239+ status = dlm_dir_lookup(ls, nodeid, inbufptr + 1, *inbufptr,
13240+ &r_nodeid);
13241+ if (status != 0)
13242+ goto fail;
13243+
13244+ inbufptr += *inbufptr + 1;
13245+
b7b72b66 13246+ be_nodeid = cpu_to_be32(r_nodeid);
13247+ memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
13248+ outbufptr += sizeof(uint32_t);
13249+
13250+		/* TODO: add an assertion that outbufptr - outbuf does not exceed ... */
13251+ }
13252+
13253+ return (outbufptr - outbuf);
b7b72b66 13254+ fail:
13255+ return -1;
13256+}
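
The packed request format parsed above is simple to produce on the sending side. A sketch of the producer (pack_name() is hypothetical, not part of this patch):

	/* Sketch: append one [len][name] record to the request buffer and
	 * return the new offset. Lengths are a single byte, matching the
	 * *inbufptr parsing in bulk_master_lookup(). */
	static int pack_name(char *buf, int offset, char *name, uint8_t len)
	{
		buf[offset++] = len;
		memcpy(buf + offset, name, len);
		return offset + len;
	}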
13257diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
13258--- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 13259+++ linux-patched/cluster/dlm/recover.h 2004-11-03 11:31:56.000000000 +0800
b7b72b66 13260@@ -0,0 +1,33 @@
13261+/******************************************************************************
13262+*******************************************************************************
13263+**
13264+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13265+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13266+**
13267+** This copyrighted material is made available to anyone wishing to use,
13268+** modify, copy, or redistribute it subject to the terms and conditions
13269+** of the GNU General Public License v.2.
13270+**
13271+*******************************************************************************
13272+******************************************************************************/
13273+
13274+#ifndef __RECOVER_DOT_H__
13275+#define __RECOVER_DOT_H__
13276+
13277+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls));
13278+int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status);
13279+int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status);
13280+int dlm_recovery_stopped(struct dlm_ls *ls);
13281+int recover_list_empty(struct dlm_ls *ls);
13282+int recover_list_count(struct dlm_ls *ls);
13283+void recover_list_add(struct dlm_rsb *rsb);
13284+void recover_list_del(struct dlm_rsb *rsb);
13285+int restbl_lkb_purge(struct dlm_ls *ls);
13286+int restbl_grant_after_purge(struct dlm_ls *ls);
13287+int restbl_rsb_update(struct dlm_ls *ls);
13288+int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf, int len,
c1c6733f 13289+ int msgid);
b7b72b66 13290+int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
13291+ char *outbuf);
13292+
13293+#endif /* __RECOVER_DOT_H__ */
13294diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
13295--- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 13296+++ linux-patched/cluster/dlm/recoverd.c 2004-11-03 11:31:56.000000000 +0800
c783755a 13297@@ -0,0 +1,713 @@
13298+/******************************************************************************
13299+*******************************************************************************
13300+**
13301+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13302+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
b7b72b66 13303+**
13304+** This copyrighted material is made available to anyone wishing to use,
13305+** modify, copy, or redistribute it subject to the terms and conditions
13306+** of the GNU General Public License v.2.
13307+**
13308+*******************************************************************************
13309+******************************************************************************/
13310+
13311+#include "dlm_internal.h"
13312+#include "nodes.h"
13313+#include "dir.h"
13314+#include "ast.h"
13315+#include "recover.h"
13316+#include "lockspace.h"
13317+#include "lowcomms.h"
13318+#include "lockqueue.h"
13319+#include "lkb.h"
13320+#include "rebuild.h"
13321+
b7b72b66 13322+/*
13323+ * next_move actions
13324+ */
13325+
13326+#define DO_STOP (1)
13327+#define DO_START (2)
13328+#define DO_FINISH (3)
13329+#define DO_FINISH_STOP (4)
13330+#define DO_FINISH_START (5)
13331+
13332+/*
13333+ * Queue of lockspaces (dlm_recover structs) which need to be
13334+ * started/recovered
13335+ */
13336+
b7b72b66 13337+static int enable_locking(struct dlm_ls *ls, int event_id)
13338+{
13339+ int error = 0;
13340+
13341+ spin_lock(&ls->ls_recover_lock);
13342+ if (ls->ls_last_stop < event_id) {
13343+ set_bit(LSFL_LS_RUN, &ls->ls_flags);
13344+ up_write(&ls->ls_in_recovery);
13345+ } else {
13346+ error = -EINTR;
13347+ log_debug(ls, "enable_locking: abort %d", event_id);
13348+ }
13349+ spin_unlock(&ls->ls_recover_lock);
13350+ return error;
13351+}
13352+
b7b72b66 13353+static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv)
13354+{
13355+ int error;
13356+
b7b72b66 13357+ log_all(ls, "recover event %u (first)", rv->event_id);
13358+
13359+ kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
13360+
b7b72b66 13361+ error = ls_nodes_init(ls, rv);
13362+ if (error) {
13363+ log_error(ls, "nodes_init failed %d", error);
13364+ goto out;
13365+ }
13366+
b7b72b66 13367+ error = dlm_dir_rebuild_local(ls);
c1c6733f 13368+ if (error) {
b7b72b66 13369+ log_error(ls, "dlm_dir_rebuild_local failed %d", error);
13370+ goto out;
13371+ }
13372+
b7b72b66 13373+ error = dlm_dir_rebuild_wait(ls);
c1c6733f 13374+ if (error) {
b7b72b66 13375+ log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
13376+ goto out;
13377+ }
13378+
13379+ log_all(ls, "recover event %u done", rv->event_id);
13380+ kcl_start_done(ls->ls_local_id, rv->event_id);
c1c6733f 13381+
c783755a 13382+ out:
13383+ return error;
13384+}
13385+
b7b72b66 13386+/*
13387+ * We are given here a new group of nodes which are in the lockspace. We first
13388+ * figure out the differences in ls membership from when we were last running.
13389+ * If nodes from before are gone, then there will be some lock recovery to do.
13390+ * If there are only nodes which have joined, then there's no lock recovery.
13391+ *
13392+ * note: cman requires an rc to finish starting on an revent (where nodes die)
13393+ * before it allows an sevent (where nodes join) to be processed. This means
13394+ * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
13395+ * joined.
13396+ */
13397+
b7b72b66 13398+static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv)
13399+{
13400+ int error, neg = 0;
13401+
b7b72b66 13402+ log_all(ls, "recover event %u", rv->event_id);
c1c6733f 13403+
13404+ /*
13405+ * this list may be left over from a previous aborted recovery
13406+ */
13407+
13408+ rebuild_freemem(ls);
13409+
13410+ /*
13411+ * Add or remove nodes from the lockspace's ls_nodes list.
13412+ */
13413+
b7b72b66 13414+ error = ls_nodes_reconfig(ls, rv, &neg);
c1c6733f
AM
13415+ if (error) {
13416+ log_error(ls, "nodes_reconfig failed %d", error);
13417+ goto fail;
13418+ }
13419+
b7b72b66 13420+ /*
c1c6733f
AM
13421+ * Rebuild our own share of the resdir by collecting from all other
13422+ * nodes rsb name/master pairs for which the name hashes to us.
13423+ */
13424+
b7b72b66 13425+ error = dlm_dir_rebuild_local(ls);
c1c6733f 13426+ if (error) {
b7b72b66 13427+ log_error(ls, "dlm_dir_rebuild_local failed %d", error);
13428+ goto fail;
13429+ }
13430+
b7b72b66 13431+ /*
c1c6733f
AM
13432+ * Purge resdir-related requests that are being held in requestqueue.
13433+ * All resdir requests from before recovery started are invalid now due
13434+ * to the resdir rebuild and will be resent by the requesting nodes.
13435+ */
13436+
13437+ purge_requestqueue(ls);
13438+ set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
13439+
b7b72b66 13440+ /*
13441+ * Wait for all nodes to complete resdir rebuild.
13442+ */
13443+
b7b72b66 13444+ error = dlm_dir_rebuild_wait(ls);
c1c6733f 13445+ if (error) {
b7b72b66 13446+ log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
13447+ goto fail;
13448+ }
13449+
b7b72b66 13450+ /*
13451+ * Mark our own lkb's waiting in the lockqueue for remote replies from
13452+ * nodes that are now departed. These will be resent to the new
13453+ * masters in resend_cluster_requests. Also mark resdir lookup
13454+ * requests for resending.
13455+ */
13456+
13457+ lockqueue_lkb_mark(ls);
13458+
b7b72b66 13459+ error = dlm_recovery_stopped(ls);
13460+ if (error)
13461+ goto fail;
13462+
13463+ if (neg) {
b7b72b66 13464+ /*
13465+ * Clear lkb's for departed nodes. This can't fail since it
13466+ * doesn't involve communicating with other nodes.
13467+ */
13468+
c1c6733f 13469+ restbl_lkb_purge(ls);
c1c6733f 13470+
b7b72b66 13471+ /*
13472+ * Get new master id's for rsb's of departed nodes. This fails
13473+ * if we can't communicate with other nodes.
13474+ */
13475+
13476+ error = restbl_rsb_update(ls);
13477+ if (error) {
13478+ log_error(ls, "restbl_rsb_update failed %d", error);
b7b72b66 13479+ goto fail;
13480+ }
13481+
b7b72b66 13482+ /*
13483+ * Send our lkb info to new masters. This fails if we can't
13484+ * communicate with a node.
13485+ */
13486+
13487+ error = rebuild_rsbs_send(ls);
13488+ if (error) {
13489+ log_error(ls, "rebuild_rsbs_send failed %d", error);
b7b72b66 13490+ goto fail;
c1c6733f 13491+ }
c783755a 13492+ }
13493+
13494+ clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
13495+
13496+ log_all(ls, "recover event %u done", rv->event_id);
13497+ kcl_start_done(ls->ls_local_id, rv->event_id);
13498+ return 0;
13499+
c1c6733f 13500+ fail:
b7b72b66 13501+ log_all(ls, "recover event %d error %d", rv->event_id, error);
13502+ return error;
13503+}
13504+
b7b72b66 13505+static void clear_finished_nodes(struct dlm_ls *ls, int finish_event)
c1c6733f 13506+{
b7b72b66 13507+ struct dlm_csb *csb, *safe;
c1c6733f 13508+
13509+ list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) {
13510+ if (csb->gone_event <= finish_event) {
13511+ list_del(&csb->list);
13512+ release_csb(csb);
13513+ }
13514+ }
13515+}
13516+
b7b72b66 13517+/*
13518+ * Between calls to this routine for a ls, there can be multiple stop/start
13519+ * events from cman where every start but the latest is cancelled by stops.
13520+ * There can only be a single finish from cman because every finish requires us
13521+ * to call start_done. A single finish event could be followed by multiple
13522+ * stop/start events. This routine takes any combination of events from cman
13523+ * and boils them down to one course of action.
13524+ */
13525+
13526+static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out,
13527+ int *finish_out)
c1c6733f
AM
13528+{
13529+ LIST_HEAD(events);
13530+ unsigned int cmd = 0, stop, start, finish;
13531+ unsigned int last_stop, last_start, last_finish;
b7b72b66 13532+ struct dlm_recover *rv = NULL, *start_rv = NULL;
c1c6733f 13533+
b7b72b66 13534+ /*
13535+ * Grab the current state of cman/sm events.
13536+ */
13537+
13538+ spin_lock(&ls->ls_recover_lock);
13539+
13540+ stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
13541+ start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
13542+ finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
13543+
13544+ last_stop = ls->ls_last_stop;
13545+ last_start = ls->ls_last_start;
13546+ last_finish = ls->ls_last_finish;
13547+
13548+ while (!list_empty(&ls->ls_recover)) {
13549+ rv = list_entry(ls->ls_recover.next, struct dlm_recover, list);
13550+ list_del(&rv->list);
13551+ list_add_tail(&rv->list, &events);
13552+ }
13553+
13554+ /*
13555+ * There are two cases where we need to adjust these event values:
13556+ * 1. - we get a first start
13557+ * - we get a stop
13558+ * - we process the start + stop here and notice this special case
13559+ *
13560+ * 2. - we get a first start
13561+ * - we process the start
13562+ * - we get a stop
13563+ * - we process the stop here and notice this special case
13564+ *
13565+ * In both cases, the first start we received was aborted by a
13566+ * stop before we received a finish. last_finish being zero is the
13567+ * indication that this is the "first" start, i.e. we've not yet
13568+ * finished a start; if we had, last_finish would be non-zero.
13569+ * Part of the problem arises from the fact that when we initially
13570+ * get start/stop/start, SM uses the same event id for both starts
13571+ * (since the first was cancelled).
13572+ *
13573+ * In both cases, last_start and last_stop will be equal.
13574+ * In both cases, finish=0.
13575+ * In the first case start=1 && stop=1.
13576+ * In the second case start=0 && stop=1.
13577+ *
13578+ * In both cases, we need to make adjustments to values so:
13579+ * - we process the current event (now) as a normal stop
13580+ * - the next start we receive will be processed normally
13581+ * (taking into account the assertions below)
13582+ *
13583+ * In the first case, dlm_ls_start() will have printed the
13584+ * "repeated start" warning.
13585+ *
13586+ * In the first case we need to get rid of the recover event struct.
13587+ *
13588+ * - set stop=1, start=0, finish=0 for case 4 below
13589+ * - last_stop and last_start must be set equal per the case 4 assert
13590+ * - ls_last_stop = 0 so the next start will be larger
13591+ * - ls_last_start = 0 not really necessary (avoids dlm_ls_start print)
13592+ */
b7b72b66 13593+
13594+ if (!last_finish && (last_start == last_stop)) {
13595+ log_all(ls, "move reset %u,%u,%u ids %u,%u,%u", stop,
13596+ start, finish, last_stop, last_start, last_finish);
13597+ stop = 1;
13598+ start = 0;
13599+ finish = 0;
13600+ last_stop = 0;
13601+ last_start = 0;
13602+ ls->ls_last_stop = 0;
13603+ ls->ls_last_start = 0;
13604+
13605+ while (!list_empty(&events)) {
13606+ rv = list_entry(events.next, struct dlm_recover, list);
13607+ list_del(&rv->list);
13608+ kfree(rv->nodeids);
13609+ kfree(rv);
13610+ }
13611+ }
13612+ spin_unlock(&ls->ls_recover_lock);
13613+
13614+ log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
13615+ last_stop, last_start, last_finish);
13616+
b7b72b66 13617+ /*
13618+ * Toss start events which have since been cancelled.
13619+ */
13620+
13621+ while (!list_empty(&events)) {
13622+ DLM_ASSERT(start,);
13623+ rv = list_entry(events.next, struct dlm_recover, list);
13624+ list_del(&rv->list);
13625+
13626+ if (rv->event_id <= last_stop) {
13627+ log_debug(ls, "move skip event %u", rv->event_id);
13628+ kfree(rv->nodeids);
13629+ kfree(rv);
13630+ rv = NULL;
c1c6733f 13631+ } else {
13632+ log_debug(ls, "move use event %u", rv->event_id);
13633+ DLM_ASSERT(!start_rv,);
13634+ start_rv = rv;
13635+ }
13636+ }
13637+
b7b72b66 13638+ /*
13639+ * Eight possible combinations of events.
13640+ */
13641+
13642+ /* 0 */
13643+ if (!stop && !start && !finish) {
b7b72b66 13644+ DLM_ASSERT(!start_rv,);
13645+ cmd = 0;
13646+ goto out;
13647+ }
13648+
13649+ /* 1 */
13650+ if (!stop && !start && finish) {
13651+ DLM_ASSERT(!start_rv,);
13652+ DLM_ASSERT(last_start > last_stop,);
13653+ DLM_ASSERT(last_finish == last_start,);
13654+ cmd = DO_FINISH;
13655+ *finish_out = last_finish;
13656+ goto out;
13657+ }
13658+
13659+ /* 2 */
13660+ if (!stop && start && !finish) {
13661+ DLM_ASSERT(start_rv,);
13662+ DLM_ASSERT(last_start > last_stop,);
c1c6733f 13663+ cmd = DO_START;
b7b72b66 13664+ *rv_out = start_rv;
13665+ goto out;
13666+ }
13667+
13668+ /* 3 */
13669+ if (!stop && start && finish) {
b7b72b66 13670+ DLM_ASSERT(0, printk("finish and start with no stop\n"););
13671+ }
13672+
13673+ /* 4 */
13674+ if (stop && !start && !finish) {
13675+ DLM_ASSERT(!start_rv,);
13676+ DLM_ASSERT(last_start == last_stop,);
c1c6733f
AM
13677+ cmd = DO_STOP;
13678+ goto out;
13679+ }
13680+
13681+ /* 5 */
13682+ if (stop && !start && finish) {
13683+ DLM_ASSERT(!start_rv,);
13684+ DLM_ASSERT(last_finish == last_start,);
13685+ DLM_ASSERT(last_stop == last_start,);
13686+ cmd = DO_FINISH_STOP;
13687+ *finish_out = last_finish;
13688+ goto out;
13689+ }
13690+
13691+ /* 6 */
13692+ if (stop && start && !finish) {
13693+ if (start_rv) {
13694+ DLM_ASSERT(last_start > last_stop,);
c1c6733f 13695+ cmd = DO_START;
b7b72b66 13696+ *rv_out = start_rv;
c1c6733f 13697+ } else {
b7b72b66 13698+ DLM_ASSERT(last_stop == last_start,);
13699+ cmd = DO_STOP;
13700+ }
13701+ goto out;
13702+ }
13703+
13704+ /* 7 */
13705+ if (stop && start && finish) {
13706+ if (start_rv) {
13707+ DLM_ASSERT(last_start > last_stop,);
13708+ DLM_ASSERT(last_start > last_finish,);
13709+ cmd = DO_FINISH_START;
13710+ *finish_out = last_finish;
b7b72b66 13711+ *rv_out = start_rv;
c1c6733f 13712+ } else {
13713+ DLM_ASSERT(last_start == last_stop,);
13714+ DLM_ASSERT(last_start > last_finish,);
13715+ cmd = DO_FINISH_STOP;
13716+ *finish_out = last_finish;
13717+ }
13718+ goto out;
13719+ }
13720+
c783755a 13721+ out:
13722+ return cmd;
13723+}
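
For reference, a condensed view of the eight combinations handled above (a restatement of the code, not an addition to it):

	stop start finish   action
	 0    0     0       none
	 0    0     1       DO_FINISH
	 0    1     0       DO_START
	 0    1     1       invalid (asserts)
	 1    0     0       DO_STOP
	 1    0     1       DO_FINISH_STOP
	 1    1     0       DO_START if a live start_rv remains, else DO_STOP
	 1    1     1       DO_FINISH_START if a live start_rv remains, else DO_FINISH_STOP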
13724+
b7b72b66 13725+/*
13726+ * This function decides what to do given every combination of current
13727+ * lockspace state and next lockspace state.
13728+ */
13729+
b7b72b66 13730+static void do_ls_recovery(struct dlm_ls *ls)
c1c6733f 13731+{
b7b72b66 13732+ struct dlm_recover *rv = NULL;
13733+ int error, cur_state, next_state = 0, do_now, finish_event = 0;
13734+
b7b72b66 13735+ do_now = next_move(ls, &rv, &finish_event);
13736+ if (!do_now)
13737+ goto out;
13738+
13739+ cur_state = ls->ls_state;
13740+ next_state = 0;
13741+
b7b72b66 13742+ DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
13743+ log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
13744+
b7b72b66 13745+ /*
13746+ * LSST_CLEAR - we're not in any recovery state. We can get a stop or
13747+ * a stop and start which equates with a START.
13748+ */
13749+
13750+ if (cur_state == LSST_CLEAR) {
13751+ switch (do_now) {
13752+ case DO_STOP:
13753+ next_state = LSST_WAIT_START;
13754+ break;
13755+
13756+ case DO_START:
b7b72b66 13757+ error = ls_reconfig(ls, rv);
13758+ if (error)
13759+ next_state = LSST_WAIT_START;
13760+ else
13761+ next_state = LSST_RECONFIG_DONE;
13762+ break;
13763+
13764+ case DO_FINISH: /* invalid */
13765+ case DO_FINISH_STOP: /* invalid */
13766+ case DO_FINISH_START: /* invalid */
13767+ default:
b7b72b66 13768+ DLM_ASSERT(0,);
13769+ }
13770+ goto out;
13771+ }
13772+
b7b72b66 13773+ /*
13774+ * LSST_WAIT_START - we're not running because of getting a stop or
13775+ * failing a start. We wait in this state for another stop/start or
13776+ * just the next start to begin another reconfig attempt.
13777+ */
13778+
13779+ if (cur_state == LSST_WAIT_START) {
13780+ switch (do_now) {
13781+ case DO_STOP:
13782+ break;
13783+
13784+ case DO_START:
b7b72b66 13785+ error = ls_reconfig(ls, rv);
13786+ if (error)
13787+ next_state = LSST_WAIT_START;
13788+ else
13789+ next_state = LSST_RECONFIG_DONE;
13790+ break;
13791+
13792+ case DO_FINISH: /* invalid */
13793+ case DO_FINISH_STOP: /* invalid */
13794+ case DO_FINISH_START: /* invalid */
13795+ default:
b7b72b66 13796+ DLM_ASSERT(0,);
13797+ }
13798+ goto out;
13799+ }
13800+
b7b72b66 13801+ /*
13802+ * LSST_RECONFIG_DONE - we entered this state after successfully
13803+ * completing ls_reconfig and calling kcl_start_done. We expect to get
13804+ * a finish if everything goes ok. A finish could be followed by stop
13805+ * or stop/start before we get here to check it. Or a finish may never
13806+ * happen, only stop or stop/start.
13807+ */
13808+
13809+ if (cur_state == LSST_RECONFIG_DONE) {
13810+ switch (do_now) {
13811+ case DO_FINISH:
13812+ rebuild_freemem(ls);
13813+
13814+ clear_finished_nodes(ls, finish_event);
13815+ next_state = LSST_CLEAR;
13816+
13817+ error = enable_locking(ls, finish_event);
13818+ if (error)
13819+ break;
13820+
13821+ error = process_requestqueue(ls);
13822+ if (error)
13823+ break;
13824+
13825+ error = resend_cluster_requests(ls);
13826+ if (error)
13827+ break;
13828+
13829+ restbl_grant_after_purge(ls);
13830+
13831+ log_all(ls, "recover event %u finished", finish_event);
13832+ break;
13833+
13834+ case DO_STOP:
13835+ next_state = LSST_WAIT_START;
13836+ break;
13837+
13838+ case DO_FINISH_STOP:
13839+ clear_finished_nodes(ls, finish_event);
13840+ next_state = LSST_WAIT_START;
13841+ break;
13842+
13843+ case DO_FINISH_START:
13844+ clear_finished_nodes(ls, finish_event);
13845+ /* fall into DO_START */
13846+
13847+ case DO_START:
b7b72b66 13848+ error = ls_reconfig(ls, rv);
13849+ if (error)
13850+ next_state = LSST_WAIT_START;
13851+ else
13852+ next_state = LSST_RECONFIG_DONE;
13853+ break;
13854+
13855+ default:
b7b72b66 13856+ DLM_ASSERT(0,);
13857+ }
13858+ goto out;
13859+ }
13860+
b7b72b66 13861+ /*
c1c6733f
AM
13862+ * LSST_INIT - state after ls is created and before it has been
13863+ * started. A start operation will cause the ls to be started for the
13864+ * first time. A failed start will cause to just wait in INIT for
13865+ * another stop/start.
13866+ */
13867+
13868+ if (cur_state == LSST_INIT) {
13869+ switch (do_now) {
13870+ case DO_START:
b7b72b66 13871+ error = ls_first_start(ls, rv);
13872+ if (!error)
13873+ next_state = LSST_INIT_DONE;
13874+ break;
13875+
13876+ case DO_STOP:
13877+ break;
13878+
13879+ case DO_FINISH: /* invalid */
13880+ case DO_FINISH_STOP: /* invalid */
13881+ case DO_FINISH_START: /* invalid */
13882+ default:
b7b72b66 13883+ DLM_ASSERT(0,);
13884+ }
13885+ goto out;
13886+ }
13887+
b7b72b66 13888+ /*
13889+ * LSST_INIT_DONE - after the first start operation is completed
13890+ * successfully and kcl_start_done() called. If there are no errors, a
13891+ * finish will arrive next and we'll move to LSST_CLEAR.
13892+ */
13893+
13894+ if (cur_state == LSST_INIT_DONE) {
13895+ switch (do_now) {
13896+ case DO_STOP:
13897+ case DO_FINISH_STOP:
13898+ next_state = LSST_WAIT_START;
13899+ break;
13900+
13901+ case DO_START:
13902+ case DO_FINISH_START:
b7b72b66 13903+ error = ls_reconfig(ls, rv);
13904+ if (error)
13905+ next_state = LSST_WAIT_START;
13906+ else
13907+ next_state = LSST_RECONFIG_DONE;
13908+ break;
13909+
13910+ case DO_FINISH:
13911+ next_state = LSST_CLEAR;
c783755a 13912+
c1c6733f 13913+ enable_locking(ls, finish_event);
13914+
13915+ process_requestqueue(ls);
13916+
13917+ log_all(ls, "recover event %u finished", finish_event);
13918+ break;
13919+
13920+ default:
b7b72b66 13921+ DLM_ASSERT(0,);
13922+ }
13923+ goto out;
13924+ }
13925+
c783755a 13926+ out:
13927+ if (next_state)
13928+ ls->ls_state = next_state;
13929+
13930+ if (rv) {
13931+ kfree(rv->nodeids);
13932+ kfree(rv);
13933+ }
13934+}
13935+
b7b72b66 13936+int dlm_recoverd(void *arg)
c1c6733f 13937+{
b7b72b66 13938+ struct dlm_ls *ls = arg;
c1c6733f 13939+
b7b72b66 13940+ hold_lockspace(ls);
c1c6733f 13941+
c783755a 13942+ for (;;) {
13943+ set_current_state(TASK_INTERRUPTIBLE);
13944+ if (!test_bit(LSFL_WORK, &ls->ls_flags))
13945+ schedule();
13946+ set_current_state(TASK_RUNNING);
c1c6733f 13947+
13948+ if (test_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags)) {
13949+ down(&ls->ls_recoverd_lock);
13950+ ls->ls_recoverd_task = NULL;
13951+ up(&ls->ls_recoverd_lock);
13952+ goto out;
13953+ }
13954+
b7b72b66 13955+ if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) {
c1c6733f 13956+ do_ls_recovery(ls);
13957+
13958+ down(&ls->ls_recoverd_lock);
13959+ if (ls->ls_state == LSST_CLEAR &&
13960+ !test_bit(LSFL_WORK, &ls->ls_flags)) {
13961+ ls->ls_recoverd_task = NULL;
13962+ up(&ls->ls_recoverd_lock);
13963+ goto out;
13964+ }
13965+ up(&ls->ls_recoverd_lock);
b7b72b66 13966+ }
13967+ }
13968+
c783755a 13969+ out:
b7b72b66 13970+ put_lockspace(ls);
c1c6733f
AM
13971+ return 0;
13972+}
13973+
b7b72b66 13974+void dlm_recoverd_kick(struct dlm_ls *ls)
c1c6733f 13975+{
b7b72b66 13976+ struct task_struct *p;
c1c6733f 13977+
c783755a 13978+ down(&ls->ls_recoverd_lock);
b7b72b66 13979+ set_bit(LSFL_WORK, &ls->ls_flags);
13980+
13981+ if (!ls->ls_recoverd_task) {
d3b4771f 13982+	p = kthread_run(dlm_recoverd, (void *) ls, "dlm_recoverd");
b7b72b66 13983+ if (IS_ERR(p)) {
13984+ log_error(ls, "can't start dlm_recoverd %ld",
13985+ PTR_ERR(p));
13986+ goto out;
b7b72b66 13987+ }
b7b72b66 13988+ ls->ls_recoverd_task = p;
13989+ } else
13990+ wake_up_process(ls->ls_recoverd_task);
13991+ out:
13992+ up(&ls->ls_recoverd_lock);
13993+}
c1c6733f 13994+
13995+void dlm_recoverd_stop(struct dlm_ls *ls)
13996+{
13997+ set_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags);
13998+
13999+ for (;;) {
14000+ down(&ls->ls_recoverd_lock);
14001+ if (!ls->ls_recoverd_task) {
14002+ up(&ls->ls_recoverd_lock);
14003+ break;
14004+ }
14005+ wake_up_process(ls->ls_recoverd_task);
14006+ up(&ls->ls_recoverd_lock);
14007+ msleep(100);
b7b72b66 14008+ }
c1c6733f 14009+}
c783755a 14010+
14011diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
14012--- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 14013+++ linux-patched/cluster/dlm/recoverd.h 2004-11-03 11:31:56.000000000 +0800
c783755a 14014@@ -0,0 +1,21 @@
14015+/******************************************************************************
14016+*******************************************************************************
14017+**
14018+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14019+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14020+**
14021+** This copyrighted material is made available to anyone wishing to use,
14022+** modify, copy, or redistribute it subject to the terms and conditions
14023+** of the GNU General Public License v.2.
14024+**
14025+*******************************************************************************
14026+******************************************************************************/
14027+
14028+#ifndef __RECOVERD_DOT_H__
14029+#define __RECOVERD_DOT_H__
14030+
14031+int dlm_recoverd(void *arg);
14032+void dlm_recoverd_kick(struct dlm_ls *ls);
c783755a 14033+void dlm_recoverd_stop(struct dlm_ls *ls);
14034+
14035+#endif /* __RECOVERD_DOT_H__ */
14036diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
14037--- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 14038+++ linux-patched/cluster/dlm/rsb.c 2004-11-03 11:31:56.000000000 +0800
b7b72b66 14039@@ -0,0 +1,329 @@
14040+/******************************************************************************
14041+*******************************************************************************
14042+**
14043+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14044+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14045+**
14046+** This copyrighted material is made available to anyone wishing to use,
14047+** modify, copy, or redistribute it subject to the terms and conditions
14048+** of the GNU General Public License v.2.
14049+**
14050+*******************************************************************************
14051+******************************************************************************/
14052+
14053+#include "dlm_internal.h"
14054+#include "locking.h"
14055+#include "memory.h"
14056+#include "lockqueue.h"
14057+#include "nodes.h"
14058+#include "dir.h"
14059+#include "util.h"
b7b72b66 14060+#include "rsb.h"
c1c6733f 14061+
b7b72b66
AM
14062+static struct dlm_rsb *search_hashchain(struct list_head *head,
14063+ struct dlm_rsb *parent,
14064+ char *name, int namelen)
c1c6733f 14065+{
b7b72b66 14066+ struct dlm_rsb *r;
c1c6733f
AM
14067+
14068+ list_for_each_entry(r, head, res_hashchain) {
14069+ if ((parent == r->res_parent) && (namelen == r->res_length) &&
14070+ (memcmp(name, r->res_name, namelen) == 0)) {
c1c6733f
AM
14071+ return r;
14072+ }
14073+ }
14074+
14075+ return NULL;
14076+}
14077+
14078+/*
14079+ * A way to hold an extra reference on an rsb we already have a reference
14080+ * to, so that it can't go away underneath us. Opposite of release_rsb().
14081+ */
14082+
b7b72b66 14083+void hold_rsb(struct dlm_rsb *r)
c1c6733f
AM
14084+{
14085+ atomic_inc(&r->res_ref);
14086+}
14087+
14088+/*
14089+ * release_rsb() - Decrement reference count on rsb struct. Free the rsb
14090+ * struct when there are zero references. Every lkb for the rsb adds a
14091+ * reference. When the ref count is zero there can be no more lkbs for the
14092+ * rsb, on the queues or anywhere else.
14093+ */
14094+
b7b72b66 14095+static void _release_rsb(struct dlm_rsb *r, int locked)
c1c6733f 14096+{
b7b72b66
AM
14097+ struct dlm_ls *ls = r->res_ls;
14098+ uint32_t nodeid;
c1c6733f
AM
14099+ int removed = FALSE;
14100+
b7b72b66
AM
14101+ write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
14102+ if (atomic_dec_and_test(&r->res_ref)) {
14103+ DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r););
14104+ DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r););
14105+ DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r););
c1c6733f
AM
14106+ removed = TRUE;
14107+ list_del(&r->res_hashchain);
14108+ }
b7b72b66 14109+ write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
c1c6733f 14110+
b7b72b66
AM
14111+ if (!removed)
14112+ return;
c1c6733f 14113+
b7b72b66
AM
14114+ if (!locked)
14115+ down_write(&ls->ls_root_lock);
14116+ if (r->res_parent)
14117+ list_del(&r->res_subreslist);
14118+ else
14119+ list_del(&r->res_rootlist);
14120+ if (!locked)
14121+ up_write(&ls->ls_root_lock);
14122+
14123+ if (r->res_parent || !test_bit(RESFL_MASTER, &r->res_flags))
14124+ goto out;
c1c6733f 14125+
b7b72b66 14126+ nodeid = get_directory_nodeid(r);
c1c6733f 14127+
b7b72b66
AM
14128+ if (nodeid != our_nodeid())
14129+ remote_remove_direntry(ls, nodeid, r->res_name, r->res_length);
14130+ else
14131+ dlm_dir_remove(ls, nodeid, r->res_name, r->res_length);
14132+ out:
14133+ if (r->res_lvbptr)
14134+ free_lvb(r->res_lvbptr);
14135+
14136+ free_rsb(r);
14137+}
14138+
14139+void release_rsb(struct dlm_rsb *r)
14140+{
14141+ _release_rsb(r, 0);
14142+}
14143+
14144+void release_rsb_locked(struct dlm_rsb *r)
14145+{
14146+ _release_rsb(r, 1);
14147+}
14148+
14149+struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb)
14150+{
14151+ struct dlm_rsb *r = lkb->lkb_resource;
14152+ return r;
c1c6733f
AM
14153+}
14154+
14155+/*
14156+ * find_rsb() - Get an rsb struct, or create one if it doesn't exist.
14157+ * If the rsb exists, its ref count is incremented by this function. If it
14158+ * doesn't exist, it's created with a ref count of one.
14159+ */
14160+
b7b72b66
AM
14161+int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, int len,
14162+ int flags, struct dlm_rsb **rp)
c1c6733f 14163+{
b7b72b66
AM
14164+ uint32_t bucket;
14165+ struct dlm_rsb *r, *tmp;
c1c6733f
AM
14166+ int error = -ENOMEM;
14167+
b7b72b66 14168+ DLM_ASSERT(len <= DLM_RESNAME_MAXLEN,);
c1c6733f 14169+
b7b72b66
AM
14170+ bucket = dlm_hash(name, len);
14171+ bucket &= (ls->ls_rsbtbl_size - 1);
c1c6733f 14172+
b7b72b66
AM
14173+ read_lock(&ls->ls_rsbtbl[bucket].lock);
14174+ r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len);
14175+ if (r) {
14176+ if (r->res_nodeid != 0 && (flags & MASTER))
14177+ r = NULL;
14178+ else
14179+ atomic_inc(&r->res_ref);
14180+ }
14181+ read_unlock(&ls->ls_rsbtbl[bucket].lock);
c1c6733f
AM
14182+
14183+ if (r)
14184+ goto out_set;
b7b72b66
AM
14185+
14186+ /* Sublocks are always created; top-level rsbs need the CREATE flag */
14187+ if (!(flags & CREATE) && !parent) {
c1c6733f
AM
14188+ *rp = NULL;
14189+ goto out;
14190+ }
14191+
b7b72b66 14192+ r = allocate_rsb(ls, len);
c1c6733f
AM
14193+ if (!r)
14194+ goto fail;
14195+
14196+ INIT_LIST_HEAD(&r->res_subreslist);
14197+ INIT_LIST_HEAD(&r->res_grantqueue);
14198+ INIT_LIST_HEAD(&r->res_convertqueue);
14199+ INIT_LIST_HEAD(&r->res_waitqueue);
14200+
b7b72b66
AM
14201+ memcpy(r->res_name, name, len);
14202+ r->res_length = len;
c1c6733f
AM
14203+ r->res_ls = ls;
14204+ init_rwsem(&r->res_lock);
14205+ atomic_set(&r->res_ref, 1);
b7b72b66 14206+ r->res_bucket = bucket;
c1c6733f
AM
14207+
14208+ if (parent) {
14209+ r->res_parent = parent;
14210+ r->res_depth = parent->res_depth + 1;
14211+ r->res_root = parent->res_root;
14212+ r->res_nodeid = parent->res_nodeid;
14213+ } else {
14214+ r->res_parent = NULL;
14215+ r->res_depth = 1;
14216+ r->res_root = r;
14217+ r->res_nodeid = -1;
14218+ }
14219+
b7b72b66
AM
14220+ write_lock(&ls->ls_rsbtbl[bucket].lock);
14221+ tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len);
c1c6733f 14222+ if (tmp) {
b7b72b66
AM
14223+ atomic_inc(&tmp->res_ref);
14224+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
c1c6733f
AM
14225+ free_rsb(r);
14226+ r = tmp;
14227+ } else {
b7b72b66
AM
14228+ list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
14229+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
c1c6733f 14230+
b7b72b66 14231+ down_write(&ls->ls_root_lock);
c1c6733f
AM
14232+ if (parent)
14233+ list_add_tail(&r->res_subreslist,
14234+ &r->res_root->res_subreslist);
14235+ else
14236+ list_add(&r->res_rootlist, &ls->ls_rootres);
b7b72b66 14237+ up_write(&ls->ls_root_lock);
c1c6733f
AM
14238+ }
14239+
14240+ out_set:
14241+ *rp = r;
14242+
14243+ out:
14244+ error = 0;
14245+
14246+ fail:
14247+ return error;
14248+}
14249+
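
Illustration (not part of the patch): the lookup/refcount contract described
above, with hypothetical example_* names.

static int example_lookup(struct dlm_ls *ls, char *name, int len)
{
	struct dlm_rsb *r;
	int error;

	/* takes a reference; CREATE makes the rsb if it doesn't exist */
	error = find_rsb(ls, NULL, name, len, CREATE, &r);
	if (error)
		return error;

	/* ... use r: enqueue lkbs, inspect queues, etc. ... */

	release_rsb(r);		/* drop the reference find_rsb() took */
	return 0;
}
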
14250+/*
14251+ * Add a LKB to a resource's grant/convert/wait queue. in order
14252+ */
14253+
14254+void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
14255+{
14256+ struct dlm_lkb *lkb;
14257+
14258+ /* The queue is kept in descending mode order: insert before the
14259+ first entry with a lower requested mode, or at the tail if there
14260+ is none (which also covers an empty queue). */
14261+ list_for_each_entry(lkb, head, lkb_statequeue) {
14262+ if (lkb->lkb_rqmode < mode) {
14263+ __list_add(new, lkb->lkb_statequeue.prev,
14264+ &lkb->lkb_statequeue);
14265+ return;
14266+ }
14267+ }
14268+ list_add_tail(new, head);
14269+}
14270+
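
For example, with the mode values from include/cluster/dlm.h below (EX=5,
PW=4, PR=3, NL=0), adding a mode-4 lkb to a queue ordered 5, 3, 0 yields
5, 4, 3, 0: the new entry goes in front of the first lower-mode entry and
behind any entries of equal mode.
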
14271+/*
14272+ * The rsb res_lock must be held in write when this function is called.
14273+ */
14274+
b7b72b66 14275+void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
c1c6733f 14276+{
b7b72b66
AM
14277+ DLM_ASSERT(!lkb->lkb_status,
14278+ print_lkb(lkb);
14279+ print_rsb(r););
c1c6733f
AM
14280+
14281+ lkb->lkb_status = type;
14282+
14283+ switch (type) {
14284+ case GDLM_LKSTS_WAITING:
b7b72b66
AM
14285+ if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE)
14286+ list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
14287+ else
14288+ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
c1c6733f
AM
14289+ break;
14290+
14291+ case GDLM_LKSTS_GRANTED:
14292+ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
14293+ lkb->lkb_grmode);
14294+ break;
14295+
14296+ case GDLM_LKSTS_CONVERT:
b7b72b66
AM
14297+ if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE)
14298+ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
c1c6733f 14299+ else
b7b72b66
AM
14300+ list_add_tail(&lkb->lkb_statequeue,
14301+ &r->res_convertqueue);
c1c6733f
AM
14302+ break;
14303+
14304+ default:
b7b72b66 14305+ DLM_ASSERT(0,);
c1c6733f
AM
14306+ }
14307+}
14308+
b7b72b66 14309+void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
c1c6733f
AM
14310+{
14311+ down_write(&r->res_lock);
14312+ lkb_enqueue(r, lkb, type);
14313+ up_write(&r->res_lock);
14314+}
14315+
14316+/*
14317+ * The rsb res_lock must be held in write when this function is called.
14318+ */
14319+
b7b72b66 14320+int lkb_dequeue(struct dlm_lkb *lkb)
c1c6733f
AM
14321+{
14322+ int status = lkb->lkb_status;
14323+
14324+ if (!status)
14325+ goto out;
14326+
14327+ lkb->lkb_status = 0;
14328+ list_del(&lkb->lkb_statequeue);
14329+
14330+ out:
14331+ return status;
14332+}
14333+
b7b72b66 14334+int res_lkb_dequeue(struct dlm_lkb *lkb)
c1c6733f
AM
14335+{
14336+ int status;
14337+
14338+ down_write(&lkb->lkb_resource->res_lock);
14339+ status = lkb_dequeue(lkb);
14340+ up_write(&lkb->lkb_resource->res_lock);
14341+
14342+ return status;
14343+}
14344+
14345+/*
14346+ * The rsb res_lock must be held in write when this function is called.
14347+ */
14348+
b7b72b66 14349+int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
c1c6733f
AM
14350+{
14351+ int status;
14352+
14353+ status = lkb_dequeue(lkb);
14354+ lkb_enqueue(r, lkb, type);
14355+
14356+ return status;
14357+}
14358+
b7b72b66 14359+int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
c1c6733f
AM
14360+{
14361+ int status;
14362+
14363+ down_write(&r->res_lock);
14364+ status = lkb_swqueue(r, lkb, type);
14365+ up_write(&r->res_lock);
14366+
14367+ return status;
14368+}
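
Illustration (not part of the patch): granting a queued conversion would
move the lkb between state queues with the helpers above, e.g.:

	/* grant a conversion: adopt the requested mode, requeue in order */
	lkb->lkb_grmode = lkb->lkb_rqmode;
	res_lkb_swqueue(r, lkb, GDLM_LKSTS_GRANTED);
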
14369diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
14370--- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 14371+++ linux-patched/cluster/dlm/rsb.h 2004-11-03 11:31:56.000000000 +0800
b7b72b66 14372@@ -0,0 +1,34 @@
c1c6733f
AM
14373+/******************************************************************************
14374+*******************************************************************************
14375+**
14376+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14377+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14378+**
14379+** This copyrighted material is made available to anyone wishing to use,
14380+** modify, copy, or redistribute it subject to the terms and conditions
14381+** of the GNU General Public License v.2.
14382+**
14383+*******************************************************************************
14384+******************************************************************************/
14385+
14386+#ifndef __RSB_DOT_H__
14387+#define __RSB_DOT_H__
14388+
b7b72b66
AM
14389+#define CREATE 1
14390+#define MASTER 2
14391+
c1c6733f 14392+void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
b7b72b66
AM
14393+void release_rsb(struct dlm_rsb *r);
14394+void release_rsb_locked(struct dlm_rsb *r);
14395+void hold_rsb(struct dlm_rsb *r);
14396+int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
14397+ int namelen, int flags, struct dlm_rsb **rp);
14398+struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb);
14399+void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14400+void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14401+int lkb_dequeue(struct dlm_lkb *lkb);
14402+int res_lkb_dequeue(struct dlm_lkb *lkb);
14403+int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14404+int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
c1c6733f
AM
14405+
14406+#endif /* __RSB_DOT_H__ */
14407diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
14408--- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 14409+++ linux-patched/cluster/dlm/util.c 2004-11-03 11:31:56.000000000 +0800
b7b72b66 14410@@ -0,0 +1,183 @@
c1c6733f
AM
14411+/******************************************************************************
14412+*******************************************************************************
14413+**
14414+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14415+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14416+**
14417+** This copyrighted material is made available to anyone wishing to use,
14418+** modify, copy, or redistribute it subject to the terms and conditions
14419+** of the GNU General Public License v.2.
14420+**
14421+*******************************************************************************
14422+******************************************************************************/
14423+
14424+#include "dlm_internal.h"
14425+
14426+static const uint32_t crc_32_tab[] = {
14427+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
14428+ 0xe963a535, 0x9e6495a3,
14429+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
14430+ 0xe7b82d07, 0x90bf1d91,
14431+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
14432+ 0xf4d4b551, 0x83d385c7,
14433+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
14434+ 0xfa0f3d63, 0x8d080df5,
14435+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
14436+ 0xd20d85fd, 0xa50ab56b,
14437+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
14438+ 0xdcd60dcf, 0xabd13d59,
14439+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
14440+ 0xcfba9599, 0xb8bda50f,
14441+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
14442+ 0xc1611dab, 0xb6662d3d,
14443+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
14444+ 0x9fbfe4a5, 0xe8b8d433,
14445+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
14446+ 0x91646c97, 0xe6635c01,
14447+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
14448+ 0x8208f4c1, 0xf50fc457,
14449+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
14450+ 0x8cd37cf3, 0xfbd44c65,
14451+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
14452+ 0xa4d1c46d, 0xd3d6f4fb,
14453+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
14454+ 0xaa0a4c5f, 0xdd0d7cc9,
14455+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
14456+ 0xb966d409, 0xce61e49f,
14457+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
14458+ 0xb7bd5c3b, 0xc0ba6cad,
14459+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
14460+ 0x04db2615, 0x73dc1683,
14461+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
14462+ 0x0a00ae27, 0x7d079eb1,
14463+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
14464+ 0x196c3671, 0x6e6b06e7,
14465+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
14466+ 0x17b7be43, 0x60b08ed5,
14467+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
14468+ 0x3fb506dd, 0x48b2364b,
14469+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
14470+ 0x316e8eef, 0x4669be79,
14471+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
14472+ 0x220216b9, 0x5505262f,
14473+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
14474+ 0x2cd99e8b, 0x5bdeae1d,
14475+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
14476+ 0x72076785, 0x05005713,
14477+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
14478+ 0x7cdcefb7, 0x0bdbdf21,
14479+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
14480+ 0x6fb077e1, 0x18b74777,
14481+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
14482+ 0x616bffd3, 0x166ccf45,
14483+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
14484+ 0x4969474d, 0x3e6e77db,
14485+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
14486+ 0x47b2cf7f, 0x30b5ffe9,
14487+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
14488+ 0x54de5729, 0x23d967bf,
14489+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
14490+ 0x5a05df1b, 0x2d02ef8d
14491+};
14492+
14493+/**
b7b72b66 14494+ * dlm_hash - hash an array of data
c1c6733f
AM
14495+ * @data: the data to be hashed
14496+ * @len: the length of data to be hashed
14497+ *
14498+ * Copied from GFS.
14499+ *
14500+ * Take some data and convert it to a 32-bit hash.
14501+ *
14502+ * The hash function is a 32-bit CRC of the data. The algorithm uses
14503+ * the crc_32_tab table above.
14504+ *
14505+ * This may not be the fastest hash function, but it does a fair bit better
14506+ * at providing uniform results than the others I've looked at. That's
14507+ * really important for efficient directories.
14508+ *
14509+ * Returns: the hash
14510+ */
14511+
b7b72b66 14512+uint32_t dlm_hash(const char *data, int len)
c1c6733f
AM
14513+{
14514+ uint32_t hash = 0xFFFFFFFF;
14515+
14516+ for (; len--; data++)
14517+ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
14518+
14519+ hash = ~hash;
14520+
14521+ return hash;
14522+}
14523+
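
Illustration (not part of the patch): within this patch the hash picks an
rsb hash-table bucket by masking, so ls_rsbtbl_size must be a power of two
(see find_rsb() in rsb.c above):

	uint32_t bucket = dlm_hash(name, len) & (ls->ls_rsbtbl_size - 1);
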
b7b72b66
AM
14524+void print_lkb(struct dlm_lkb *lkb)
14525+{
14526+ printk("dlm: lkb\n"
14527+ "id %x\n"
14528+ "remid %x\n"
14529+ "flags %x\n"
14530+ "status %x\n"
14531+ "rqmode %d\n"
14532+ "grmode %d\n"
14533+ "nodeid %d\n"
14534+ "lqstate %x\n"
14535+ "lqflags %x\n",
14536+ lkb->lkb_id,
14537+ lkb->lkb_remid,
14538+ lkb->lkb_flags,
14539+ lkb->lkb_status,
14540+ lkb->lkb_rqmode,
14541+ lkb->lkb_grmode,
14542+ lkb->lkb_nodeid,
14543+ lkb->lkb_lockqueue_state,
14544+ lkb->lkb_lockqueue_flags);
14545+}
14546+
14547+void print_rsb(struct dlm_rsb *r)
14548+{
14549+ printk("dlm: rsb\n"
14550+ "name \"%s\"\n"
14551+ "nodeid %d\n"
14552+ "flags %lx\n"
14553+ "ref %u\n",
14554+ r->res_name,
14555+ r->res_nodeid,
14556+ r->res_flags,
14557+ atomic_read(&r->res_ref));
14558+}
14559+
14560+void print_request(struct dlm_request *req)
14561+{
14562+ printk("dlm: request\n"
14563+ "rh_cmd %u\n"
14564+ "rh_lkid %x\n"
14565+ "remlkid %x\n"
14566+ "flags %x\n"
14567+ "status %u\n"
14568+ "rqmode %u\n",
14569+ req->rr_header.rh_cmd,
14570+ req->rr_header.rh_lkid,
14571+ req->rr_remlkid,
14572+ req->rr_flags,
14573+ req->rr_status,
14574+ req->rr_rqmode);
14575+}
14576+
14577+void print_reply(struct dlm_reply *rp)
14578+{
14579+ printk("dlm: reply\n"
14580+ "rh_cmd %u\n"
14581+ "rh_lkid %x\n"
14582+ "lockstate %u\n"
14583+ "nodeid %u\n"
14584+ "status %u\n"
14585+ "lkid %x\n",
14586+ rp->rl_header.rh_cmd,
14587+ rp->rl_header.rh_lkid,
14588+ rp->rl_lockstate,
14589+ rp->rl_nodeid,
14590+ rp->rl_status,
14591+ rp->rl_lkid);
c1c6733f
AM
14592+}
14593+
c1c6733f
AM
14594diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
14595--- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 14596+++ linux-patched/cluster/dlm/util.h 2004-11-03 11:31:56.000000000 +0800
b7b72b66 14597@@ -0,0 +1,24 @@
c1c6733f
AM
14598+/******************************************************************************
14599+*******************************************************************************
14600+**
14601+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14602+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14603+**
14604+** This copyrighted material is made available to anyone wishing to use,
14605+** modify, copy, or redistribute it subject to the terms and conditions
14606+** of the GNU General Public License v.2.
14607+**
14608+*******************************************************************************
14609+******************************************************************************/
14610+
14611+#ifndef __UTIL_DOT_H__
14612+#define __UTIL_DOT_H__
14613+
b7b72b66 14614+uint32_t dlm_hash(const char *data, int len);
c1c6733f 14615+
b7b72b66
AM
14616+void print_lkb(struct dlm_lkb *lkb);
14617+void print_rsb(struct dlm_rsb *r);
14618+void print_request(struct dlm_request *req);
14619+void print_reply(struct dlm_reply *rp);
c1c6733f
AM
14620+
14621+#endif
14622diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
14623--- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 14624+++ linux-patched/include/cluster/dlm.h 2004-11-03 11:31:56.000000000 +0800
c783755a 14625@@ -0,0 +1,416 @@
c1c6733f
AM
14626+/******************************************************************************
14627+*******************************************************************************
14628+**
14629+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14630+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14631+**
14632+** This copyrighted material is made available to anyone wishing to use,
14633+** modify, copy, or redistribute it subject to the terms and conditions
14634+** of the GNU General Public License v.2.
14635+**
14636+*******************************************************************************
14637+******************************************************************************/
14638+
14639+#ifndef __DLM_DOT_H__
14640+#define __DLM_DOT_H__
14641+
14642+/*
14643+ * Interface to DLM - routines and structures to use DLM lockspaces.
14644+ */
14645+
14646+/*
14647+ * Lock Modes
14648+ */
14649+
14650+#define DLM_LOCK_IV (-1) /* invalid */
14651+#define DLM_LOCK_NL (0) /* null */
14652+#define DLM_LOCK_CR (1) /* concurrent read */
14653+#define DLM_LOCK_CW (2) /* concurrent write */
14654+#define DLM_LOCK_PR (3) /* protected read */
14655+#define DLM_LOCK_PW (4) /* protected write */
14656+#define DLM_LOCK_EX (5) /* exclusive */
14657+
14658+/*
14659+ * Maximum size in bytes of a dlm_lock name
14660+ */
14661+
14662+#define DLM_RESNAME_MAXLEN (64)
14663+
14664+/*
14665+ * Size in bytes of Lock Value Block
14666+ */
14667+
14668+#define DLM_LVB_LEN (32)
14669+
14670+/*
14671+ * Flags to dlm_new_lockspace
14672+ *
14673+ * DLM_LSF_NOTIMERS
14674+ *
14675+ * Do not subject locks in this lockspace to time-outs.
c1c6733f
AM
14676+ */
14677+
14678+#define DLM_LSF_NOTIMERS (1)
14679+
14680+/*
14681+ * Flags to dlm_lock
14682+ *
14683+ * DLM_LKF_NOQUEUE
14684+ *
14685+ * Do not queue the lock request on the wait queue if it cannot be granted
14686+ * immediately. If the lock cannot be granted because of this flag, DLM will
14687+ * either return -EAGAIN from the dlm_lock call or will return 0 from
14688+ * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
14689+ *
14690+ * DLM_LKF_CONVERT
14691+ *
14692+ * Indicates a lock conversion request. For conversions the name and namelen
14693+ * are ignored and the lock ID in the LKSB is used to identify the lock.
14694+ *
14695+ * DLM_LKF_VALBLK
14696+ *
14697+ * Requests DLM to return the current contents of the lock value block in the
14698+ * lock status block. When this flag is set in a lock conversion from PW or EX
14699+ * modes, DLM assigns the value specified in the lock status block to the lock
14700+ * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
14701+ * containing application-specific information.
14702+ *
14703+ * DLM_LKF_QUECVT
14704+ *
b7b72b66
AM
14705+ * Force a conversion request to be queued, even if it is compatible with
14706+ * the granted modes of other locks on the same resource.
c1c6733f
AM
14707+ *
14708+ * DLM_LKF_CANCEL
14709+ *
14710+ * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
14711+ * previously granted mode.
14712+ *
14713+ * DLM_LKF_IVVALBLK
14714+ *
14715+ * Invalidate/clear the lock value block.
14716+ *
14717+ * DLM_LKF_CONVDEADLK
14718+ *
14719+ * The granted mode of a lock being converted (from a non-NL mode) can be
14720+ * changed to NL in the process of acquiring the requested mode to avoid
14721+ * conversion deadlock.
14722+ *
14723+ * DLM_LKF_PERSISTENT
14724+ *
14725+ * Only relevant to locks originating in userspace. Signals to the ioctl.c code
14726+ * that this lock should not be unlocked when the process exits.
14727+ *
14728+ * DLM_LKF_NODLCKWT
14729+ *
14730+ * This lock is not to be checked for conversion deadlocks.
14731+ *
14732+ * DLM_LKF_NODLCKBLK
14733+ *
14734+ * not yet implemented
14735+ *
14736+ * DLM_LKF_EXPEDITE
14737+ *
b7b72b66
AM
14738+ * Used only with new requests for NL mode locks. Tells the lock manager
14739+ * to grant the lock, ignoring other locks in convert and wait queues.
c1c6733f
AM
14740+ *
14741+ * DLM_LKF_NOQUEUEBAST
14742+ *
14743+ * Send blocking ASTs before returning -EAGAIN to the caller. It is only
14744+ * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
14745+ * NOQUEUE requests otherwise.
14746+ *
b7b72b66
AM
14747+ * DLM_LKF_HEADQUE
14748+ *
14749+ * Add a lock to the head of the convert or wait queue rather than the tail.
14750+ *
14751+ * DLM_LKF_NOORDER
14752+ *
14753+ * Disregard the standard grant order rules and grant a lock as soon as it
14754+ * is compatible with other granted locks.
c1c6733f
AM
14755+ */
14756+
14757+#define DLM_LKF_NOQUEUE (0x00000001)
14758+#define DLM_LKF_CANCEL (0x00000002)
14759+#define DLM_LKF_CONVERT (0x00000004)
14760+#define DLM_LKF_VALBLK (0x00000008)
14761+#define DLM_LKF_QUECVT (0x00000010)
14762+#define DLM_LKF_IVVALBLK (0x00000020)
14763+#define DLM_LKF_CONVDEADLK (0x00000040)
14764+#define DLM_LKF_PERSISTENT (0x00000080)
14765+#define DLM_LKF_NODLCKWT (0x00000100)
14766+#define DLM_LKF_NODLCKBLK (0x00000200)
14767+#define DLM_LKF_EXPEDITE (0x00000400)
14768+#define DLM_LKF_NOQUEUEBAST (0x00000800)
b7b72b66
AM
14769+#define DLM_LKF_HEADQUE (0x00001000)
14770+#define DLM_LKF_NOORDER (0x00002000)
c783755a 14771+#define DLM_LKF_ORPHAN (0x00004000)
c1c6733f
AM
14772+
14773+/*
b7b72b66 14774+ * Some return codes that are not in errno.h
c1c6733f
AM
14775+ */
14776+
14777+#define DLM_ECANCEL (0x10001)
14778+#define DLM_EUNLOCK (0x10002)
14779+
14780+typedef void dlm_lockspace_t;
14781+
14782+/*
14783+ * Lock range structure
14784+ */
14785+
14786+struct dlm_range {
14787+ uint64_t ra_start;
14788+ uint64_t ra_end;
14789+};
14790+
14791+/*
14792+ * Lock status block
14793+ *
14794+ * Use this structure to specify the contents of the lock value block. For a
14795+ * conversion request, this structure is used to specify the lock ID of the
14796+ * lock. DLM writes the status of the lock request and the lock ID assigned
14797+ * to the request in the lock status block.
14798+ *
14799+ * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
14800+ * It is available when dlm_lock returns.
14801+ *
14802+ * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
14803+ * shown for the DLM_LKF_VALBLK flag.
14804+ *
14805+ * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
14806+ * it was first demoted to NL to avoid conversion deadlock.
14807+ *
14808+ * sb_status: the returned status of the lock request set prior to AST
14809+ * execution. Possible return values:
14810+ *
14811+ * 0 if lock request was successful
14812+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14813+ * -ENOMEM if there is no memory to process request
14814+ * -EINVAL if there are invalid parameters
14815+ * -DLM_EUNLOCK if unlock request was successful
14816+ * -DLM_ECANCEL if the request was cancelled (see DLM_LKF_CANCEL)
14817+ */
14818+
14819+#define DLM_SBF_DEMOTED (0x01)
14820+
14821+struct dlm_lksb {
14822+ int sb_status;
14823+ uint32_t sb_lkid;
14824+ char sb_flags;
14825+ char * sb_lvbptr;
14826+};
14827+
14828+/*
b7b72b66 14829+ * These defines are the bits that make up the query code.
c1c6733f
AM
14830+ */
14831+
14832+/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
14833+ * dlm.h. Ignored for DLM_QUERY_LOCKS_ALL */
14834+#define DLM_LOCK_THIS 0x0007
14835+#define DLM_QUERY_MODE_MASK 0x0007
14836+
14837+/* Bits 3, 4, 5 bitmap of queue(s) to query */
14838+#define DLM_QUERY_QUEUE_WAIT 0x0008
14839+#define DLM_QUERY_QUEUE_CONVERT 0x0010
14840+#define DLM_QUERY_QUEUE_GRANT 0x0020
14841+#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
14842+#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
14843+
14844+/* Bit 6, Return only the information that can be established without a network
14845+ * round-trip. The caller must be aware of the implications of this. Useful for
14846+ * just getting the master node id or resource name. */
14847+#define DLM_QUERY_LOCAL 0x0040
14848+
14849+/* Bits 8 and up, query type */
14850+#define DLM_QUERY_LOCKS_HIGHER 0x0100
14851+#define DLM_QUERY_LOCKS_LOWER 0x0200
14852+#define DLM_QUERY_LOCKS_EQUAL 0x0300
14853+#define DLM_QUERY_LOCKS_BLOCKING 0x0400
14854+#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
14855+#define DLM_QUERY_LOCKS_ALL 0x0600
c783755a 14856+#define DLM_QUERY_LOCKS_ORPHAN 0x0700
c1c6733f
AM
14857+#define DLM_QUERY_MASK 0x0F00
14858+
14859+/* GRMODE is the default for mode comparisons,
14860+ RQMODE might also be handy */
14861+#define DLM_QUERY_GRMODE 0x0000
14862+#define DLM_QUERY_RQMODE 0x1000
14863+
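
Illustration (not part of the patch): a query code is built by OR-ing one
value from each group, e.g. to list the granted locks that block this lock
at its own mode:

	int query = DLM_QUERY_LOCKS_BLOCKING | DLM_QUERY_QUEUE_GRANTED |
		    DLM_LOCK_THIS;
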
14864+/* Structures passed into and out of the query */
14865+
14866+struct dlm_lockinfo {
14867+ int lki_lkid; /* Lock ID on originating node */
14868+ int lki_mstlkid; /* Lock ID on master node */
14869+ int lki_parent;
14870+ int lki_node; /* Originating node (not master) */
b7b72b66 14871+ int lki_ownpid; /* Owner pid on originating node */
c1c6733f
AM
14872+ uint8_t lki_state; /* Queue the lock is on */
14873+ uint8_t lki_grmode; /* Granted mode */
14874+ uint8_t lki_rqmode; /* Requested mode */
14875+ struct dlm_range lki_grrange; /* Granted range, if applicable */
14876+ struct dlm_range lki_rqrange; /* Requested range, if applicable */
14877+};
14878+
14879+struct dlm_resinfo {
14880+ int rsi_length;
14881+ int rsi_grantcount; /* No. of nodes on grant queue */
14882+ int rsi_convcount; /* No. of nodes on convert queue */
14883+ int rsi_waitcount; /* No. of nodes on wait queue */
14884+ int rsi_masternode; /* Master for this resource */
14885+ char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
14886+ char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
14888+};
14889+
14890+struct dlm_queryinfo {
14891+ struct dlm_resinfo *gqi_resinfo;
14892+ struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
14893+ * of structs */
14894+ int gqi_locksize; /* input */
14895+ int gqi_lockcount; /* output */
14896+};
14897+
14898+#ifdef __KERNEL__
14899+/*
14900+ * dlm_init
14901+ *
14902+ * Starts and initializes DLM threads and structures. Creation of the first
14903+ * lockspace will call this if it has not been called already.
14904+ *
14905+ * Returns: 0 if successful, -EXXX on error
14906+ */
14907+
14908+int dlm_init(void);
14909+
14910+/*
14911+ * dlm_release
14912+ *
14913+ * Stops DLM threads.
14914+ *
14915+ * Returns: 0 if successful, -EXXX on error
14916+ */
14917+
14918+int dlm_release(void);
14919+
14920+/*
14921+ * dlm_new_lockspace
14922+ *
14923+ * Starts a lockspace with the given name. If the named lockspace exists in
14924+ * the cluster, the calling node joins it.
14925+ */
14926+
14927+int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
14928+ int flags);
14929+
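
Illustration (not part of the patch): a hypothetical kernel caller joining
or creating a lockspace named "example":

	dlm_lockspace_t *ls;
	int error = dlm_new_lockspace("example", 7, &ls, 0);
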
14930+/*
14931+ * dlm_release_lockspace
14932+ *
14933+ * Stop a lockspace.
14934+ */
14935+
14936+int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14937+
14938+/*
14939+ * dlm_lock
14940+ *
14941+ * Make an asynchronous request to acquire or convert a lock on a named
14942+ * resource.
14943+ *
14944+ * lockspace: context for the request
14945+ * mode: the requested mode of the lock (DLM_LOCK_)
14946+ * lksb: lock status block for input and async return values
14947+ * flags: input flags (DLM_LKF_)
14948+ * name: name of the resource to lock, can be binary
14949+ * namelen: the length in bytes of the resource name (up to DLM_RESNAME_MAXLEN)
14950+ * parent: the lock ID of a parent lock or 0 if none
14951+ * lockast: function DLM executes when it completes processing the request
14952+ * astarg: argument passed to lockast and bast functions
14953+ * bast: function DLM executes when this lock later blocks another request
14954+ *
14955+ * Returns:
14956+ * 0 if request is successfully queued for processing
14957+ * -EINVAL if any input parameters are invalid
14958+ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14959+ * -ENOMEM if there is no memory to process request
14960+ * -ENOTCONN if there is a communication error
14961+ *
14962+ * If the call to dlm_lock returns an error then the operation has failed and
14963+ * the AST routine will not be called. If dlm_lock returns 0 it is still
14964+ * possible that the lock operation will fail. The AST routine will be called
14965+ * when the locking is complete and the status is returned in the lksb.
14966+ *
14967+ * If AST routines or the astarg parameter are passed to a conversion
14968+ * operation, they overwrite the values passed to the previous dlm_lock
14969+ * call.
14970+ *
14971+ * AST routines should not block (at least not for long), but may make
14972+ * any locking calls they please.
14973+ */
14974+
14975+int dlm_lock(dlm_lockspace_t *lockspace,
14976+ uint32_t mode,
14977+ struct dlm_lksb *lksb,
14978+ uint32_t flags,
14979+ void *name,
14980+ unsigned int namelen,
14981+ uint32_t parent,
14982+ void (*lockast) (void *astarg),
14983+ void *astarg,
14984+ void (*bast) (void *astarg, int mode),
14985+ struct dlm_range *range);
14986+
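
Illustration (not part of the patch): an asynchronous EX request using the
prototype above. The my_* names are hypothetical, and a real caller would
normally wait on a completion signalled from the AST.

static struct dlm_lksb my_lksb;	/* receives sb_lkid and sb_status */

static void my_ast(void *astarg)
{
	/* request finished; the result is in my_lksb.sb_status */
}

static int example_lock(dlm_lockspace_t *ls)
{
	return dlm_lock(ls, DLM_LOCK_EX, &my_lksb, DLM_LKF_NOQUEUE,
			"example", 7, 0, my_ast, &my_lksb, NULL, NULL);
}
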
14987+/*
14988+ * dlm_unlock
14989+ *
14990+ * Asynchronously release a lock on a resource. The AST routine is called
14991+ * when the resource is successfully unlocked.
14992+ *
14993+ * lockspace: context for the request
14994+ * lkid: the lock ID as returned in the lksb
14995+ * flags: input flags (DLM_LKF_)
14996+ * lksb: if NULL the lksb parameter passed to last lock request is used
c783755a 14997+ * astarg: the arg used with the completion ast for the unlock
c1c6733f
AM
14998+ *
14999+ * Returns:
15000+ * 0 if request is successfully queued for processing
15001+ * -EINVAL if any input parameters are invalid
15002+ * -ENOTEMPTY if the lock still has sublocks
15003+ * -EBUSY if the lock is waiting for a remote lock operation
15004+ * -ENOTCONN if there is a communication error
15005+ */
15006+
15007+extern int dlm_unlock(dlm_lockspace_t *lockspace,
15008+ uint32_t lkid,
15009+ uint32_t flags,
15010+ struct dlm_lksb *lksb,
15011+ void *astarg);
15012+
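
Illustration (not part of the patch), continuing the dlm_lock example above;
on success the AST runs with sb_status set to -DLM_EUNLOCK:

	int error = dlm_unlock(ls, my_lksb.sb_lkid, 0, &my_lksb, &my_lksb);
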
15013+/* Query interface
15014+ *
15015+ * Query the other holders of a resource, given a known lock ID
15016+ *
15017+ * lockspace: context for the request
15018+ * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
15019+ * on the resource. sb_status will contain the status
15020+ * of the request on completion.
15021+ * query: query bitmap see DLM_QUERY_* above
15022+ * qinfo: pointer to dlm_queryinfo structure
15023+ * ast_routine: AST routine to call on completion
15024+ * astarg: argument to AST routine. It is "traditional"
15025+ * to put the qinfo pointer into lksb->sb_lvbptr
15026+ * and pass the lksb in here.
15027+ */
15028+extern int dlm_query(dlm_lockspace_t *lockspace,
15029+ struct dlm_lksb *lksb,
15030+ int query,
15031+ struct dlm_queryinfo *qinfo,
15032+ void (ast_routine(void *)),
15033+ void *astarg);
15034+
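
Illustration (not part of the patch): the "traditional" calling convention
described above, with a hypothetical query_ast() completion routine and
arbitrary array sizes.

static struct dlm_resinfo resinfo;
static struct dlm_lockinfo lockinfo[16];
static struct dlm_queryinfo qinfo = {
	.gqi_resinfo = &resinfo,
	.gqi_lockinfo = lockinfo,
	.gqi_locksize = 16,
};

static int example_query(dlm_lockspace_t *ls, struct dlm_lksb *lksb)
{
	lksb->sb_lvbptr = (char *) &qinfo;	/* stash qinfo for the AST */
	return dlm_query(ls, lksb, DLM_QUERY_LOCKS_ALL | DLM_QUERY_QUEUE_ALL,
			 &qinfo, query_ast, lksb);
}
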
b7b72b66
AM
15035+
15036+void dlm_debug_dump(void);
15037+void dlm_locks_dump(void);
15038+
c1c6733f
AM
15039+#endif /* __KERNEL__ */
15040+
15041+#endif /* __DLM_DOT_H__ */
15042diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
15043--- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
bb1d8b11 15044+++ linux-patched/include/cluster/dlm_device.h 2004-11-03 11:31:56.000000000 +0800
b7b72b66 15045@@ -0,0 +1,64 @@
c1c6733f
AM
15046+/******************************************************************************
15047+*******************************************************************************
15048+**
15049+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
15050+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
15051+**
15052+** This copyrighted material is made available to anyone wishing to use,
15053+** modify, copy, or redistribute it subject to the terms and conditions
15054+** of the GNU General Public License v.2.
15055+**
15056+*******************************************************************************
15057+******************************************************************************/
15058+
15059+/* This is the device interface for dlm; most users will use a library
15060+ * interface instead.
15061+ */
15062+
15063+/* Version of the device interface */
15064+#define DLM_DEVICE_VERSION_MAJOR 2
15065+#define DLM_DEVICE_VERSION_MINOR 0
15066+#define DLM_DEVICE_VERSION_PATCH 0
15067+
15068+/* struct passed to the lock write */
15069+struct dlm_lock_params {
15070+ uint32_t version[3];
15071+ uint8_t cmd;
15072+ uint8_t mode;
15073+ uint16_t flags;
15074+ uint32_t lkid;
15075+ uint32_t parent;
15076+ struct dlm_range range;
15077+ uint8_t namelen;
b7b72b66
AM
15078+ void *castparam;
15079+ void *castaddr;
15080+ void *bastparam;
c1c6733f
AM
15081+ void *bastaddr;
15082+ struct dlm_lksb *lksb;
15083+ char name[1];
15084+};
15085+
15086+
15087+/* struct read from the "device" fd,
15088+ consists mainly of userspace pointers for the library to use */
15089+struct dlm_lock_result {
15090+ uint8_t cmd;
15091+ void *astparam;
15092+ void (*astaddr)(void *astparam);
15093+ struct dlm_lksb *user_lksb;
15094+ struct dlm_lksb lksb; /* But this has real data in it */
15095+ uint8_t bast_mode; /* Not yet used */
15096+};
15097+
15098+/* commands passed to the device */
15099+#define DLM_USER_LOCK 1
15100+#define DLM_USER_UNLOCK 2
15101+#define DLM_USER_QUERY 3
15102+
15103+/* Arbitrary length restriction */
15104+#define MAX_LS_NAME_LEN 64
15105+
15106+/* ioctls on the device */
15107+#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
15108+#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
15109+#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
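
Illustration (not part of the patch): the userspace side of the ioctl
interface above. The control device path is an assumption; this excerpt
does not show where the character device is registered.

#include <fcntl.h>
#include <sys/ioctl.h>

int example_create_lockspace(const char *name)
{
	int fd = open("/dev/dlm-control", O_RDWR);	/* assumed node name */

	if (fd < 0)
		return -1;
	return ioctl(fd, DLM_CREATE_LOCKSPACE, name);
}
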