]> git.pld-linux.org Git - packages/kernel.git/blame - linux-cluster-cman.patch
- ported from linux-2.4.25-atmdd.patch
[packages/kernel.git] / linux-cluster-cman.patch
CommitLineData
bb1d8b11
AM
1diff -urN linux-orig/arch/alpha/Kconfig linux-orig2/arch/alpha/Kconfig
2--- linux-orig/arch/alpha/Kconfig 2004-10-18 16:55:37.000000000 -0500
3+++ linux-orig2/arch/alpha/Kconfig 2004-10-22 11:29:33.507218717 -0500
4@@ -600,3 +600,4 @@
5
6 source "lib/Kconfig"
7
8+source "cluster/Kconfig"
9diff -urN linux-orig/arch/arm/Kconfig linux-orig2/arch/arm/Kconfig
10--- linux-orig/arch/arm/Kconfig 2004-10-18 16:54:31.000000000 -0500
11+++ linux-orig2/arch/arm/Kconfig 2004-10-22 11:30:56.358918506 -0500
12@@ -690,3 +690,5 @@
13 source "crypto/Kconfig"
14
15 source "lib/Kconfig"
c1c6733f 16+
bb1d8b11
AM
17+source "cluster/Kconfig"
18diff -urN linux-orig/arch/arm26/Kconfig linux-orig2/arch/arm26/Kconfig
19--- linux-orig/arch/arm26/Kconfig 2004-10-18 16:54:32.000000000 -0500
20+++ linux-orig2/arch/arm26/Kconfig 2004-10-22 11:29:33.531218341 -0500
21@@ -222,3 +222,4 @@
22
23 source "lib/Kconfig"
24
25+source "cluster/Kconfig"
26diff -urN linux-orig/arch/cris/Kconfig linux-orig2/arch/cris/Kconfig
27--- linux-orig/arch/cris/Kconfig 2004-10-18 16:55:07.000000000 -0500
28+++ linux-orig2/arch/cris/Kconfig 2004-10-22 11:31:11.965673644 -0500
29@@ -174,3 +174,5 @@
30 source "crypto/Kconfig"
31
32 source "lib/Kconfig"
c1c6733f 33+
bb1d8b11
AM
34+source "cluster/Kconfig"
35diff -urN linux-orig/arch/i386/Kconfig linux-orig2/arch/i386/Kconfig
36--- linux-orig/arch/i386/Kconfig 2004-10-18 16:53:22.000000000 -0500
37+++ linux-orig2/arch/i386/Kconfig 2004-10-22 11:29:33.533218309 -0500
38@@ -1194,6 +1194,8 @@
39
40 source "lib/Kconfig"
41
42+source "cluster/Kconfig"
c1c6733f 43+
bb1d8b11
AM
44 config X86_SMP
45 bool
46 depends on SMP && !X86_VOYAGER
47diff -urN linux-orig/arch/ia64/Kconfig linux-orig2/arch/ia64/Kconfig
48--- linux-orig/arch/ia64/Kconfig 2004-10-18 16:55:27.000000000 -0500
49+++ linux-orig2/arch/ia64/Kconfig 2004-10-22 11:29:33.534218294 -0500
50@@ -390,3 +390,5 @@
51 source "security/Kconfig"
52
53 source "crypto/Kconfig"
c1c6733f 54+
bb1d8b11
AM
55+source "cluster/Kconfig"
56diff -urN linux-orig/arch/m68k/Kconfig linux-orig2/arch/m68k/Kconfig
57--- linux-orig/arch/m68k/Kconfig 2004-10-18 16:54:32.000000000 -0500
58+++ linux-orig2/arch/m68k/Kconfig 2004-10-22 11:31:38.187262279 -0500
59@@ -655,3 +655,5 @@
60 source "crypto/Kconfig"
61
62 source "lib/Kconfig"
c1c6733f 63+
bb1d8b11
AM
64+source "cluster/Kconfig"
65diff -urN linux-orig/arch/mips/Kconfig linux-orig2/arch/mips/Kconfig
66--- linux-orig/arch/mips/Kconfig 2004-10-18 16:54:08.000000000 -0500
67+++ linux-orig2/arch/mips/Kconfig 2004-10-22 11:29:33.541218184 -0500
d3b4771f 68@@ -1587,6 +1587,8 @@
bb1d8b11
AM
69
70 source "lib/Kconfig"
d3b4771f 71
bb1d8b11 72+source "cluster/Kconfig"
d3b4771f
AM
73+
74 #
75 # Use the generic interrupt handling code in kernel/irq/:
76 #
bb1d8b11
AM
77diff -urN linux-orig/arch/parisc/Kconfig linux-orig2/arch/parisc/Kconfig
78--- linux-orig/arch/parisc/Kconfig 2004-10-18 16:54:37.000000000 -0500
79+++ linux-orig2/arch/parisc/Kconfig 2004-10-22 11:31:57.146964867 -0500
80@@ -195,3 +195,5 @@
81 source "crypto/Kconfig"
82
83 source "lib/Kconfig"
c1c6733f 84+
bb1d8b11
AM
85+source "cluster/Kconfig"
86diff -urN linux-orig/arch/ppc/Kconfig linux-orig2/arch/ppc/Kconfig
87--- linux-orig/arch/ppc/Kconfig 2004-10-18 16:55:29.000000000 -0500
88+++ linux-orig2/arch/ppc/Kconfig 2004-10-22 11:29:33.550218043 -0500
89@@ -1231,3 +1231,5 @@
90 source "security/Kconfig"
91
92 source "crypto/Kconfig"
c1c6733f 93+
bb1d8b11
AM
94+source "cluster/Kconfig"
95diff -urN linux-orig/arch/ppc64/Kconfig linux-orig2/arch/ppc64/Kconfig
96--- linux-orig/arch/ppc64/Kconfig 2004-10-18 16:54:31.000000000 -0500
97+++ linux-orig2/arch/ppc64/Kconfig 2004-10-22 11:32:11.150745212 -0500
98@@ -352,3 +352,5 @@
99 source "crypto/Kconfig"
100
101 source "lib/Kconfig"
c1c6733f 102+
bb1d8b11
AM
103+source "cluster/Kconfig"
104diff -urN linux-orig/arch/s390/Kconfig linux-orig2/arch/s390/Kconfig
105--- linux-orig/arch/s390/Kconfig 2004-10-18 16:53:51.000000000 -0500
106+++ linux-orig2/arch/s390/Kconfig 2004-10-22 11:32:31.175431141 -0500
107@@ -466,3 +466,5 @@
108 source "crypto/Kconfig"
109
110 source "lib/Kconfig"
c1c6733f 111+
bb1d8b11
AM
112+source "cluster/Kconfig"
113diff -urN linux-orig/arch/sh/Kconfig linux-orig2/arch/sh/Kconfig
114--- linux-orig/arch/sh/Kconfig 2004-10-18 16:55:29.000000000 -0500
115+++ linux-orig2/arch/sh/Kconfig 2004-10-22 11:32:47.169180310 -0500
116@@ -748,3 +748,5 @@
117 source "crypto/Kconfig"
118
119 source "lib/Kconfig"
c1c6733f 120+
bb1d8b11
AM
121+source "cluster/Kconfig"
122diff -urN linux-orig/arch/sparc/Kconfig linux-orig2/arch/sparc/Kconfig
123--- linux-orig/arch/sparc/Kconfig 2004-10-18 16:53:05.000000000 -0500
124+++ linux-orig2/arch/sparc/Kconfig 2004-10-22 11:33:06.891871022 -0500
125@@ -386,3 +386,5 @@
126 source "crypto/Kconfig"
127
128 source "lib/Kconfig"
c1c6733f 129+
bb1d8b11
AM
130+source "cluster/Kconfig"
131diff -urN linux-orig/arch/sparc64/Kconfig linux-orig2/arch/sparc64/Kconfig
132--- linux-orig/arch/sparc64/Kconfig 2004-10-18 16:55:06.000000000 -0500
133+++ linux-orig2/arch/sparc64/Kconfig 2004-10-22 11:33:19.290676599 -0500
134@@ -613,3 +613,5 @@
135 source "crypto/Kconfig"
136
137 source "lib/Kconfig"
c1c6733f 138+
bb1d8b11
AM
139+source "cluster/Kconfig"
140diff -urN linux-orig/arch/um/Kconfig linux-orig2/arch/um/Kconfig
141--- linux-orig/arch/um/Kconfig 2004-10-18 16:54:08.000000000 -0500
142+++ linux-orig2/arch/um/Kconfig 2004-10-22 11:29:33.564217823 -0500
143@@ -225,6 +225,8 @@
144
145 source "lib/Kconfig"
146
147+source "cluster/Kconfig"
c1c6733f 148+
bb1d8b11
AM
149 menu "SCSI support"
150 depends on BROKEN
151
152diff -urN linux-orig/arch/x86_64/Kconfig linux-orig2/arch/x86_64/Kconfig
153--- linux-orig/arch/x86_64/Kconfig 2004-10-18 16:54:55.000000000 -0500
154+++ linux-orig2/arch/x86_64/Kconfig 2004-10-22 11:33:37.130396876 -0500
155@@ -424,3 +424,5 @@
156 source "crypto/Kconfig"
157
158 source "lib/Kconfig"
c1c6733f 159+
bb1d8b11
AM
160+source "cluster/Kconfig"
161diff -urN linux-orig/cluster/cman/Makefile linux-orig2/cluster/cman/Makefile
162--- linux-orig/cluster/cman/Makefile 1969-12-31 18:00:00.000000000 -0600
163+++ linux-orig2/cluster/cman/Makefile 2004-10-22 11:29:33.566217791 -0500
164@@ -0,0 +1,6 @@
165+cman-objs := cnxman.o config.o membership.o proc.o\
166+ sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
167+ sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
168+ sm_user.o
c1c6733f 169+
bb1d8b11
AM
170+obj-$(CONFIG_CLUSTER) := cman.o
171diff -urN linux-orig/cluster/Kconfig linux-orig2/cluster/Kconfig
172--- linux-orig/cluster/Kconfig 1969-12-31 18:00:00.000000000 -0600
173+++ linux-orig2/cluster/Kconfig 2004-10-22 11:29:33.565217807 -0500
174@@ -0,0 +1,13 @@
175+menu "Cluster Support"
c1c6733f 176+
bb1d8b11
AM
177+config CLUSTER
178+ tristate "Cluster support"
179+ ---help---
180+ Enable clustering support. This is not the high-performance clustering
181+ made famous by beowulf. It is a high-availability cluster often using
182+ shared storage.
183+ The cluster manager is the heart(beat) of the cluster system. It is
184+ needed by all the other components. It provides membership services
185+ for those other subsystems.
c1c6733f 186+
bb1d8b11
AM
187+endmenu
188diff -urN linux-orig/cluster/Makefile linux-orig2/cluster/Makefile
189--- linux-orig/cluster/Makefile 1969-12-31 18:00:00.000000000 -0600
190+++ linux-orig2/cluster/Makefile 2004-10-22 11:29:33.566217791 -0500
191@@ -0,0 +1,3 @@
192+obj-y := nocluster.o
c1c6733f 193+
bb1d8b11
AM
194+obj-$(CONFIG_CLUSTER) += cman/
195diff -urN linux-orig/cluster/nocluster.c linux-orig2/cluster/nocluster.c
196--- linux-orig/cluster/nocluster.c 1969-12-31 18:00:00.000000000 -0600
197+++ linux-orig2/cluster/nocluster.c 2004-10-22 11:29:33.567217776 -0500
198@@ -0,0 +1,20 @@
199+/*
200+ * cluster/nocluster.c
201+ *
202+ * Copy from net/nonet.c
203+ * Dummy functions to allow us to configure cluster support entirely
204+ * out of the kernel.
205+ *
206+ * Distributed under the terms of the GNU GPL version 2.
207+ * Copyright (c) Matthew Wilcox 2003
208+ */
c1c6733f 209+
bb1d8b11
AM
210+#include <linux/module.h>
211+#include <linux/errno.h>
212+#include <linux/fs.h>
213+#include <linux/init.h>
214+#include <linux/kernel.h>
c1c6733f 215+
bb1d8b11
AM
216+void __init nocluster_init(void)
217+{
218+}
219diff -urN linux-orig/Makefile linux-orig2/Makefile
220--- linux-orig/Makefile 2004-10-18 16:54:38.000000000 -0500
221+++ linux-orig2/Makefile 2004-10-22 11:29:33.507218717 -0500
222@@ -445,7 +445,7 @@
223
224 # Objects we will link into vmlinux / subdirs we need to visit
225 init-y := init/
226-drivers-y := drivers/ sound/
227+drivers-y := drivers/ sound/ cluster/
228 net-y := net/
229 libs-y := lib/
230 core-y := usr/
231diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
232--- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
233+++ linux-patched/cluster/cman/cnxman-private.h 2004-11-03 11:37:37.000000000 +0800
234@@ -0,0 +1,432 @@
235+/******************************************************************************
236+*******************************************************************************
237+**
238+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
239+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
240+**
241+** This copyrighted material is made available to anyone wishing to use,
242+** modify, copy, or redistribute it subject to the terms and conditions
243+** of the GNU General Public License v.2.
244+**
245+*******************************************************************************
246+******************************************************************************/
c1c6733f 247+
bb1d8b11
AM
248+#ifndef __CNXMAN_PRIVATE_H
249+#define __CNXMAN_PRIVATE_H
c1c6733f 250+
bb1d8b11
AM
251+/* Version triplet */
252+#define CNXMAN_MAJOR_VERSION 3
253+#define CNXMAN_MINOR_VERSION 0
254+#define CNXMAN_PATCH_VERSION 1
c1c6733f 255+
bb1d8b11
AM
256+#define MAX_RETRIES 3 /* Maximum number of send retries */
257+#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
258+ * cluster */
259+#ifdef __KERNEL__
c1c6733f 260+
bb1d8b11
AM
261+/* How we announce ourself in console events */
262+#define CMAN_NAME "CMAN"
c1c6733f 263+
bb1d8b11
AM
264+/* One of these per AF_CLUSTER socket */
265+struct cluster_sock {
266+ /* WARNING: sk has to be the first member */
267+ struct sock sk;
c1c6733f 268+
bb1d8b11
AM
269+ unsigned char port; /* Bound port or zero */
270+ int (*kernel_callback) (char *, int, char *, int, unsigned int);
271+ void *service_data;
272+};
c1c6733f 273+
bb1d8b11 274+#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
c1c6733f 275+
bb1d8b11
AM
276+/* We have one of these for each socket we use for communications */
277+struct cl_comms_socket {
278+ struct socket *sock;
279+ int broadcast; /* This is a broadcast socket */
280+ int recv_only; /* This is the unicast receive end of a
281+ * multicast socket */
282+ struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
283+ * the remote end(s) */
284+ int addr_len; /* Length of above */
285+ int number; /* Internal socket number, used to cycle around
286+ * sockets in case of network errors */
287+ struct file *file; /* file pointer for user-passed in sockets */
c1c6733f 288+
bb1d8b11 289+ wait_queue_t wait;
c1c6733f 290+
bb1d8b11
AM
291+ /* The socket list */
292+ struct list_head list;
c783755a 293+
bb1d8b11
AM
294+ /* On here when it has something to say */
295+ struct list_head active_list;
296+ unsigned long active;
297+};
c783755a 298+
bb1d8b11
AM
299+/* A client socket. We keep a list of these so we can notify clients of cluster
300+ * events */
301+struct cl_client_socket {
302+ struct socket *sock;
303+ struct list_head list;
304+};
c1c6733f 305+
bb1d8b11
AM
306+/* This structure is tacked onto the start of a cluster message packet for our
307+ * own nefarious purposes. */
308+struct cl_protheader {
309+ unsigned char tgtport; /* Target port number */
310+ unsigned char srcport; /* Source (originationg) port number */
311+ unsigned short seq; /* Packet sequence number, little-endian */
312+ unsigned short ack; /* Inline ACK */
313+ unsigned short cluster; /* Our cluster number, little-endian */
314+ unsigned int flags;
315+ int srcid; /* Node ID of the sender */
316+ int tgtid; /* Node ID of the target or 0 for multicast
317+ * messages */
318+};
c1c6733f 319+
bb1d8b11
AM
320+/* A cluster internal protocol message - port number 0 */
321+struct cl_protmsg {
322+ struct cl_protheader header;
323+ unsigned char cmd;
324+};
c1c6733f 325+
bb1d8b11
AM
326+/* A Cluster ACK message */
327+struct cl_ackmsg {
328+ struct cl_protheader header;
329+ unsigned char cmd; /* Always CLUSTER_CMD_ACK */
330+ unsigned char remport; /* Remote port number the original message was
331+ * for */
332+ unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
333+ unsigned char pad;
334+};
c1c6733f 335+
bb1d8b11
AM
336+/* A Cluster LISTENREQ/LISTENRESP message */
337+struct cl_listenmsg {
338+ unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
339+ unsigned char target_port; /* Port to probe */
340+ unsigned char listening; /* Always 0 for LISTENREQ */
341+ unsigned char pad;
342+ unsigned short tag; /* PID of remote waiting process */
343+};
c1c6733f 344+
bb1d8b11
AM
345+/* A Cluster PORTCLOSED message */
346+struct cl_closemsg {
347+ unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
348+ unsigned char port;
349+};
c1c6733f 350+
bb1d8b11
AM
351+/* Structure of a newly dead node, passed from cnxman to kmembershipd */
352+struct cl_new_dead_node {
353+ struct list_head list;
c1c6733f 354+ struct cluster_node *node;
bb1d8b11 355+};
c1c6733f 356+
bb1d8b11
AM
357+/* Subcommands for BARRIER message */
358+#define BARRIER_REGISTER 1
359+#define BARRIER_CHANGE 2
360+#define BARRIER_WAIT 4
361+#define BARRIER_COMPLETE 5
c1c6733f 362+
bb1d8b11
AM
363+/* A Cluster BARRIER message */
364+struct cl_barriermsg {
365+ unsigned char cmd; /* CLUSTER_CMD_BARRIER */
366+ unsigned char subcmd; /* BARRIER sub command */
367+ unsigned short pad;
368+ unsigned int flags;
369+ unsigned int nodes;
370+ char name[MAX_BARRIER_NAME_LEN];
371+};
c1c6733f 372+
bb1d8b11
AM
373+/* Membership services messages, the cl_protheader is added transparently */
374+struct cl_mem_hello_msg {
375+ unsigned char cmd;
376+ unsigned char flags;
377+ unsigned short members; /* Number of nodes in the cluster,
378+ * little-endian */
379+ unsigned int generation; /* Current cluster generation number */
380+};
c1c6733f 381+
bb1d8b11
AM
382+struct cl_mem_endtrans_msg {
383+ unsigned char cmd;
384+ unsigned char pad1;
385+ unsigned short pad2;
386+ unsigned int quorum;
387+ unsigned int total_votes;
388+ unsigned int generation; /* Current cluster generation number */
389+ unsigned int new_node_id; /* If reason is a new node joining */
390+};
c1c6733f 391+
bb1d8b11
AM
392+/* ACK types for JOINACK message */
393+#define JOINACK_TYPE_OK 1 /* You can join */
394+#define JOINACK_TYPE_NAK 2 /* You can NOT join */
395+#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
396+ * already */
c783755a 397+
bb1d8b11
AM
398+struct cl_mem_joinack_msg {
399+ unsigned char cmd;
400+ unsigned char acktype;
401+};
c1c6733f 402+
bb1d8b11
AM
403+/* This is used by JOINREQ message */
404+struct cl_mem_join_msg {
405+ unsigned char cmd;
406+ unsigned char votes;
407+ unsigned short num_addr; /* Number of addresses for this node */
408+ unsigned int expected_votes;
409+ unsigned int nodeid; /* node ID we want */
410+ unsigned int major_version; /* Not backwards compatible */
411+ unsigned int minor_version; /* Backwards compatible */
412+ unsigned int patch_version; /* Backwards/forwards compatible */
413+ unsigned int config_version;
414+ unsigned int addr_len; /* length of node addresses */
415+ char clustername[16];
416+ /* Followed by <num_addr> addresses of `address_length` bytes and a
417+ * NUL-terminated node name */
418+};
c1c6733f 419+
bb1d8b11
AM
420+/* State transition start reasons: */
421+#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
422+#define TRANS_REMNODE 2 /* a node has left the cluster */
423+#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
424+ * transition */
425+#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
426+ * master */
427+#define TRANS_CHECK 5 /* A consistency check was called for */
428+#define TRANS_RESTART 6 /* Transition restarted because of a previous
429+ * timeout */
430+#define TRANS_DEADMASTER 7 /* The master died during transition and I have
431+ * taken over */
c1c6733f 432+
bb1d8b11
AM
433+/* This is used to start a state transition */
434+struct cl_mem_starttrans_msg {
435+ unsigned char cmd;
436+ unsigned char reason; /* Why a start transition was started - see
437+ * above */
438+ unsigned char flags;
439+ unsigned char votes;
440+ unsigned int expected_votes;
441+ unsigned int generation; /* Incremented for each STARTTRANS sent
442+ */
443+ int nodeid; /* Node to be removed */
444+ unsigned short num_addrs;
445+ /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
446+ * `address_length` bytes and a NUL-terminated node name */
447+};
c1c6733f 448+
bb1d8b11
AM
449+struct cl_mem_startack_msg {
450+ unsigned char cmd;
451+ unsigned char reason;
452+ unsigned short pad;
453+ unsigned int generation;
454+ unsigned int node_id; /* node_id we think new node should have */
455+ unsigned int highest_node_id; /* highest node_id on this system */
456+};
c1c6733f 457+
bb1d8b11
AM
458+/* Reconfigure a cluster parameter */
459+struct cl_mem_reconfig_msg {
460+ unsigned char cmd;
461+ unsigned char param;
462+ unsigned short pad;
463+ unsigned int value;
464+};
c1c6733f 465+
bb1d8b11
AM
466+/* Structure containing information about an outstanding listen request */
467+struct cl_waiting_listen_request {
468+ wait_queue_head_t waitq;
469+ int result;
470+ int waiting;
471+ unsigned short tag;
472+ int nodeid;
473+ struct list_head list;
474+};
c1c6733f 475+
bb1d8b11
AM
476+/* Messages from membership services */
477+#define CLUSTER_MEM_JOINCONF 1
478+#define CLUSTER_MEM_JOINREQ 2
479+#define CLUSTER_MEM_LEAVE 3
480+#define CLUSTER_MEM_HELLO 4
481+#define CLUSTER_MEM_KILL 5
482+#define CLUSTER_MEM_JOINACK 6
483+#define CLUSTER_MEM_ENDTRANS 7
484+#define CLUSTER_MEM_RECONFIG 8
485+#define CLUSTER_MEM_MASTERVIEW 9
486+#define CLUSTER_MEM_STARTTRANS 10
487+#define CLUSTER_MEM_JOINREJ 11
488+#define CLUSTER_MEM_VIEWACK 12
489+#define CLUSTER_MEM_STARTACK 13
490+#define CLUSTER_MEM_TRANSITION 14
491+#define CLUSTER_MEM_NEWCLUSTER 15
492+#define CLUSTER_MEM_CONFACK 16
493+#define CLUSTER_MEM_NOMINATE 17
c1c6733f 494+
bb1d8b11
AM
495+/* Flags in the HELLO message */
496+#define HELLO_FLAG_MASTER 1
497+#define HELLO_FLAG_QUORATE 2
c1c6733f 498+
bb1d8b11
AM
499+/* Parameters for RECONFIG command */
500+#define RECONFIG_PARAM_EXPECTED_VOTES 1
501+#define RECONFIG_PARAM_NODE_VOTES 2
502+#define RECONFIG_PARAM_CONFIG_VERSION 3
c1c6733f 503+
bb1d8b11
AM
504+/* Data associated with an outgoing socket */
505+struct cl_socket {
506+ struct file *file; /* The real file */
507+ struct socket *socket; /* The real sock */
508+ int num_nodes; /* On this link */
509+ int retransmit_count;
510+};
b7b72b66 511+
bb1d8b11
AM
512+/* There's one of these for each node in the cluster */
513+struct cluster_node {
514+ struct list_head list;
515+ char *name; /* Node/host name of node */
516+ struct list_head addr_list;
517+ int us; /* This node is us */
518+ unsigned int node_id; /* Unique node ID */
519+ nodestate_t state;
520+ unsigned short last_seq_recv;
521+ unsigned short last_seq_acked;
522+ unsigned short last_seq_sent;
523+ unsigned int votes;
524+ unsigned int expected_votes;
525+ unsigned int leave_reason;
526+ unsigned int incarnation; /* Incremented each time a node joins
527+ * the cluster */
528+ unsigned long last_hello; /* Jiffies */
529+ struct timeval join_time;
530+};
b7b72b66 531+
bb1d8b11
AM
532+/* This is how we keep a list of user processes that are listening for cluster
533+ * membership events */
534+struct notify_struct {
535+ struct list_head list;
536+ pid_t pid;
537+ int signal;
538+};
b7b72b66 539+
bb1d8b11
AM
540+/* This is how we keep a list of kernel callbacks that are registered for
541+ * cluster membership events */
542+struct kernel_notify_struct {
543+ struct list_head list;
544+ void (*callback) (kcl_callback_reason, long arg);
545+};
c1c6733f 546+
bb1d8b11
AM
547+/* A message waiting to be sent */
548+struct queued_message {
549+ struct list_head list;
c1c6733f 550+
bb1d8b11
AM
551+ struct socket *socket;
552+ struct sockaddr_cl addr;
553+ int addr_len;
554+ int msg_len;
555+ unsigned char port;
556+ unsigned int flags;
557+ char msg_buffer[MAX_CLUSTER_MESSAGE];
558+};
c1c6733f 559+
bb1d8b11
AM
560+/* A barrier */
561+struct cl_barrier {
562+ struct list_head list;
c1c6733f 563+
bb1d8b11
AM
564+ char name[MAX_BARRIER_NAME_LEN];
565+ unsigned int flags;
566+ enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
567+ BARRIER_STATE_COMPLETE } state;
568+ unsigned int expected_nodes;
569+ unsigned int registered_nodes;
570+ atomic_t got_nodes;
571+ atomic_t completed_nodes;
572+ unsigned int inuse;
573+ unsigned int waitsent;
574+ unsigned int phase; /* Completion phase */
575+ unsigned int endreason; /* Reason we were woken, usually 0 */
576+ unsigned long timeout; /* In seconds */
c1c6733f 577+
bb1d8b11
AM
578+ void (*callback) (char *name, int status);
579+ wait_queue_head_t waitq;
580+ struct semaphore lock; /* To synch with cnxman messages */
581+ spinlock_t phase2_spinlock; /* Need to synchronise with timer
582+ * interrupts */
583+ struct timer_list timer;
584+};
c1c6733f 585+
bb1d8b11
AM
586+/* Cluster protocol commands sent to port 0 */
587+#define CLUSTER_CMD_ACK 1
588+#define CLUSTER_CMD_LISTENREQ 2
589+#define CLUSTER_CMD_LISTENRESP 3
590+#define CLUSTER_CMD_PORTCLOSED 4
591+#define CLUSTER_CMD_BARRIER 5
c1c6733f 592+
bb1d8b11
AM
593+extern struct cluster_node *find_node_by_addr(unsigned char *addr,
594+ int addr_len);
595+extern struct cluster_node *find_node_by_nodeid(unsigned int id);
596+extern struct cluster_node *find_node_by_name(char *name);
597+extern void set_quorate(int);
598+extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
599+extern void notify_listeners(void);
600+extern void free_nodeid_array(void);
601+extern int send_reconfigure(int param, unsigned int value);
602+extern int calculate_quorum(int, int, int *);
603+extern void recalculate_quorum(int);
604+extern int send_leave(unsigned char);
605+extern int get_quorum(void);
606+extern void set_votes(int, int);
607+extern void kcl_wait_for_all_acks(void);
608+extern char *membership_state(char *, int);
609+extern char *leave_string(int reason);
610+extern void a_node_just_died(struct cluster_node *node);
611+extern void check_barrier_returns(void);
612+extern int in_transition(void);
613+extern void get_local_addresses(struct cluster_node *node);
614+extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
615+extern void create_proc_entries(void);
616+extern void cleanup_proc_entries(void);
617+extern unsigned int get_highest_nodeid(void);
618+extern int allocate_nodeid_array(void);
619+extern void queue_oob_skb(struct socket *sock, int cmd);
620+extern int new_temp_nodeid(char *addr, int addrlen);
621+extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
622+extern void purge_temp_nodeids(void);
623+extern inline char *print_addr(unsigned char *addr, int len, char *buf)
624+{
625+ int i;
626+ int ptr = 0;
c1c6733f 627+
bb1d8b11
AM
628+ for (i = 0; i < len; i++)
629+ ptr += sprintf(buf + ptr, "%02x ", addr[i]);
c1c6733f 630+
bb1d8b11 631+ return buf;
c1c6733f
AM
632+}
633+
bb1d8b11 634+#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
b7b72b66 635+
bb1d8b11
AM
636+/* Debug enabling macros. Sorry about the C++ comments but they're easier to
637+ * get rid of than C ones... */
b7b72b66 638+
bb1d8b11
AM
639+// #define DEBUG_MEMB
640+// #define DEBUG_COMMS
641+// #define DEBUG_BARRIER
b7b72b66 642+
bb1d8b11
AM
643+/* Debug macros */
644+#ifdef DEBUG_COMMS
645+#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
646+#else
647+#define P_COMMS(fmt, args...)
648+#endif
b7b72b66 649+
bb1d8b11
AM
650+#ifdef DEBUG_BARRIER
651+#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
652+#else
653+#define P_BARRIER(fmt, args...)
654+#endif
c783755a 655+
bb1d8b11
AM
656+#ifdef DEBUG_MEMB
657+#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
658+#define C_MEMB(fmt, args...) printk(fmt, ## args)
659+#else
660+#define P_MEMB(fmt, args...)
661+#define C_MEMB(fmt, args...)
662+#endif
c783755a 663+
bb1d8b11 664+#endif /* __KERNEL */
c783755a 665+
bb1d8b11
AM
666+#endif
667diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
668--- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
669+++ linux-patched/cluster/cman/cnxman.c 2004-11-03 11:37:37.000000000 +0800
670@@ -0,0 +1,4214 @@
671+/******************************************************************************
672+*******************************************************************************
673+**
674+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
675+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
676+**
677+** This copyrighted material is made available to anyone wishing to use,
678+** modify, copy, or redistribute it subject to the terms and conditions
679+** of the GNU General Public License v.2.
680+**
681+*******************************************************************************
682+******************************************************************************/
c783755a 683+
bb1d8b11
AM
684+#define EXPORT_SYMTAB
685+#include <linux/init.h>
686+#include <linux/socket.h>
687+#include <linux/kernel.h>
688+#include <linux/sched.h>
689+#include <linux/file.h>
690+#include <linux/utsname.h>
691+#include <net/sock.h>
692+#include <linux/proc_fs.h>
693+#include <linux/poll.h>
694+#include <linux/module.h>
695+#include <linux/list.h>
696+#include <linux/uio.h>
697+#include <cluster/cnxman.h>
698+#include <cluster/service.h>
c783755a 699+
bb1d8b11
AM
700+#include "cnxman-private.h"
701+#include "sm_control.h"
702+#include "sm_user.h"
703+#include "config.h"
c783755a 704+
bb1d8b11 705+#define CMAN_RELEASE_NAME "<CVS>"
c783755a 706+
bb1d8b11
AM
707+static void process_incoming_packet(struct cl_comms_socket *csock,
708+ struct msghdr *msg, struct kvec *vec, int veclen, int len);
709+static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
710+ int addr_len, char *addr, unsigned char remport,
711+ unsigned char flag);
712+static void send_listen_request(int nodeid, unsigned char port);
713+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
714+ unsigned char port, unsigned short tag);
715+static void resend_last_message(void);
716+static void start_ack_timer(void);
717+static int send_queued_message(struct queued_message *qmsg);
718+static void send_port_close_oob(unsigned char port);
719+static void post_close_oob(unsigned char port, int nodeid);
720+static void process_barrier_msg(struct cl_barriermsg *msg,
721+ struct cluster_node *node);
722+static struct cl_barrier *find_barrier(char *name);
723+static void node_shutdown(void);
724+static void node_cleanup(void);
725+static int send_or_queue_message(struct socket *sock, void *buf, int len, struct sockaddr_cl *caddr,
726+ unsigned int flags);
727+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
728+static void check_for_unacked_nodes(void);
729+static void free_cluster_sockets(void);
730+static uint16_t generate_cluster_id(char *name);
731+static int is_valid_temp_nodeid(int nodeid);
c783755a 732+
bb1d8b11
AM
733+extern int start_membership_services(pid_t);
734+extern int kcl_leave_cluster(int remove);
735+extern int send_kill(int nodeid);
c783755a 736+
bb1d8b11
AM
737+static struct proto_ops cl_proto_ops;
738+static struct sock *master_sock;
739+static kmem_cache_t *cluster_sk_cachep;
c783755a 740+
bb1d8b11
AM
741+/* Pointer to the pseudo node that maintains quorum in a 2node system */
742+struct cluster_node *quorum_device = NULL;
c783755a 743+
bb1d8b11
AM
744+/* Array of "ports" allocated. This is just a list of pointers to the sock that
745+ * has this port bound. Speed is a major issue here so 1-2K of allocated
746+ * storage is worth sacrificing. Port 0 is reserved for protocol messages */
747+static struct sock *port_array[256];
748+static struct semaphore port_array_lock;
c783755a 749+
bb1d8b11
AM
750+/* Our cluster name & number */
751+uint16_t cluster_id;
752+char cluster_name[MAX_CLUSTER_NAME_LEN+1];
c783755a 753+
bb1d8b11
AM
754+/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
755+ * No more than two nodes are permitted to join the cluster. */
756+unsigned short two_node;
c783755a 757+
bb1d8b11
AM
758+/* Cluster configuration version that must be the same among members. */
759+unsigned int config_version;
c783755a 760+
bb1d8b11
AM
761+/* Reference counting for cluster applications */
762+atomic_t use_count;
c783755a 763+
bb1d8b11
AM
764+/* Length of sockaddr address for our comms protocol */
765+unsigned int address_length;
c1c6733f 766+
bb1d8b11
AM
767+/* Message sending */
768+static unsigned short cur_seq; /* Last message sent */
769+static unsigned int ack_count; /* Number of acks received for message
770+ * 'cur_seq' */
771+static unsigned int acks_expected; /* Number of acks we expect to receive */
772+static struct semaphore send_lock;
773+static struct timer_list ack_timer;
c1c6733f 774+
bb1d8b11
AM
775+/* Saved packet information in case we need to resend it */
776+static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
777+static int saved_msg_len;
778+static int retry_count;
c1c6733f 779+
bb1d8b11
AM
780+/* Task variables */
781+static pid_t kcluster_pid;
782+static pid_t membership_pid;
783+extern struct task_struct *membership_task;
784+extern int quit_threads;
c1c6733f 785+
bb1d8b11 786+wait_queue_head_t cnxman_waitq;
c1c6733f 787+
bb1d8b11
AM
788+/* Variables owned by membership services */
789+extern int cluster_members;
790+extern struct list_head cluster_members_list;
791+extern struct semaphore cluster_members_lock;
792+extern int we_are_a_cluster_member;
793+extern int cluster_is_quorate;
794+extern struct cluster_node *us;
795+extern struct list_head new_dead_node_list;
796+extern struct semaphore new_dead_node_lock;
797+extern char nodename[];
798+extern int wanted_nodeid;
c1c6733f 799+
bb1d8b11
AM
800+/* A list of processes listening for membership events */
801+static struct list_head event_listener_list;
802+static struct semaphore event_listener_lock;
c1c6733f 803+
bb1d8b11
AM
804+/* A list of kernel callbacks listening for membership events */
805+static struct list_head kernel_listener_list;
806+static struct semaphore kernel_listener_lock;
c1c6733f 807+
bb1d8b11
AM
808+/* A list of sockets we are listening on (and can transmit on...later) */
809+static struct list_head socket_list;
c1c6733f 810+
bb1d8b11
AM
811+/* A list of all open cluster client sockets */
812+static struct list_head client_socket_list;
813+static struct semaphore client_socket_lock;
c1c6733f 814+
bb1d8b11
AM
815+/* A list of all current barriers */
816+static struct list_head barrier_list;
817+static struct semaphore barrier_list_lock;
c1c6733f 818+
bb1d8b11
AM
819+/* When a socket is read for reading it goes on this queue */
820+static spinlock_t active_socket_lock;
821+static struct list_head active_socket_list;
c1c6733f 822+
bb1d8b11
AM
823+/* If the cnxman process is running and available for work */
824+atomic_t cnxman_running;
c1c6733f 825+
bb1d8b11
AM
826+/* Fkags set by timers etc for the mainloop to detect and act upon */
827+static unsigned long mainloop_flags;
b7b72b66 828+
bb1d8b11
AM
829+#define ACK_TIMEOUT 1
830+#define RESEND_NEEDED 2
c1c6733f 831+
bb1d8b11
AM
832+/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
833+ * process context then the messages get put in here */
834+static struct list_head messages_list;
835+static struct semaphore messages_list_lock;
b7b72b66 836+
bb1d8b11 837+static struct semaphore start_thread_sem;
c1c6733f 838+
bb1d8b11
AM
839+/* List of outstanding ISLISTENING requests */
840+static struct list_head listenreq_list;
841+static struct semaphore listenreq_lock;
c1c6733f 842+
bb1d8b11
AM
843+/* Any sending requests wait on this queue if necessary (eg inquorate, waiting
844+ * ACK) */
845+static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
c1c6733f 846+
bb1d8b11
AM
847+/* Wait for thread to exit properly */
848+struct completion cluster_thread_comp;
849+struct completion member_thread_comp;
c1c6733f 850+
bb1d8b11
AM
851+/* The resend delay to use, We increase this geometrically(word?) each time a
852+ * send is delayed. in deci-seconds */
853+static int resend_delay = 1;
c1c6733f 854+
bb1d8b11
AM
855+/* Highest numbered interface and the current default */
856+static int num_interfaces;
857+static struct cl_comms_socket *current_interface = NULL;
c1c6733f 858+
bb1d8b11
AM
859+struct temp_node
860+{
861+ int nodeid;
862+ char addr[sizeof(struct sockaddr_in6)];
863+ int addrlen;
864+ struct list_head list;
865+};
866+static struct list_head tempnode_list;
867+static struct semaphore tempnode_lock;
868+
869+
870+/* This is what's squirrelled away in skb->cb */
871+struct cb_info
872+{
873+ int orig_nodeid;
874+ char orig_port;
875+ char oob;
876+};
c1c6733f 877+
c1c6733f 878+
bb1d8b11
AM
879+/* Wake up any processes that are waiting to send. This is usually called when
880+ * all the ACKs have been gathered up or when a node has left the cluster
881+ * unexpectedly and we reckon there are no more acks to collect */
882+static void unjam(void)
883+{
884+ wake_up_interruptible(&socket_waitq);
885+ wake_up_interruptible(&cnxman_waitq);
c1c6733f
AM
886+}
887+
bb1d8b11
AM
888+/* Used by the data_ready routine to locate a connection given the socket */
889+static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
c1c6733f 890+{
bb1d8b11 891+ struct list_head *conlist;
c1c6733f 892+
bb1d8b11
AM
893+ list_for_each(conlist, &socket_list) {
894+ struct cl_comms_socket *clsock =
895+ list_entry(conlist, struct cl_comms_socket, list);
896+ if (clsock->sock->sk == sk) {
897+ return clsock;
898+ }
c1c6733f 899+ }
c1c6733f
AM
900+ return NULL;
901+}
902+
bb1d8b11
AM
903+/* Data available on socket */
904+static void cnxman_data_ready(struct sock *sk, int count_unused)
c1c6733f 905+{
bb1d8b11 906+ struct cl_comms_socket *clsock = find_comms_by_sock(sk);
c1c6733f 907+
bb1d8b11
AM
908+ if (clsock == NULL) /* ASSERT ?? */
909+ return;
c1c6733f 910+
bb1d8b11
AM
911+ /* If we're already on the list then don't do it again */
912+ if (test_and_set_bit(1, &clsock->active))
913+ return;
c1c6733f 914+
bb1d8b11
AM
915+ spin_lock_irq(&active_socket_lock);
916+ list_add(&clsock->active_list, &active_socket_list);
917+ spin_unlock_irq(&active_socket_lock);
c1c6733f 918+
bb1d8b11
AM
919+ wake_up_interruptible(&cnxman_waitq);
920+}
c1c6733f 921+
bb1d8b11
AM
922+static int receive_message(struct cl_comms_socket *csock, char *iobuf)
923+{
924+ struct msghdr msg;
925+ struct kvec vec;
926+ struct sockaddr_in6 sin;
927+ int len;
c1c6733f 928+
bb1d8b11 929+ memset(&sin, 0, sizeof (sin));
c1c6733f 930+
bb1d8b11
AM
931+ msg.msg_control = NULL;
932+ msg.msg_controllen = 0;
933+ msg.msg_name = &sin;
934+ msg.msg_namelen = sizeof (sin);
935+ msg.msg_flags = 0;
c1c6733f 936+
bb1d8b11
AM
937+ vec.iov_len = MAX_CLUSTER_MESSAGE;
938+ vec.iov_base = iobuf;
c1c6733f 939+
bb1d8b11
AM
940+ len = kernel_recvmsg(csock->sock, &msg,
941+ &vec, 1, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
942+
943+ vec.iov_base = iobuf;
944+
945+ if (len > 0) {
946+ if (len > MAX_CLUSTER_MESSAGE) {
947+ printk(KERN_CRIT CMAN_NAME
948+ ": %d byte message far too big\n", len);
949+ return 0;
c1c6733f 950+ }
bb1d8b11 951+ process_incoming_packet(csock, &msg, &vec, 1, len);
c1c6733f 952+ }
bb1d8b11
AM
953+ else {
954+ if (len != -EAGAIN)
955+ printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
956+ len);
957+ }
958+ return len;
c1c6733f
AM
959+}
960+
bb1d8b11 961+static int cluster_kthread(void *unused)
c1c6733f 962+{
bb1d8b11
AM
963+ int len;
964+ char *iobuf;
965+ struct list_head *socklist;
966+ struct cl_comms_socket *csock;
967+ wait_queue_t cnxman_waitq_head;
968+ sigset_t tmpsig;
c1c6733f 969+
bb1d8b11 970+ daemonize("cman_comms");
c1c6733f 971+
bb1d8b11
AM
972+ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
973+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
974+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
c1c6733f 975+
bb1d8b11
AM
976+ /* This is the waitq we can wake the process up with */
977+ init_waitqueue_head(&cnxman_waitq);
978+ init_waitqueue_entry(&cnxman_waitq_head, current);
979+ add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
c1c6733f 980+
bb1d8b11 981+ set_user_nice(current, -6);
c1c6733f 982+
bb1d8b11
AM
983+ /* Allow the sockets to start receiving */
984+ list_for_each(socklist, &socket_list) {
985+ csock = list_entry(socklist, struct cl_comms_socket, list);
c1c6733f 986+
bb1d8b11 987+ clear_bit(1, &csock->active);
c1c6733f
AM
988+ }
989+
bb1d8b11
AM
990+ iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
991+ if (!iobuf) {
992+ printk(KERN_CRIT CMAN_NAME
993+ ": Cannot allocate receive buffer for cluster comms\n");
994+ return -1;
995+ }
c1c6733f 996+
bb1d8b11 997+ complete(&cluster_thread_comp);
c1c6733f 998+
bb1d8b11
AM
999+ for (;;) {
1000+ struct list_head *temp;
c1c6733f 1001+
bb1d8b11
AM
1002+ /* Wait for activity on any of the sockets */
1003+ set_task_state(current, TASK_INTERRUPTIBLE);
c1c6733f 1004+
bb1d8b11
AM
1005+ if (list_empty(&active_socket_list))
1006+ schedule();
1007+ set_task_state(current, TASK_RUNNING);
c1c6733f 1008+
bb1d8b11
AM
1009+ if (quit_threads)
1010+ break;
c1c6733f 1011+
bb1d8b11
AM
1012+ if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
1013+ check_for_unacked_nodes();
1014+ }
c1c6733f 1015+
bb1d8b11
AM
1016+ /* Now receive any messages waiting for us */
1017+ spin_lock_irq(&active_socket_lock);
1018+ list_for_each_safe(socklist, temp, &active_socket_list) {
1019+ csock =
1020+ list_entry(socklist, struct cl_comms_socket,
1021+ active_list);
c1c6733f 1022+
bb1d8b11
AM
1023+ list_del(&csock->active_list);
1024+ clear_bit(1, &csock->active);
c1c6733f 1025+
bb1d8b11 1026+ spin_unlock_irq(&active_socket_lock);
c1c6733f 1027+
bb1d8b11
AM
1028+ do {
1029+ len = receive_message(csock, iobuf);
1030+ }
1031+ while (len > 0);
c1c6733f 1032+
bb1d8b11 1033+ spin_lock_irq(&active_socket_lock);
c1c6733f 1034+
bb1d8b11
AM
1035+ if (len == 0)
1036+ break; /* EOF on socket */
1037+ }
1038+ spin_unlock_irq(&active_socket_lock);
c1c6733f 1039+
bb1d8b11
AM
1040+ /* Resend any unacked messages */
1041+ if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
1042+ && acks_expected) {
1043+ resend_last_message();
1044+ }
c1c6733f 1045+
bb1d8b11
AM
1046+ /* Send any queued messages */
1047+ if (acks_expected == 0) {
1048+ struct list_head *temp;
1049+ struct list_head *msglist;
1050+
1051+ down(&messages_list_lock);
1052+ list_for_each_safe(msglist, temp, &messages_list) {
1053+ struct queued_message *qmsg =
1054+ list_entry(msglist, struct queued_message,
1055+ list);
1056+ int status = send_queued_message(qmsg);
1057+
1058+ if (status >= 0) {
1059+ /* Suceeded, remove it from the queue */
1060+ list_del(&qmsg->list);
1061+ kfree(qmsg);
1062+ }
1063+ /* Did it fail horribly ?? */
1064+ if (status < 0 && status != -EAGAIN) {
1065+ printk(KERN_INFO CMAN_NAME
1066+ ": send_queued_message failed, error %d\n",
1067+ status);
1068+ list_del(&qmsg->list);
1069+ kfree(qmsg);
1070+ }
1071+ break; /* Only send one message at a time */
c1c6733f 1072+ }
bb1d8b11 1073+ up(&messages_list_lock);
c1c6733f
AM
1074+ }
1075+
bb1d8b11
AM
1076+ if (signal_pending(current))
1077+ break;
1078+ }
1079+ P_COMMS("closing down\n");
c1c6733f 1080+
bb1d8b11 1081+ quit_threads = 1; /* force other thread to die too */
c1c6733f 1082+
bb1d8b11
AM
1083+ /* Wait for membership thread to finish, that way any
1084+ LEAVE message will get sent. */
1085+ wake_up_process(membership_task);
1086+ wait_for_completion(&member_thread_comp);
c1c6733f 1087+
bb1d8b11 1088+ node_shutdown();
c1c6733f 1089+
bb1d8b11
AM
1090+ if (timer_pending(&ack_timer))
1091+ del_timer(&ack_timer);
c1c6733f 1092+
bb1d8b11
AM
1093+ node_cleanup();
1094+ kfree(iobuf);
c1c6733f 1095+
bb1d8b11 1096+ complete(&cluster_thread_comp);
c1c6733f
AM
1097+ return 0;
1098+}
1099+
bb1d8b11 1100+void notify_kernel_listeners(kcl_callback_reason reason, long arg)
c1c6733f 1101+{
bb1d8b11
AM
1102+ struct kernel_notify_struct *knotify;
1103+ struct list_head *proclist;
1104+
1105+ down(&kernel_listener_lock);
1106+ list_for_each(proclist, &kernel_listener_list) {
1107+ knotify =
1108+ list_entry(proclist, struct kernel_notify_struct, list);
1109+ knotify->callback(reason, arg);
1110+ }
1111+ up(&kernel_listener_lock);
c1c6733f
AM
1112+}
1113+
bb1d8b11 1114+static void check_for_unacked_nodes()
c1c6733f 1115+{
bb1d8b11
AM
1116+ struct list_head *nodelist;
1117+ struct list_head *temp;
1118+ struct cluster_node *node;
1119+
1120+ clear_bit(RESEND_NEEDED, &mainloop_flags);
1121+ retry_count = 0;
1122+
1123+ P_COMMS("Retry count exceeded -- looking for dead node\n");
1124+
1125+ /* Node did not ACK a message after <n> tries, remove it from the
1126+ * cluster */
1127+ down(&cluster_members_lock);
1128+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
1129+ node = list_entry(nodelist, struct cluster_node, list);
1130+
1131+ P_COMMS("checking node %s: last_acked = %d, last_seq_sent = %d\n",
1132+ node->name, node->last_seq_acked, node->last_seq_sent);
1133+ if (node->state != NODESTATE_DEAD &&
1134+ node->last_seq_acked != node->last_seq_sent && !node->us) {
1135+ printk(KERN_WARNING CMAN_NAME
1136+ ": node %s is not responding - removing from the cluster\n",
1137+ node->name);
1138+
1139+ /* Drop this lock or we can deadlock with membership */
1140+ up(&cluster_members_lock);
1141+
1142+ /* Start a state transition */
1143+ a_node_just_died(node);
1144+ down(&cluster_members_lock);
1145+ }
1146+ }
1147+ up(&cluster_members_lock);
1148+ acks_expected = ack_count = 0;
1149+ unjam();
1150+ return;
c1c6733f
AM
1151+}
1152+
bb1d8b11 1153+static void ack_timer_fn(unsigned long arg)
b7b72b66 1154+{
bb1d8b11 1155+ P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
b7b72b66 1156+
bb1d8b11
AM
1157+ /* Too many retries ? */
1158+ if (++retry_count > MAX_RETRIES) {
1159+ set_bit(ACK_TIMEOUT, &mainloop_flags);
1160+ wake_up_interruptible(&cnxman_waitq);
b7b72b66
AM
1161+ }
1162+ else {
bb1d8b11
AM
1163+ /* Resend last message */
1164+ set_bit(RESEND_NEEDED, &mainloop_flags);
1165+ wake_up_interruptible(&cnxman_waitq);
b7b72b66 1166+ }
bb1d8b11 1167+}
b7b72b66 1168+
bb1d8b11
AM
1169+/* Called to resend a packet if sock_sendmsg was busy */
1170+static void short_timer_fn(unsigned long arg)
1171+{
1172+ P_COMMS("short_timer fired\n");
b7b72b66 1173+
bb1d8b11
AM
1174+ /* Resend last message */
1175+ resend_delay <<= 1;
1176+ set_bit(RESEND_NEEDED, &mainloop_flags);
1177+ wake_up_interruptible(&cnxman_waitq);
1178+}
b7b72b66 1179+
bb1d8b11
AM
1180+static void start_ack_timer()
1181+{
1182+ ack_timer.function = ack_timer_fn;
1183+ ack_timer.data = 0L;
1184+ mod_timer(&ack_timer, jiffies + HZ);
1185+}
b7b72b66 1186+
bb1d8b11
AM
1187+static void start_short_timer(void)
1188+{
1189+ ack_timer.function = short_timer_fn;
1190+ ack_timer.data = 0L;
1191+ mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
1192+}
b7b72b66 1193+
b7b72b66 1194+
bb1d8b11
AM
1195+static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
1196+{
1197+ struct list_head *llist;
1198+ struct cl_waiting_listen_request *listener;
b7b72b66 1199+
bb1d8b11
AM
1200+ list_for_each(llist, &listenreq_list) {
1201+ listener = list_entry(llist, struct cl_waiting_listen_request,
1202+ list);
1203+ if (listener->tag == tag) {
1204+ return listener;
1205+ }
1206+ }
1207+ return NULL;
b7b72b66
AM
1208+}
1209+
bb1d8b11 1210+static void process_ack(struct cluster_node *rem_node, unsigned short seq)
c1c6733f 1211+{
bb1d8b11
AM
1212+ if (rem_node && rem_node->state != NODESTATE_DEAD) {
1213+ /* This copes with duplicate acks from a multipathed
1214+ * host */
1215+ if (rem_node->last_seq_acked !=
1216+ le16_to_cpu(seq)) {
1217+ rem_node->last_seq_acked =
1218+ le16_to_cpu(seq);
c1c6733f 1219+
bb1d8b11
AM
1220+ /* Got em all */
1221+ if (++ack_count >= acks_expected) {
c1c6733f 1222+
bb1d8b11
AM
1223+ /* Cancel the timer */
1224+ del_timer(&ack_timer);
1225+ acks_expected = 0;
1226+ unjam();
1227+ }
1228+ }
1229+ }
1230+}
c1c6733f 1231+
bb1d8b11
AM
1232+static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
1233+ int len, char *addr, int addrlen,
1234+ struct cluster_node *rem_node)
1235+{
1236+ struct cl_protmsg *msg = (struct cl_protmsg *) data;
1237+ struct cl_protheader *header = (struct cl_protheader *) data;
1238+ struct cl_ackmsg *ackmsg;
1239+ struct cl_listenmsg *listenmsg;
1240+ struct cl_closemsg *closemsg;
1241+ struct cl_barriermsg *barriermsg;
1242+ struct cl_waiting_listen_request *listen_request;
c1c6733f 1243+
bb1d8b11
AM
1244+ P_COMMS("Message on port 0 is %d\n", msg->cmd);
1245+ switch (msg->cmd) {
1246+ case CLUSTER_CMD_ACK:
1247+ ackmsg = (struct cl_ackmsg *) data;
c1c6733f 1248+
bb1d8b11
AM
1249+ if (rem_node && (ackmsg->aflags & 1)) {
1250+ if (net_ratelimit())
1251+ printk(KERN_INFO CMAN_NAME
1252+ ": WARNING no listener for port %d on node %s\n",
1253+ ackmsg->remport, rem_node->name);
1254+ }
1255+ P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
1256+ rem_node ? rem_node->name : "Unknown",
1257+ le16_to_cpu(ackmsg->header.ack), cur_seq);
c1c6733f 1258+
bb1d8b11
AM
1259+ /* ACK processing has already happened */
1260+ break;
c1c6733f 1261+
bb1d8b11
AM
1262+ /* Return 1 if we have a listener on this port, 0 if not */
1263+ case CLUSTER_CMD_LISTENREQ:
1264+ listenmsg =
1265+ (struct cl_listenmsg *) (data +
1266+ sizeof (struct cl_protheader));
1267+ cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1268+ send_listen_response(csock, le32_to_cpu(header->srcid),
1269+ listenmsg->target_port, listenmsg->tag);
1270+ break;
c1c6733f 1271+
bb1d8b11
AM
1272+ case CLUSTER_CMD_LISTENRESP:
1273+ /* Wake up process waiting for listen response */
1274+ listenmsg =
1275+ (struct cl_listenmsg *) (data +
1276+ sizeof (struct cl_protheader));
1277+ cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1278+ down(&listenreq_lock);
1279+ listen_request = find_listen_request(listenmsg->tag);
1280+ if (listen_request) {
1281+ listen_request->result = listenmsg->listening;
1282+ listen_request->waiting = 0;
1283+ wake_up_interruptible(&listen_request->waitq);
1284+ }
1285+ up(&listenreq_lock);
1286+ break;
c1c6733f 1287+
bb1d8b11
AM
1288+ case CLUSTER_CMD_PORTCLOSED:
1289+ closemsg =
1290+ (struct cl_closemsg *) (data +
1291+ sizeof (struct cl_protheader));
1292+ cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1293+ post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
1294+ break;
c1c6733f 1295+
bb1d8b11
AM
1296+ case CLUSTER_CMD_BARRIER:
1297+ barriermsg =
1298+ (struct cl_barriermsg *) (data +
1299+ sizeof (struct cl_protheader));
1300+ cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1301+ if (rem_node)
1302+ process_barrier_msg(barriermsg, rem_node);
1303+ break;
c1c6733f 1304+
bb1d8b11
AM
1305+ default:
1306+ printk(KERN_ERR CMAN_NAME
1307+ ": Unknown protocol message %d received\n", msg->cmd);
1308+ break;
c1c6733f 1309+
c1c6733f 1310+ }
bb1d8b11 1311+ return;
c1c6733f
AM
1312+}
1313+
bb1d8b11 1314+static int valid_addr_for_node(struct cluster_node *node, char *addr)
c1c6733f 1315+{
bb1d8b11
AM
1316+ struct list_head *addrlist;
1317+ struct cluster_node_addr *nodeaddr;
c1c6733f 1318+
bb1d8b11
AM
1319+ /* We don't compare the first two bytes of the address because it's
1320+ * the Address Family and always in native byte order...so it will
1321+ * not match if we have mixed big & little-endian machines in the cluster
1322+ */
c1c6733f 1323+
bb1d8b11
AM
1324+ list_for_each(addrlist, &node->addr_list) {
1325+ nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
c1c6733f 1326+
bb1d8b11
AM
1327+ if (memcmp(nodeaddr->addr+2, addr+2, address_length-2) == 0)
1328+ return 1; /* TRUE */
c1c6733f 1329+ }
bb1d8b11 1330+ return 0; /* FALSE */
c1c6733f
AM
1331+}
1332+
bb1d8b11 1333+static void memcpy_fromkvec(void *data, struct kvec *vec, int len)
c1c6733f 1334+{
bb1d8b11
AM
1335+ while (len > 0) {
1336+ if (vec->iov_len) {
1337+ int copy = min_t(unsigned int, len, vec->iov_len);
1338+ memcpy(data, vec->iov_base, copy);
1339+ len -= copy;
1340+ data += copy;
1341+ vec->iov_base += copy;
1342+ vec->iov_len -= copy;
1343+ }
1344+ vec++;
1345+ }
1346+}
c1c6733f 1347+
bb1d8b11
AM
1348+static int send_to_user_port(struct cl_comms_socket *csock,
1349+ struct cl_protheader *header,
1350+ struct msghdr *msg,
1351+ struct kvec *iov, int veclen,
1352+ int len)
1353+{
1354+ struct sk_buff *skb;
1355+ struct cb_info *cbinfo;
1356+ int err;
c1c6733f 1357+
bb1d8b11
AM
1358+ /* Get the port number and look for a listener */
1359+ down(&port_array_lock);
1360+ if (port_array[header->tgtport]) {
1361+ struct cluster_sock *c = cluster_sk(port_array[header->tgtport]);
c1c6733f 1362+
bb1d8b11
AM
1363+ /* ACK it */
1364+ if (!(header->flags & MSG_NOACK) &&
1365+ !(header->flags & MSG_REPLYEXP)) {
c1c6733f 1366+
bb1d8b11
AM
1367+ cl_sendack(csock, header->seq, msg->msg_namelen,
1368+ msg->msg_name, header->tgtport, 0);
1369+ }
c1c6733f 1370+
bb1d8b11
AM
1371+ /* Call a callback if there is one */
1372+ if (c->kernel_callback) {
1373+ up(&port_array_lock);
1374+ if (veclen == 1) {
1375+ c->kernel_callback(iov->iov_base,
1376+ iov->iov_len,
1377+ msg->msg_name, msg->msg_namelen,
1378+ le32_to_cpu(header->srcid));
c1c6733f 1379+
bb1d8b11
AM
1380+ }
1381+ else { /* Unroll iov, this Hardly ever Happens */
1382+ char *data;
1383+ data = kmalloc(len, GFP_KERNEL);
1384+ if (!data)
1385+ return -ENOMEM;
c1c6733f 1386+
bb1d8b11
AM
1387+ memcpy_fromkvec(data, iov, len);
1388+ c->kernel_callback(data, len,
1389+ msg->msg_name, msg->msg_namelen,
1390+ le32_to_cpu(header->srcid));
1391+ kfree(data);
1392+ }
1393+ return len;
1394+ }
c1c6733f 1395+
bb1d8b11
AM
1396+ /* Otherwise put it into an SKB and pass it onto the recvmsg
1397+ * mechanism */
1398+ skb = alloc_skb(len, GFP_KERNEL);
1399+ if (!skb) {
1400+ up(&port_array_lock);
1401+ printk(KERN_INFO CMAN_NAME
1402+ ": Failed to allocate skb\n");
1403+ return -ENOMEM;
1404+ }
c1c6733f 1405+
bb1d8b11
AM
1406+ skb_put(skb, len);
1407+ memcpy_fromkvec(skb->data, iov, len);
c1c6733f 1408+
bb1d8b11
AM
1409+ /* Put metadata into cb[] */
1410+ cbinfo = (struct cb_info *)skb->cb;
1411+ cbinfo->orig_nodeid = le32_to_cpu(header->srcid);
1412+ cbinfo->orig_port = header->srcport;
1413+ cbinfo->oob = 0;
c1c6733f 1414+
bb1d8b11
AM
1415+ if ((err =
1416+ sock_queue_rcv_skb(port_array[header->tgtport], skb)) < 0) {
c1c6733f 1417+
bb1d8b11
AM
1418+ printk(KERN_INFO CMAN_NAME
1419+ ": Error queueing request to port %d: %d\n",
1420+ header->tgtport, err);
1421+ kfree_skb(skb);
c1c6733f 1422+
bb1d8b11
AM
1423+ /* If the port was MEMBERSHIP then we have to die */
1424+ if (header->tgtport == CLUSTER_PORT_MEMBERSHIP) {
1425+ up(&port_array_lock);
1426+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
1427+ panic("membership stopped responding");
1428+ }
1429+ }
1430+ up(&port_array_lock);
c1c6733f 1431+
bb1d8b11
AM
1432+ }
1433+ else {
1434+ /* ACK it, but set the flag bit so remote end knows no-one
1435+ * caught it */
1436+ if (!(header->flags & MSG_NOACK))
1437+ cl_sendack(csock, header->seq,
1438+ msg->msg_namelen, msg->msg_name,
1439+ header->tgtport, 1);
c1c6733f 1440+
bb1d8b11
AM
1441+ /* Nobody listening, drop it */
1442+ up(&port_array_lock);
1443+ }
1444+ return len;
c1c6733f
AM
1445+}
1446+
bb1d8b11
AM
1447+/* NOTE: This routine knows (assumes!) that there is only one
1448+ iov element passed into it. */
1449+static void process_incoming_packet(struct cl_comms_socket *csock,
1450+ struct msghdr *msg,
1451+ struct kvec *vec, int veclen, int len)
c1c6733f 1452+{
bb1d8b11
AM
1453+ char *data = vec->iov_base;
1454+ char *addr = msg->msg_name;
1455+ int addrlen = msg->msg_namelen;
1456+ struct cl_protheader *header = (struct cl_protheader *) data;
1457+ struct cluster_node *rem_node =
1458+ find_node_by_nodeid(le32_to_cpu(header->srcid));
c1c6733f 1459+
bb1d8b11
AM
1460+ P_COMMS("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
1461+ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1462+ le16_to_cpu(header->seq), rem_node,
1463+ rem_node ? rem_node->state : -1);
c1c6733f 1464+
bb1d8b11
AM
1465+ /* If the remote end is being coy about its node ID then look it up by
1466+ * address */
1467+ if (!rem_node && header->srcid == 0) {
1468+ rem_node = find_node_by_addr(addr, addrlen);
1469+ }
c1c6733f 1470+
bb1d8b11
AM
1471+ /* If this node is an ex-member then treat it as unknown */
1472+ if (rem_node && rem_node->state != NODESTATE_MEMBER
1473+ && rem_node->state != NODESTATE_JOINING)
1474+ rem_node = NULL;
c1c6733f 1475+
bb1d8b11
AM
1476+ /* Ignore messages not for our cluster */
1477+ if (le16_to_cpu(header->cluster) != cluster_id) {
1478+ P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
1479+ cluster_id, header->cluster);
1480+ goto incoming_finish;
1481+ }
c1c6733f 1482+
bb1d8b11
AM
1483+ /* If the message is from us then just dump it */
1484+ if (rem_node && rem_node->us)
1485+ goto incoming_finish;
c1c6733f 1486+
bb1d8b11
AM
1487+ /* If we can't find the nodeid then check for our own messages the hard
1488+ * way - this only happens during joining */
1489+ if (!rem_node) {
1490+ struct list_head *socklist;
1491+ struct cl_comms_socket *clsock;
c1c6733f 1492+
bb1d8b11
AM
1493+ list_for_each(socklist, &socket_list) {
1494+ clsock =
1495+ list_entry(socklist, struct cl_comms_socket, list);
c1c6733f 1496+
bb1d8b11 1497+ if (clsock->recv_only) {
c1c6733f 1498+
bb1d8b11
AM
1499+ if (memcmp(addr, &clsock->saddr, address_length) == 0) {
1500+ goto incoming_finish;
1501+ }
1502+ }
1503+ }
c1c6733f 1504+
bb1d8b11 1505+ }
c1c6733f 1506+
bb1d8b11
AM
1507+ /* Ignore messages not for us */
1508+ if (le32_to_cpu(header->tgtid) > 0 && us
1509+ && le32_to_cpu(header->tgtid) != us->node_id) {
1510+ goto incoming_finish;
1511+ }
c1c6733f 1512+
bb1d8b11
AM
1513+ P_COMMS("got message, from %d for %d, sequence num = %d\n",
1514+ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1515+ le16_to_cpu(header->seq));
1516+
1517+ if (header->ack && rem_node) {
1518+ process_ack(rem_node, header->ack);
c1c6733f 1519+ }
c1c6733f 1520+
bb1d8b11
AM
1521+ /* Have we received this message before ? If so just ignore it, it's a
1522+ * resend for someone else's benefit */
1523+ if (!(header->flags & MSG_NOACK) &&
1524+ rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
1525+ P_COMMS
1526+ ("Discarding message - Already seen this sequence number %d\n",
1527+ rem_node->last_seq_recv);
1528+ /* Still need to ACK it though, in case it was the ACK that got
1529+ * lost */
1530+ cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1531+ goto incoming_finish;
1532+ }
c1c6733f 1533+
bb1d8b11
AM
1534+ /* Check that the message is from the node we think it is from */
1535+ if (rem_node && !valid_addr_for_node(rem_node, addr)) {
1536+ return;
1537+ }
c1c6733f 1538+
bb1d8b11
AM
1539+ /* If it's a new node then assign it a temporary node ID */
1540+ if (!rem_node)
1541+ header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
c1c6733f 1542+
bb1d8b11
AM
1543+ P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
1544+ header->flags, header->tgtport, we_are_a_cluster_member);
c1c6733f 1545+
c1c6733f 1546+
bb1d8b11
AM
1547+ /* If we are not part of the cluster then ignore multicast messages
1548+ * that need an ACK as we will confuse the sender who is only expecting
1549+ * ACKS from bona fide members */
1550+ if ((header->flags & MSG_MULTICAST) &&
1551+ !(header->flags & MSG_NOACK) && !we_are_a_cluster_member) {
1552+ P_COMMS
1553+ ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
1554+ header->tgtport, header->flags);
1555+ goto incoming_finish;
1556+ }
c1c6733f 1557+
bb1d8b11
AM
1558+ /* Save the sequence number of this message so we can ignore duplicates
1559+ * (above) */
1560+ if (!(header->flags & MSG_NOACK) && rem_node) {
1561+ P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
1562+ rem_node->name);
1563+ rem_node->last_seq_recv = le16_to_cpu(header->seq);
1564+ }
c1c6733f 1565+
bb1d8b11
AM
1566+ /* Is it a protocol message? */
1567+ if (header->tgtport == 0) {
1568+ process_cnxman_message(csock, data, len, addr, addrlen,
1569+ rem_node);
1570+ goto incoming_finish;
1571+ }
c1c6733f 1572+
bb1d8b11
AM
1573+ /* Skip past the header to the data */
1574+ vec[0].iov_base = data + sizeof (struct cl_protheader);
1575+ vec[0].iov_len -= sizeof (struct cl_protheader);
1576+ len -= sizeof (struct cl_protheader);
c1c6733f 1577+
bb1d8b11 1578+ send_to_user_port(csock, header, msg, vec, veclen, len);
c1c6733f 1579+
bb1d8b11
AM
1580+ incoming_finish:
1581+ return;
1582+}
c1c6733f 1583+
bb1d8b11
AM
1584+static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
1585+{
1586+ struct sock *sk;
1587+ struct cluster_sock *c;
c1c6733f 1588+
bb1d8b11
AM
1589+ if ((sk =
1590+ sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
1591+ cluster_sk_cachep)) == NULL)
1592+ goto no_sock;
1593+
1594+ if (sock) {
1595+ sock->ops = &cl_proto_ops;
c1c6733f 1596+ }
bb1d8b11 1597+ sock_init_data(sock, sk);
c1c6733f 1598+
bb1d8b11
AM
1599+ sk->sk_destruct = NULL;
1600+ sk->sk_no_check = 1;
1601+ sk->sk_family = PF_CLUSTER;
1602+ sk->sk_allocation = gfp;
c1c6733f 1603+
bb1d8b11
AM
1604+ c = cluster_sk(sk);
1605+ c->port = 0;
1606+ c->service_data = NULL;
c1c6733f 1607+
bb1d8b11
AM
1608+ return sk;
1609+ no_sock:
1610+ return NULL;
1611+}
1612+
1613+static int cl_release(struct socket *sock)
1614+{
1615+ struct sock *sk = sock->sk;
1616+ struct cl_client_socket *csock;
1617+ struct list_head *socklist;
1618+ struct list_head *tmp;
1619+
1620+ down(&client_socket_lock);
1621+ if (sk) {
1622+ /* Remove port allocations if it's a bound socket */
1623+ struct cluster_sock *c = cluster_sk(sk);
1624+
1625+ down(&port_array_lock);
1626+ if (c->port) {
1627+ port_array[c->port] = NULL;
1628+ }
1629+ up(&port_array_lock);
c1c6733f 1630+
bb1d8b11
AM
1631+ /* Tell other nodes in the cluster that this listener is going
1632+ * away */
1633+ if (atomic_read(&cnxman_running) && c->port)
1634+ send_port_close_oob(c->port);
c1c6733f 1635+
bb1d8b11
AM
1636+ if (c->service_data)
1637+ sm_sock_release(sock);
c1c6733f 1638+
bb1d8b11
AM
1639+ /* Master socket released ? */
1640+ if (sk->sk_protocol == CLPROTO_MASTER) {
1641+ master_sock = NULL;
c1c6733f 1642+
bb1d8b11
AM
1643+ /* If this socket is being freed and cnxman is not
1644+ * started then free all the comms sockets as either
1645+ * the userland "join" process has crashed or the
1646+ * join failed.
1647+ */
1648+ if (!atomic_read(&cnxman_running)) {
1649+ quit_threads = 1;
1650+ free_cluster_sockets();
1651+ }
1652+ }
c1c6733f 1653+
bb1d8b11
AM
1654+ sock_orphan(sk);
1655+ sock_hold(sk);
1656+ lock_sock(sk);
1657+ release_sock(sk);
1658+ sock_put(sk);
1659+ sock_put(sk);
1660+ sock->sk = NULL;
1661+ }
c1c6733f 1662+
bb1d8b11
AM
1663+ /* Remove it from the list of clients */
1664+ list_for_each_safe(socklist, tmp, &client_socket_list) {
1665+ csock = list_entry(socklist, struct cl_client_socket, list);
1666+
1667+ if (csock->sock == sock) {
1668+ list_del(&csock->list);
1669+ kfree(csock);
1670+ break;
c1c6733f
AM
1671+ }
1672+ }
bb1d8b11 1673+ up(&client_socket_lock);
b7b72b66 1674+
bb1d8b11 1675+ return 0;
c1c6733f
AM
1676+}
1677+
bb1d8b11 1678+static int cl_create(struct socket *sock, int protocol)
c1c6733f 1679+{
bb1d8b11 1680+ struct sock *sk;
c1c6733f 1681+
bb1d8b11
AM
1682+ /* All are datagrams */
1683+ if (sock->type != SOCK_DGRAM)
1684+ return -ESOCKTNOSUPPORT;
1685+
1686+ if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
c1c6733f
AM
1687+ return -EPERM;
1688+
bb1d8b11
AM
1689+ /* Can only have one master socket */
1690+ if (master_sock && protocol == CLPROTO_MASTER)
1691+ return -EBUSY;
c1c6733f 1692+
bb1d8b11
AM
1693+ /* cnxman not running and a client was requested */
1694+ if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
1695+ return -ENETDOWN;
c1c6733f 1696+
bb1d8b11
AM
1697+ if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
1698+ return -ENOBUFS;
c1c6733f 1699+
bb1d8b11 1700+ sk->sk_protocol = protocol;
c1c6733f 1701+
bb1d8b11
AM
1702+ if (protocol == CLPROTO_MASTER)
1703+ master_sock = sk;
1704+
1705+ /* Add client sockets to the list */
1706+ if (protocol == CLPROTO_CLIENT) {
1707+ struct cl_client_socket *clsock =
1708+ kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
1709+ if (!clsock) {
1710+ cl_release(sock);
1711+ return -ENOMEM;
1712+ }
1713+ clsock->sock = sock;
1714+ down(&client_socket_lock);
1715+ list_add(&clsock->list, &client_socket_list);
1716+ up(&client_socket_lock);
1717+ }
c1c6733f
AM
1718+
1719+ return 0;
1720+}
1721+
bb1d8b11 1722+static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
b7b72b66 1723+{
bb1d8b11
AM
1724+ struct sock *sk = sock->sk;
1725+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
1726+ struct cluster_sock *c = cluster_sk(sk);
b7b72b66 1727+
bb1d8b11 1728+ if (!capable(CAP_NET_BIND_SERVICE))
b7b72b66
AM
1729+ return -EPERM;
1730+
bb1d8b11 1731+ if (sk->sk_zapped == 0)
b7b72b66
AM
1732+ return -EINVAL;
1733+
bb1d8b11
AM
1734+ if (addr_len != sizeof (struct sockaddr_cl))
1735+ return -EINVAL;
b7b72b66 1736+
bb1d8b11
AM
1737+ if (saddr->scl_family != AF_CLUSTER)
1738+ return -EINVAL;
b7b72b66 1739+
bb1d8b11
AM
1740+ if (saddr->scl_port == 0)
1741+ return -EINVAL; /* Port 0 is reserved for protocol messages */
b7b72b66 1742+
bb1d8b11
AM
1743+ down(&port_array_lock);
1744+
1745+ if (port_array[saddr->scl_port]) {
1746+ up(&port_array_lock);
1747+ return -EADDRINUSE;
b7b72b66 1748+ }
b7b72b66 1749+
bb1d8b11 1750+ port_array[saddr->scl_port] = sk;
b7b72b66 1751+
bb1d8b11 1752+ up(&port_array_lock);
b7b72b66 1753+
bb1d8b11
AM
1754+ c->port = saddr->scl_port;
1755+ sk->sk_zapped = 0;
c783755a 1756+
bb1d8b11
AM
1757+ /* If we are not a cluster member yet then make the client wait until
1758+ * we are, this allows nodes to start cluster clients at the same time
1759+ * as cluster services but they will wait until membership is achieved.
1760+ * This looks odd in bind() (open would seem more obvious) but we need
1761+ * to know which port number is being used so that things like
1762+ * membership services don't get blocked
1763+ */
1764+
1765+ if (saddr->scl_port > HIGH_PROTECTED_PORT)
1766+ while (!we_are_a_cluster_member || !cluster_is_quorate
1767+ || in_transition()) {
1768+ DECLARE_WAITQUEUE(wq, current);
1769+ struct task_struct *tsk = current;
1770+
1771+ set_task_state(tsk, TASK_INTERRUPTIBLE);
1772+ add_wait_queue(&socket_waitq, &wq);
1773+
1774+ if (!we_are_a_cluster_member || !cluster_is_quorate
1775+ || in_transition())
1776+ schedule();
1777+
1778+ set_task_state(tsk, TASK_RUNNING);
1779+ remove_wait_queue(&socket_waitq, &wq);
1780+
1781+ /* We were woken up because the cluster is going down,
1782+ * ...and we never got a chance to do any work! (sob) */
1783+ if (atomic_read(&cnxman_running) == 0 || quit_threads) {
1784+ return -ENOTCONN;
1785+ }
1786+ }
c783755a 1787+
c783755a 1788+ return 0;
b7b72b66
AM
1789+}
1790+
bb1d8b11
AM
1791+static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
1792+ int *uaddr_len, int peer)
b7b72b66 1793+{
bb1d8b11
AM
1794+ struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
1795+ struct sock *sk = sock->sk;
1796+ struct cluster_sock *c = cluster_sk(sk);
b7b72b66 1797+
bb1d8b11 1798+ *uaddr_len = sizeof (struct sockaddr_cl);
b7b72b66 1799+
bb1d8b11 1800+ lock_sock(sk);
b7b72b66 1801+
bb1d8b11
AM
1802+ sa->scl_port = c->port;
1803+ sa->scl_flags = 0;
1804+ sa->scl_family = AF_CLUSTER;
b7b72b66 1805+
bb1d8b11 1806+ release_sock(sk);
b7b72b66 1807+
bb1d8b11
AM
1808+ return 0;
1809+}
b7b72b66 1810+
bb1d8b11
AM
1811+static unsigned int cl_poll(struct file *file, struct socket *sock,
1812+ poll_table * wait)
1813+{
1814+ return datagram_poll(file, sock, wait);
1815+}
b7b72b66 1816+
bb1d8b11
AM
1817+/* Copy internal node format to userland format */
1818+void copy_to_usernode(struct cluster_node *node,
1819+ struct cl_cluster_node *unode)
1820+{
1821+ strcpy(unode->name, node->name);
1822+ unode->size = sizeof (struct cl_cluster_node);
1823+ unode->votes = node->votes;
1824+ unode->state = node->state;
1825+ unode->us = node->us;
1826+ unode->node_id = node->node_id;
1827+ unode->leave_reason = node->leave_reason;
1828+ unode->incarnation = node->incarnation;
1829+}
1830+
1831+static int add_clsock(int broadcast, int number, struct socket *sock,
1832+ struct file *file)
1833+{
1834+ struct cl_comms_socket *newsock =
1835+ kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
1836+ if (!newsock)
b7b72b66
AM
1837+ return -ENOMEM;
1838+
bb1d8b11
AM
1839+ memset(newsock, 0, sizeof (*newsock));
1840+ newsock->number = number;
1841+ newsock->sock = sock;
1842+ if (broadcast) {
1843+ newsock->broadcast = 1;
1844+ newsock->recv_only = 0;
1845+ }
1846+ else {
1847+ newsock->broadcast = 0;
1848+ newsock->recv_only = 1;
1849+ }
b7b72b66 1850+
bb1d8b11
AM
1851+ newsock->file = file;
1852+ newsock->addr_len = sizeof(struct sockaddr_in6);
b7b72b66 1853+
bb1d8b11
AM
1854+ /* Mark it active until cnxman thread is running and ready to process
1855+ * messages */
1856+ set_bit(1, &newsock->active);
b7b72b66 1857+
bb1d8b11
AM
1858+ /* Find out what it's bound to */
1859+ newsock->sock->ops->getname(newsock->sock,
1860+ (struct sockaddr *)&newsock->saddr,
1861+ &newsock->addr_len, 0);
b7b72b66 1862+
bb1d8b11
AM
1863+ num_interfaces = max(num_interfaces, newsock->number);
1864+ if (!current_interface && newsock->broadcast)
1865+ current_interface = newsock;
b7b72b66 1866+
bb1d8b11
AM
1867+ /* Hook data_ready */
1868+ newsock->sock->sk->sk_data_ready = cnxman_data_ready;
1869+
1870+ /* Make an attempt to keep them in order */
1871+ list_add_tail(&newsock->list, &socket_list);
1872+
1873+ address_length = newsock->addr_len;
b7b72b66
AM
1874+ return 0;
1875+}
1876+
bb1d8b11
AM
1877+/* ioctl processing functions */
1878+
1879+static int do_ioctl_set_version(unsigned long arg)
b7b72b66 1880+{
bb1d8b11
AM
1881+ struct cl_version version, *u_version;
1882+
b7b72b66
AM
1883+ if (!capable(CAP_CLUSTER))
1884+ return -EPERM;
bb1d8b11
AM
1885+ if (arg == 0)
1886+ return -EINVAL;
b7b72b66 1887+
bb1d8b11 1888+ u_version = (struct cl_version *) arg;
b7b72b66 1889+
bb1d8b11
AM
1890+ if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
1891+ return -EFAULT;
b7b72b66 1892+
bb1d8b11
AM
1893+ if (version.major != CNXMAN_MAJOR_VERSION ||
1894+ version.minor != CNXMAN_MINOR_VERSION ||
1895+ version.patch != CNXMAN_PATCH_VERSION)
1896+ return -EINVAL;
b7b72b66 1897+
bb1d8b11
AM
1898+ if (config_version == version.config)
1899+ return 0;
b7b72b66 1900+
bb1d8b11
AM
1901+ config_version = version.config;
1902+ send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
b7b72b66
AM
1903+ return 0;
1904+}
1905+
bb1d8b11 1906+static int do_ioctl_get_members(unsigned long arg)
c1c6733f 1907+{
bb1d8b11
AM
1908+ struct cluster_node *node;
1909+ /* Kernel copies */
1910+ struct cl_cluster_node user_format_node;
1911+ struct cl_cluster_nodelist user_format_nodelist;
1912+ /* User space array ptr */
1913+ struct cl_cluster_node *user_node;
1914+ struct list_head *nodelist;
1915+ int num_nodes = 0;
c1c6733f 1916+
bb1d8b11
AM
1917+ if (arg == 0)
1918+ return cluster_members;
c1c6733f 1919+
bb1d8b11
AM
1920+ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1921+ return -EFAULT;
c1c6733f 1922+
bb1d8b11 1923+ down(&cluster_members_lock);
c1c6733f 1924+
bb1d8b11
AM
1925+ if (user_format_nodelist.max_members < cluster_members) {
1926+ up(&cluster_members_lock);
1927+ return -E2BIG;
1928+ }
c1c6733f 1929+
bb1d8b11 1930+ user_node = user_format_nodelist.nodes;
c1c6733f 1931+
bb1d8b11
AM
1932+ list_for_each(nodelist, &cluster_members_list) {
1933+ node = list_entry(nodelist, struct cluster_node, list);
1934+ if (node->state == NODESTATE_MEMBER) {
1935+ copy_to_usernode(node, &user_format_node);
1936+ if (copy_to_user(user_node, &user_format_node,
1937+ sizeof (struct cl_cluster_node))) {
1938+ up(&cluster_members_lock);
1939+ return -EFAULT;
1940+ }
1941+ user_node++;
1942+ num_nodes++;
1943+ }
1944+ }
1945+ up(&cluster_members_lock);
c1c6733f 1946+
bb1d8b11
AM
1947+ return num_nodes;
1948+}
c1c6733f 1949+
bb1d8b11
AM
1950+static int do_ioctl_get_all_members(unsigned long arg)
1951+{
1952+ struct cluster_node *node;
1953+ /* Kernel copies */
1954+ struct cl_cluster_node user_format_node;
1955+ struct cl_cluster_nodelist user_format_nodelist;
1956+ /* User space array ptr*/
1957+ struct cl_cluster_node *user_node;
1958+ struct list_head *nodelist;
1959+ int num_nodes = 0;
c1c6733f 1960+
bb1d8b11
AM
1961+ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1962+ return -EFAULT;
c1c6733f 1963+
bb1d8b11 1964+ down(&cluster_members_lock);
c1c6733f 1965+
bb1d8b11 1966+ user_node = user_format_nodelist.nodes;
c1c6733f 1967+
bb1d8b11
AM
1968+ list_for_each(nodelist, &cluster_members_list) {
1969+ node = list_entry(nodelist, struct cluster_node, list);
1970+ if (arg) {
1971+ copy_to_usernode(node,
1972+ &user_format_node);
1973+
1974+ if (copy_to_user(user_node, &user_format_node,
1975+ sizeof (struct cl_cluster_node))) {
1976+ up(&cluster_members_lock);
1977+ return -EFAULT;
1978+ }
1979+ user_node++;
1980+ if (--user_format_nodelist.max_members < 0) {
1981+ num_nodes = -EFAULT;
1982+ goto err_exit;
1983+ }
1984+
1985+ }
1986+ num_nodes++;
b7b72b66 1987+ }
bb1d8b11
AM
1988+ err_exit:
1989+ up(&cluster_members_lock);
1990+
1991+ return num_nodes;
c1c6733f
AM
1992+}
1993+
bb1d8b11
AM
1994+
1995+static int do_ioctl_get_cluster(unsigned long arg)
c1c6733f 1996+{
bb1d8b11 1997+ struct cl_cluster_info __user *info;
c1c6733f 1998+
bb1d8b11 1999+ info = (struct cl_cluster_info *)arg;
b7b72b66 2000+
bb1d8b11
AM
2001+ if (copy_to_user(&info->number, &cluster_id, sizeof(cluster_id)))
2002+ return -EFAULT;
b7b72b66 2003+
bb1d8b11
AM
2004+ if (copy_to_user(&info->name, cluster_name, strlen(cluster_name)+1))
2005+ return -EFAULT;
b7b72b66 2006+
bb1d8b11
AM
2007+ return 0;
2008+}
b7b72b66 2009+
bb1d8b11
AM
2010+static int do_ioctl_get_node(unsigned long arg)
2011+{
2012+ struct cluster_node *node;
2013+ struct cl_cluster_node k_node, *u_node;
b7b72b66 2014+
bb1d8b11 2015+ u_node = (struct cl_cluster_node *) arg;
b7b72b66 2016+
bb1d8b11
AM
2017+ if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
2018+ return -EFAULT;
c1c6733f 2019+
bb1d8b11
AM
2020+ if (!k_node.name[0]) {
2021+ if (k_node.node_id == 0)
2022+ k_node.node_id = us->node_id;
2023+ node = find_node_by_nodeid(k_node.node_id);
2024+ }
2025+ else
2026+ node = find_node_by_name(k_node.name);
c1c6733f 2027+
bb1d8b11
AM
2028+ if (!node)
2029+ return -ENOENT;
c1c6733f 2030+
bb1d8b11 2031+ copy_to_usernode(node, &k_node);
b7b72b66 2032+
bb1d8b11
AM
2033+ if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
2034+ return -EFAULT;
c1c6733f 2035+
bb1d8b11
AM
2036+ return 0;
2037+}
c1c6733f 2038+
bb1d8b11
AM
2039+static int do_ioctl_set_expected(unsigned long arg)
2040+{
2041+ struct list_head *nodelist;
2042+ struct cluster_node *node;
2043+ unsigned int total_votes;
2044+ unsigned int newquorum;
c1c6733f 2045+
bb1d8b11
AM
2046+ if (!capable(CAP_CLUSTER))
2047+ return -EPERM;
2048+ if (arg == 0)
2049+ return -EINVAL;
c1c6733f 2050+
bb1d8b11 2051+ newquorum = calculate_quorum(1, arg, &total_votes);
c1c6733f 2052+
bb1d8b11
AM
2053+ if (newquorum < total_votes / 2
2054+ || newquorum > total_votes) {
2055+ return -EINVAL;
2056+ }
c1c6733f 2057+
bb1d8b11
AM
2058+ /* Now do it */
2059+ down(&cluster_members_lock);
2060+ list_for_each(nodelist, &cluster_members_list) {
2061+ node = list_entry(nodelist, struct cluster_node, list);
2062+ if (node->state == NODESTATE_MEMBER
2063+ && node->expected_votes > arg) {
2064+ node->expected_votes = arg;
c1c6733f
AM
2065+ }
2066+ }
bb1d8b11 2067+ up(&cluster_members_lock);
c1c6733f 2068+
bb1d8b11 2069+ recalculate_quorum(1);
c1c6733f 2070+
bb1d8b11
AM
2071+ send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
2072+ sm_member_update(cluster_is_quorate);
2073+
2074+ return 0;
c1c6733f
AM
2075+}
2076+
bb1d8b11 2077+static int do_ioctl_kill_node(unsigned long arg)
c1c6733f 2078+{
bb1d8b11 2079+ struct cluster_node *node;
c1c6733f 2080+
bb1d8b11
AM
2081+ if (!capable(CAP_CLUSTER))
2082+ return -EPERM;
c1c6733f 2083+
c1c6733f 2084+
bb1d8b11
AM
2085+ if ((node = find_node_by_nodeid(arg)) == NULL)
2086+ return -EINVAL;
c1c6733f 2087+
bb1d8b11
AM
2088+ /* Can't kill us */
2089+ if (node->us)
2090+ return -EINVAL;
c1c6733f 2091+
bb1d8b11
AM
2092+ if (node->state != NODESTATE_MEMBER)
2093+ return -EINVAL;
c1c6733f 2094+
bb1d8b11
AM
2095+ /* Just in case it is alive, send a KILL message */
2096+ send_kill(arg);
c1c6733f 2097+
bb1d8b11
AM
2098+ node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
2099+ a_node_just_died(node);
c1c6733f 2100+
bb1d8b11
AM
2101+ return 0;
2102+}
c1c6733f 2103+
bb1d8b11
AM
2104+static int do_ioctl_barrier(unsigned long arg)
2105+{
2106+ struct cl_barrier_info info;
c1c6733f 2107+
bb1d8b11
AM
2108+ if (!capable(CAP_CLUSTER))
2109+ return -EPERM;
c1c6733f 2110+
bb1d8b11
AM
2111+ if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
2112+ return -EFAULT;
2113+
2114+ switch (info.cmd) {
2115+ case BARRIER_IOCTL_REGISTER:
2116+ return kcl_barrier_register(info.name,
2117+ info.flags,
2118+ info.arg);
2119+ case BARRIER_IOCTL_CHANGE:
2120+ return kcl_barrier_setattr(info.name,
2121+ info.flags,
2122+ info.arg);
2123+ case BARRIER_IOCTL_WAIT:
2124+ return kcl_barrier_wait(info.name);
2125+ case BARRIER_IOCTL_DELETE:
2126+ return kcl_barrier_delete(info.name);
2127+ default:
2128+ return -EINVAL;
2129+ }
c1c6733f
AM
2130+}
2131+
bb1d8b11 2132+static int do_ioctl_islistening(unsigned long arg)
c1c6733f 2133+{
bb1d8b11
AM
2134+ DECLARE_WAITQUEUE(wq, current);
2135+ struct cl_listen_request rq;
2136+ struct cluster_node *rem_node;
2137+ int nodeid;
2138+ int result;
2139+ struct cl_waiting_listen_request *listen_request;
c1c6733f 2140+
bb1d8b11
AM
2141+ if (!arg)
2142+ return -EINVAL;
c1c6733f 2143+
bb1d8b11
AM
2144+ if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
2145+ return -EFAULT;
c1c6733f 2146+
bb1d8b11
AM
2147+ nodeid = rq.nodeid;
2148+ if (!nodeid)
2149+ nodeid = us->node_id;
c1c6733f 2150+
bb1d8b11
AM
2151+ rem_node = find_node_by_nodeid(nodeid);
2152+
2153+ /* Node not in the cluster */
2154+ if (!rem_node)
2155+ return -ENOENT;
2156+
2157+ if (rem_node->state != NODESTATE_MEMBER)
2158+ return -ENOTCONN;
2159+
2160+ /* If the request is for us then just look in the ports
2161+ * array */
2162+ if (rem_node->us)
2163+ return (port_array[rq.port] != 0) ? 1 : 0;
2164+
2165+ /* For a remote node we need to send a request out */
2166+
2167+ /* If we are in transition then wait until we are not */
2168+ while (in_transition()) {
2169+ set_task_state(current, TASK_INTERRUPTIBLE);
2170+ add_wait_queue(&socket_waitq, &wq);
2171+
2172+ if (in_transition())
2173+ schedule();
2174+
2175+ set_task_state(current, TASK_RUNNING);
2176+ remove_wait_queue(&socket_waitq, &wq);
c1c6733f 2177+
bb1d8b11
AM
2178+ if (signal_pending(current))
2179+ return -EINTR;
c1c6733f 2180+ }
c1c6733f 2181+
bb1d8b11
AM
2182+ /* Were we shut down before it completed ? */
2183+ if (!atomic_read(&cnxman_running))
2184+ return -ENOTCONN;
c1c6733f 2185+
bb1d8b11
AM
2186+ listen_request =
2187+ kmalloc(sizeof (struct cl_waiting_listen_request),
2188+ GFP_KERNEL);
2189+ if (!listen_request)
2190+ return -ENOMEM;
c1c6733f 2191+
bb1d8b11
AM
2192+ /* Build the request */
2193+ listen_request->waiting = 1;
2194+ listen_request->result = 0;
2195+ listen_request->tag = current->pid;
2196+ listen_request->nodeid = nodeid;
2197+ init_waitqueue_head(&listen_request->waitq);
c1c6733f 2198+
bb1d8b11
AM
2199+ down(&listenreq_lock);
2200+ list_add(&listen_request->list, &listenreq_list);
2201+ up(&listenreq_lock);
c1c6733f 2202+
bb1d8b11
AM
2203+ /* Now wait for the response to come back */
2204+ send_listen_request(rq.nodeid, rq.port);
c1c6733f 2205+
bb1d8b11
AM
2206+ while (listen_request->waiting) {
2207+ set_task_state(current, TASK_INTERRUPTIBLE);
2208+ add_wait_queue(&listen_request->waitq, &wq);
c1c6733f 2209+
bb1d8b11
AM
2210+ if (listen_request->waiting)
2211+ schedule();
c1c6733f 2212+
bb1d8b11
AM
2213+ set_task_state(current, TASK_RUNNING);
2214+ remove_wait_queue(&listen_request->waitq, &wq);
c1c6733f 2215+
bb1d8b11
AM
2216+ if (signal_pending(current)) {
2217+ result = -ERESTARTSYS;
2218+ goto end_listen;
2219+ }
2220+ }
2221+ result = listen_request->result;
c1c6733f 2222+
bb1d8b11
AM
2223+ end_listen:
2224+ down(&listenreq_lock);
2225+ list_del(&listen_request->list);
2226+ kfree(listen_request);
2227+ up(&listenreq_lock);
2228+ return result;
c1c6733f
AM
2229+}
2230+
bb1d8b11 2231+static int do_ioctl_set_votes(unsigned long arg)
c1c6733f 2232+{
bb1d8b11
AM
2233+ unsigned int total_votes;
2234+ unsigned int newquorum;
2235+ int saved_votes;
c1c6733f 2236+
bb1d8b11
AM
2237+ if (!capable(CAP_CLUSTER))
2238+ return -EPERM;
c1c6733f 2239+
bb1d8b11
AM
2240+ /* Check votes is valid */
2241+ saved_votes = us->votes;
2242+ us->votes = arg;
c1c6733f 2243+
bb1d8b11 2244+ newquorum = calculate_quorum(1, 0, &total_votes);
c1c6733f 2245+
bb1d8b11
AM
2246+ if (newquorum < total_votes / 2 || newquorum > total_votes) {
2247+ us->votes = saved_votes;
2248+ return -EINVAL;
c1c6733f 2249+ }
c1c6733f 2250+
bb1d8b11 2251+ recalculate_quorum(1);
c1c6733f 2252+
bb1d8b11
AM
2253+ send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
2254+
2255+ return 0;
2256+}
2257+
2258+static int do_ioctl_pass_socket(unsigned long arg)
c1c6733f 2259+{
bb1d8b11
AM
2260+ struct cl_passed_sock sock_info;
2261+ struct file *file;
2262+ int error;
c1c6733f 2263+
bb1d8b11
AM
2264+ if (!capable(CAP_CLUSTER))
2265+ return -EPERM;
c1c6733f 2266+
bb1d8b11
AM
2267+ if (atomic_read(&cnxman_running))
2268+ return -EINVAL;
c1c6733f 2269+
bb1d8b11 2270+ error = -EBADF;
c1c6733f 2271+
bb1d8b11
AM
2272+ if (copy_from_user(&sock_info, (void *)arg, sizeof(sock_info)))
2273+ return -EFAULT;
c783755a 2274+
bb1d8b11
AM
2275+ file = fget(sock_info.fd);
2276+ if (file) {
2277+ struct inode *inode = file->f_dentry->d_inode;
c1c6733f 2278+
bb1d8b11
AM
2279+ error = add_clsock(sock_info.multicast,
2280+ sock_info.number, SOCKET_I(inode),
2281+ file);
2282+ if (error)
2283+ fput(file);
2284+ }
2285+ return error;
c1c6733f 2286+
bb1d8b11 2287+}
c1c6733f 2288+
bb1d8b11
AM
2289+static int do_ioctl_set_nodename(unsigned long arg)
2290+{
2291+ if (!capable(CAP_CLUSTER))
2292+ return -EPERM;
2293+ if (atomic_read(&cnxman_running))
2294+ return -EINVAL;
2295+ if (strncpy_from_user(nodename, (void *)arg, MAX_CLUSTER_MEMBER_NAME_LEN) < 0)
2296+ return -EFAULT;
2297+ return 0;
2298+}
c1c6733f 2299+
bb1d8b11
AM
2300+static int do_ioctl_set_nodeid(unsigned long arg)
2301+{
2302+ int nodeid = (int)arg;
c1c6733f 2303+
bb1d8b11
AM
2304+ if (!capable(CAP_CLUSTER))
2305+ return -EPERM;
2306+ if (atomic_read(&cnxman_running))
2307+ return -EINVAL;
2308+ if (nodeid < 0 || nodeid > 4096)
2309+ return -EINVAL;
c1c6733f 2310+
bb1d8b11
AM
2311+ wanted_nodeid = (int)arg;
2312+ return 0;
2313+}
c1c6733f 2314+
bb1d8b11
AM
2315+static int do_ioctl_join_cluster(unsigned long arg)
2316+{
2317+ struct cl_join_cluster_info join_info;
c1c6733f 2318+
bb1d8b11
AM
2319+ if (!capable(CAP_CLUSTER))
2320+ return -EPERM;
c1c6733f 2321+
bb1d8b11
AM
2322+ if (atomic_read(&cnxman_running))
2323+ return -EALREADY;
c783755a 2324+
bb1d8b11
AM
2325+ if (copy_from_user(&join_info, (void *)arg, sizeof (struct cl_join_cluster_info) ))
2326+ return -EFAULT;
c1c6733f 2327+
bb1d8b11
AM
2328+ if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
2329+ return -EINVAL;
c1c6733f 2330+
bb1d8b11
AM
2331+ if (list_empty(&socket_list))
2332+ return -ENOTCONN;
c1c6733f 2333+
bb1d8b11
AM
2334+ set_votes(join_info.votes, join_info.expected_votes);
2335+ cluster_id = generate_cluster_id(join_info.cluster_name);
2336+ strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
2337+ two_node = join_info.two_node;
2338+ config_version = join_info.config_version;
c1c6733f 2339+
bb1d8b11
AM
2340+ quit_threads = 0;
2341+ acks_expected = 0;
2342+ init_completion(&cluster_thread_comp);
2343+ init_completion(&member_thread_comp);
2344+ if (allocate_nodeid_array())
2345+ return -ENOMEM;
c1c6733f 2346+
bb1d8b11
AM
2347+ kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
2348+ if (kcluster_pid < 0)
2349+ return kcluster_pid;
c1c6733f 2350+
bb1d8b11
AM
2351+ wait_for_completion(&cluster_thread_comp);
2352+ init_completion(&cluster_thread_comp);
b7b72b66 2353+
bb1d8b11 2354+ atomic_set(&cnxman_running, 1);
b7b72b66 2355+
bb1d8b11
AM
2356+ /* Make sure we have a node name */
2357+ if (nodename[0] == '\0')
2358+ strcpy(nodename, system_utsname.nodename);
c1c6733f 2359+
bb1d8b11
AM
2360+ membership_pid = start_membership_services(kcluster_pid);
2361+ if (membership_pid < 0) {
2362+ quit_threads = 1;
2363+ wait_for_completion(&cluster_thread_comp);
2364+ init_completion(&member_thread_comp);
2365+ return membership_pid;
c1c6733f
AM
2366+ }
2367+
bb1d8b11
AM
2368+ sm_start();
2369+ return 0;
2370+}
c1c6733f 2371+
bb1d8b11
AM
2372+static int do_ioctl_leave_cluster(unsigned long leave_flags)
2373+{
2374+ if (!capable(CAP_CLUSTER))
2375+ return -EPERM;
c1c6733f 2376+
bb1d8b11
AM
2377+ if (!atomic_read(&cnxman_running))
2378+ return -ENOTCONN;
c783755a 2379+
bb1d8b11
AM
2380+ if (in_transition())
2381+ return -EBUSY;
c783755a 2382+
bb1d8b11
AM
2383+ /* Ignore the use count if FORCE is set */
2384+ if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
2385+ if (atomic_read(&use_count))
2386+ return -ENOTCONN;
c783755a
AM
2387+ }
2388+
bb1d8b11
AM
2389+ us->leave_reason = leave_flags;
2390+ quit_threads = 1;
2391+ wake_up_interruptible(&cnxman_waitq);
c1c6733f 2392+
bb1d8b11
AM
2393+ wait_for_completion(&cluster_thread_comp);
2394+ atomic_set(&use_count, 0);
2395+ return 0;
2396+}
c1c6733f 2397+
bb1d8b11
AM
2398+static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2399+{
2400+ int err = -EOPNOTSUPP;
2401+ struct list_head *proclist;
2402+ struct list_head *tmp;
2403+ struct notify_struct *notify;
2404+ struct cl_version cnxman_version;
c1c6733f 2405+
bb1d8b11
AM
2406+ switch (cmd) {
2407+ /* Process requests notification of cluster events */
2408+ case SIOCCLUSTER_NOTIFY:
2409+ notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
2410+ if (!notify)
2411+ return -ENOMEM;
2412+ notify->pid = current->pid;
2413+ notify->signal = arg;
2414+ down(&event_listener_lock);
2415+ list_add(&notify->list, &event_listener_list);
2416+ up(&event_listener_lock);
2417+ err = 0;
2418+ break;
c1c6733f 2419+
bb1d8b11
AM
2420+ /* Process is no longer interested cluster events */
2421+ case SIOCCLUSTER_REMOVENOTIFY:
2422+ err = EINVAL;
c1c6733f 2423+
bb1d8b11
AM
2424+ down(&event_listener_lock);
2425+ list_for_each_safe(proclist, tmp, &event_listener_list) {
2426+ notify =
2427+ list_entry(proclist, struct notify_struct, list);
2428+ if (notify->pid == current->pid) {
2429+ list_del(&notify->list);
2430+ kfree(notify);
2431+ err = 0;
2432+ }
2433+ }
2434+ up(&event_listener_lock);
2435+ break;
c1c6733f 2436+
bb1d8b11
AM
2437+ /* Return the cnxman version number */
2438+ case SIOCCLUSTER_GET_VERSION:
2439+ if (!arg)
2440+ return -EINVAL;
2441+ err = 0;
2442+ cnxman_version.major = CNXMAN_MAJOR_VERSION;
2443+ cnxman_version.minor = CNXMAN_MINOR_VERSION;
2444+ cnxman_version.patch = CNXMAN_PATCH_VERSION;
2445+ cnxman_version.config = config_version;
2446+ if (copy_to_user((void *) arg, &cnxman_version,
2447+ sizeof (struct cl_version))) {
2448+ return -EFAULT;
2449+ }
2450+ break;
c1c6733f 2451+
bb1d8b11
AM
2452+ /* Set the cnxman config version number */
2453+ case SIOCCLUSTER_SET_VERSION:
2454+ err = do_ioctl_set_version(arg);
2455+ break;
c1c6733f 2456+
bb1d8b11
AM
2457+ /* Return the active membership list */
2458+ case SIOCCLUSTER_GETMEMBERS:
2459+ err = do_ioctl_get_members(arg);
2460+ break;
c1c6733f 2461+
bb1d8b11
AM
2462+ /* Return the full membership list include dead nodes */
2463+ case SIOCCLUSTER_GETALLMEMBERS:
2464+ err = do_ioctl_get_all_members(arg);
2465+ break;
c1c6733f 2466+
bb1d8b11
AM
2467+ case SIOCCLUSTER_GETNODE:
2468+ err = do_ioctl_get_node(arg);
2469+ break;
c783755a 2470+
bb1d8b11
AM
2471+ case SIOCCLUSTER_GETCLUSTER:
2472+ err = do_ioctl_get_cluster(arg);
2473+ break;
c783755a 2474+
bb1d8b11
AM
2475+ case SIOCCLUSTER_ISQUORATE:
2476+ return cluster_is_quorate;
c1c6733f 2477+
bb1d8b11
AM
2478+ case SIOCCLUSTER_ISACTIVE:
2479+ return atomic_read(&cnxman_running);
c1c6733f 2480+
bb1d8b11
AM
2481+ case SIOCCLUSTER_SETEXPECTED_VOTES:
2482+ err = do_ioctl_set_expected(arg);
2483+ break;
c1c6733f 2484+
bb1d8b11
AM
2485+ /* Change the number of votes for this node */
2486+ case SIOCCLUSTER_SET_VOTES:
2487+ err = do_ioctl_set_votes(arg);
2488+ break;
c1c6733f 2489+
bb1d8b11
AM
2490+ /* Return 1 if the specified node is listening on a given port */
2491+ case SIOCCLUSTER_ISLISTENING:
2492+ err = do_ioctl_islistening(arg);
2493+ break;
c1c6733f 2494+
bb1d8b11
AM
2495+ /* Forcibly kill a node */
2496+ case SIOCCLUSTER_KILLNODE:
2497+ err = do_ioctl_kill_node(arg);
2498+ break;
c1c6733f 2499+
bb1d8b11
AM
2500+ case SIOCCLUSTER_GET_JOINCOUNT:
2501+ if (!capable(CAP_CLUSTER))
2502+ return -EPERM;
2503+ else
2504+ return atomic_read(&use_count);
2505+
2506+ /* ioctl interface to the barrier system */
2507+ case SIOCCLUSTER_BARRIER:
2508+ err = do_ioctl_barrier(arg);
2509+ break;
2510+
2511+ case SIOCCLUSTER_PASS_SOCKET:
2512+ if (sock->sk->sk_protocol != CLPROTO_MASTER)
2513+ err = -EOPNOTSUPP;
2514+ else
2515+ err = do_ioctl_pass_socket(arg);
2516+ break;
c1c6733f 2517+
bb1d8b11
AM
2518+ case SIOCCLUSTER_SET_NODENAME:
2519+ if (sock->sk->sk_protocol != CLPROTO_MASTER)
2520+ err = -EOPNOTSUPP;
2521+ else
2522+ err = do_ioctl_set_nodename(arg);
2523+ break;
c1c6733f 2524+
bb1d8b11
AM
2525+ case SIOCCLUSTER_SET_NODEID:
2526+ if (sock->sk->sk_protocol != CLPROTO_MASTER)
2527+ err = -EOPNOTSUPP;
2528+ else
2529+ err = do_ioctl_set_nodeid(arg);
2530+ break;
c1c6733f 2531+
bb1d8b11
AM
2532+ case SIOCCLUSTER_JOIN_CLUSTER:
2533+ if (sock->sk->sk_protocol != CLPROTO_MASTER)
2534+ err = -EOPNOTSUPP;
2535+ else
2536+ err = do_ioctl_join_cluster(arg);
2537+ break;
c1c6733f 2538+
bb1d8b11
AM
2539+ case SIOCCLUSTER_LEAVE_CLUSTER:
2540+ err = do_ioctl_leave_cluster(arg);
2541+ break;
2542+
2543+ default:
2544+ err = sm_ioctl(sock, cmd, arg);
2545+ }
2546+ return err;
c1c6733f
AM
2547+}
2548+
bb1d8b11 2549+static int cl_shutdown(struct socket *sock, int how)
c1c6733f 2550+{
bb1d8b11
AM
2551+ struct sock *sk = sock->sk;
2552+ int err = -ENOTCONN;
c1c6733f 2553+
bb1d8b11 2554+ lock_sock(sk);
c1c6733f 2555+
bb1d8b11
AM
2556+ if (sock->state == SS_UNCONNECTED)
2557+ goto out;
c1c6733f 2558+
bb1d8b11
AM
2559+ err = 0;
2560+ if (sock->state == SS_DISCONNECTING)
2561+ goto out;
c1c6733f 2562+
bb1d8b11 2563+ err = -EINVAL;
c1c6733f 2564+
bb1d8b11
AM
2565+ if (how != SHUTDOWN_MASK)
2566+ goto out;
c1c6733f 2567+
bb1d8b11
AM
2568+ sk->sk_shutdown = how;
2569+ err = 0;
c1c6733f 2570+
bb1d8b11
AM
2571+ out:
2572+ release_sock(sk);
c1c6733f 2573+
bb1d8b11 2574+ return err;
c1c6733f
AM
2575+}
2576+
bb1d8b11
AM
2577+
2578+/* We'll be giving out reward points next... */
2579+/* Send the packet and save a copy in case someone loses theirs. Should be
2580+ * protected by the send mutexphore */
2581+static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
2582+ struct kvec *vec, int veclen,
2583+ int size, int needack)
c1c6733f 2584+{
bb1d8b11
AM
2585+ int result;
2586+ struct kvec save_vectors[veclen];
c1c6733f 2587+
bb1d8b11
AM
2588+ /* Save a copy of the IO vectors as sendmsg mucks around with them and
2589+ * we might want to send the same stuff out more than once (for different
2590+ * interfaces)
2591+ */
2592+ memcpy(save_vectors, vec,
2593+ sizeof (struct kvec) * veclen);
c1c6733f 2594+
bb1d8b11 2595+ result = kernel_sendmsg(csock->sock, msg, vec, veclen, size);
c1c6733f 2596+
bb1d8b11 2597+ if (result >= 0 && acks_expected && needack) {
c1c6733f 2598+
bb1d8b11
AM
2599+ /* Start retransmit timer if it didn't go */
2600+ if (result == 0) {
2601+ start_short_timer();
2602+ }
2603+ else {
2604+ resend_delay = 1;
2605+ }
c1c6733f
AM
2606+ }
2607+
bb1d8b11
AM
2608+ /* Restore IOVs */
2609+ memcpy(vec, save_vectors,
2610+ sizeof (struct kvec) * veclen);
c1c6733f 2611+
bb1d8b11 2612+ return result;
c1c6733f
AM
2613+}
2614+
bb1d8b11 2615+static void resend_last_message()
c1c6733f 2616+{
c1c6733f 2617+ struct msghdr msg;
bb1d8b11
AM
2618+ struct kvec vec[1];
2619+ int result;
c1c6733f 2620+
bb1d8b11
AM
2621+ P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
2622+ jiffies, saved_msg_len, saved_msg_buffer[0],
2623+ saved_msg_buffer[6]);
2624+
2625+ /* Assume there is something wrong with the last interface */
2626+ current_interface = get_next_interface(current_interface);
2627+ if (num_interfaces > 1)
2628+ printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
2629+ current_interface->number);
c1c6733f 2630+
bb1d8b11
AM
2631+ vec[0].iov_base = saved_msg_buffer;
2632+ vec[0].iov_len = saved_msg_len;
c1c6733f
AM
2633+
2634+ memset(&msg, 0, sizeof (msg));
bb1d8b11
AM
2635+ msg.msg_name = &current_interface->saddr;
2636+ msg.msg_namelen = current_interface->addr_len;
c1c6733f 2637+
bb1d8b11 2638+ result = kernel_sendmsg(current_interface->sock, &msg, vec, 1, saved_msg_len);
c1c6733f 2639+
bb1d8b11
AM
2640+ if (result < 0)
2641+ printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
c1c6733f 2642+
bb1d8b11
AM
2643+ /* Try indefinitely to send this, the backlog must die down eventually
2644+ * !? */
2645+ if (result == 0)
2646+ start_short_timer();
2647+
2648+ /* Send succeeded, continue waiting for ACKS */
2649+ if (result > 0)
2650+ start_ack_timer();
c1c6733f 2651+
c1c6733f
AM
2652+}
2653+
bb1d8b11
AM
2654+static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
2655+ struct msghdr *msg, size_t size, int flags)
c1c6733f 2656+{
bb1d8b11
AM
2657+ struct sock *sk = sock->sk;
2658+ struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
2659+ struct sk_buff *skb;
2660+ struct cb_info *cbinfo;
2661+ int copied, err = 0;
c1c6733f 2662+
bb1d8b11
AM
2663+ /* Socket was notified of shutdown, remove any pending skbs and return
2664+ * EOF */
2665+ if (!atomic_read(&cnxman_running)) {
2666+ while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
2667+ skb_free_datagram(sk, skb);
2668+ return 0; /* cnxman has left the building */
2669+ }
c1c6733f 2670+
bb1d8b11
AM
2671+ /* Generic datagram code does most of the work. If the user is not
2672+ * interested in OOB messages then ignore them */
2673+ do {
2674+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2675+ if (!skb)
2676+ goto out;
c1c6733f 2677+
bb1d8b11 2678+ cbinfo = (struct cb_info *)skb->cb;
c1c6733f 2679+
bb1d8b11
AM
2680+ /* If it is OOB and the user doesn't want it, then throw it away. */
2681+ if (cbinfo->oob && !(flags & MSG_OOB)) {
2682+ skb_free_datagram(sk, skb);
2683+
2684+ /* If we peeked (?) an OOB but the user doesn't want it
2685+ then we need to discard it or we'll loop forever */
2686+ if (flags & MSG_PEEK) {
2687+ skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
2688+ MSG_DONTWAIT, &err);
2689+ if (skb)
2690+ skb_free_datagram(sk, skb);
2691+ }
2692+ }
2693+ else
2694+ break;
c1c6733f 2695+ }
bb1d8b11 2696+ while (cbinfo->oob && !(flags & MSG_OOB));
c1c6733f 2697+
bb1d8b11
AM
2698+ copied = skb->len;
2699+ if (copied > size) {
2700+ copied = size;
2701+ msg->msg_flags |= MSG_TRUNC;
c1c6733f 2702+ }
bb1d8b11 2703+ err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
c1c6733f 2704+
bb1d8b11
AM
2705+ if (err)
2706+ goto out_free;
c1c6733f 2707+
bb1d8b11
AM
2708+ if (msg->msg_name && msg->msg_namelen) {
2709+ memset(msg->msg_name, 0, msg->msg_namelen);
c1c6733f 2710+
bb1d8b11
AM
2711+ if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
2712+
2713+ /* Nodeid is in native byte order - anything else is just
2714+ * perverse */
2715+ sin->scl_nodeid = cbinfo->orig_nodeid;
2716+ }
2717+ msg->msg_namelen = sizeof (struct sockaddr_cl);
2718+ sin->scl_port = cbinfo->orig_port;
2719+ }
2720+
2721+ if (cbinfo->oob) {
2722+ msg->msg_flags |= MSG_OOB;
2723+ }
2724+
2725+ sock_recv_timestamp(msg, sk, skb);
c1c6733f 2726+
bb1d8b11 2727+ err = copied;
c1c6733f 2728+
bb1d8b11
AM
2729+ out_free:
2730+ skb_free_datagram(sk, skb);
c1c6733f 2731+
bb1d8b11
AM
2732+ out:
2733+ return err;
c1c6733f
AM
2734+}
2735+
bb1d8b11
AM
2736+/* Send a message out on all interfaces */
2737+static int send_to_all_ints(int nodeid, struct msghdr *our_msg,
2738+ struct kvec *vec, int veclen, int size, int flags)
c1c6733f 2739+{
bb1d8b11
AM
2740+ struct sockaddr_in6 daddr;
2741+ struct cl_comms_socket *clsock;
2742+ int result = 0;
c1c6733f 2743+
bb1d8b11 2744+ our_msg->msg_name = &daddr;
c1c6733f 2745+
bb1d8b11 2746+ list_for_each_entry(clsock, &socket_list, list) {
c1c6733f 2747+
bb1d8b11
AM
2748+ /* Don't send out a recv-only socket */
2749+ if (!clsock->recv_only) {
c1c6733f 2750+
bb1d8b11
AM
2751+ /* For temporary node IDs send to the node's real IP address */
2752+ if (nodeid < 0) {
2753+ get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
2754+ }
2755+ else {
2756+ memcpy(&daddr, &clsock->saddr, clsock->addr_len);
2757+ our_msg->msg_namelen = clsock->addr_len;
2758+ }
c1c6733f 2759+
bb1d8b11
AM
2760+ result = __send_and_save(clsock, our_msg, vec, veclen,
2761+ size + sizeof (struct cl_protheader),
2762+ !(flags & MSG_NOACK));
2763+ }
2764+ }
2765+ return result;
c1c6733f
AM
2766+}
2767+
bb1d8b11
AM
2768+
2769+/* Internal common send message routine */
2770+static int __sendmsg(struct socket *sock, struct msghdr *msg,
2771+ struct kvec *vec, int veclen, int size,
2772+ unsigned char port)
c1c6733f 2773+{
bb1d8b11
AM
2774+ int result = 0, i;
2775+ int flags = msg->msg_flags;
2776+ struct msghdr our_msg;
2777+ struct sockaddr_cl *caddr = msg->msg_name;
2778+ struct cl_protheader header;
2779+ struct kvec vectors[veclen + 1];
2780+ unsigned char srcport;
2781+ int nodeid = 0;
c1c6733f 2782+
bb1d8b11
AM
2783+ if (size > MAX_CLUSTER_MESSAGE)
2784+ return -EINVAL;
2785+ if (!atomic_read(&cnxman_running))
2786+ return -ENOTCONN;
c1c6733f 2787+
bb1d8b11
AM
2788+ if (caddr)
2789+ nodeid = caddr->scl_nodeid;
c1c6733f 2790+
bb1d8b11
AM
2791+ /* Check that the node id (if present) is valid */
2792+ if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
2793+ !is_valid_temp_nodeid(nodeid))) {
2794+ return -ENOTCONN;
2795+ }
2796+
2797+ /* If there's no sending client socket then the source
2798+ port is 0: "us" */
2799+ if (sock) {
2800+ struct cluster_sock *csock = cluster_sk(sock->sk);
2801+ srcport = csock->port;
c1c6733f
AM
2802+ }
2803+ else {
bb1d8b11 2804+ srcport = 0;
c1c6733f
AM
2805+ }
2806+
bb1d8b11
AM
2807+ /* We can only have one send outstanding at a time so we might as well
2808+ * lock the whole send mechanism */
2809+ down(&send_lock);
c1c6733f 2810+
bb1d8b11
AM
2811+ while ((port > HIGH_PROTECTED_PORT
2812+ && (!cluster_is_quorate || in_transition()))
2813+ || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
c1c6733f 2814+
bb1d8b11
AM
2815+ DECLARE_WAITQUEUE(wq, current);
2816+ struct task_struct *tsk = current;
c1c6733f 2817+
bb1d8b11
AM
2818+ if (flags & MSG_DONTWAIT) {
2819+ up(&send_lock);
2820+ return -EAGAIN;
2821+ }
c1c6733f 2822+
bb1d8b11
AM
2823+ if (current->pid == kcluster_pid) {
2824+ P_COMMS
2825+ ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
2826+ port, ack_count, acks_expected);
2827+ up(&send_lock);
2828+ return -EAGAIN;
2829+ }
c1c6733f 2830+
bb1d8b11
AM
2831+ P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
2832+ ack_count, acks_expected);
c1c6733f
AM
2833+
2834+ set_task_state(tsk, TASK_INTERRUPTIBLE);
2835+ add_wait_queue(&socket_waitq, &wq);
2836+
bb1d8b11
AM
2837+ if ((port > HIGH_PROTECTED_PORT
2838+ && (!cluster_is_quorate || in_transition()))
2839+ || (acks_expected > 0)) {
2840+
2841+ up(&send_lock);
c1c6733f 2842+ schedule();
bb1d8b11 2843+ down(&send_lock);
c1c6733f
AM
2844+ }
2845+
2846+ set_task_state(tsk, TASK_RUNNING);
2847+ remove_wait_queue(&socket_waitq, &wq);
bb1d8b11
AM
2848+
2849+ /* Going down */
2850+ if (quit_threads) {
2851+ up(&send_lock);
2852+ return -ENOTCONN;
2853+ }
2854+
2855+ if (signal_pending(current)) {
2856+ up(&send_lock);
2857+ return -ERESTARTSYS;
2858+ }
2859+
2860+ /* Were we shut down in the meantime ? */
2861+ if (!atomic_read(&cnxman_running)) {
2862+ up(&send_lock);
2863+ return -ENOTCONN;
2864+ }
2865+
c1c6733f 2866+ }
c1c6733f 2867+
bb1d8b11 2868+ memset(&our_msg, 0, sizeof (our_msg));
c1c6733f
AM
2869+
2870+ /* Build the header */
bb1d8b11
AM
2871+ header.tgtport = port;
2872+ header.srcport = srcport;
2873+ header.flags = msg->msg_flags;
2874+ header.cluster = cpu_to_le16(cluster_id);
2875+ header.srcid = us ? cpu_to_le32(us->node_id) : 0;
2876+ header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
c1c6733f 2877+
bb1d8b11
AM
2878+ ++cur_seq;
2879+ header.seq = cpu_to_le16(cur_seq);
2880+ header.ack = 0;
c1c6733f 2881+
bb1d8b11
AM
2882+ if (header.tgtid) {
2883+ struct cluster_node *remnode;
c1c6733f 2884+
bb1d8b11
AM
2885+ remnode = find_node_by_nodeid(nodeid);
2886+ if (remnode) {
2887+ header.ack = cpu_to_le16(remnode->last_seq_recv);
2888+ }
c1c6733f
AM
2889+ }
2890+
bb1d8b11
AM
2891+ /* Set the MULTICAST flag on messages with no particular destination */
2892+ if (!msg->msg_namelen) {
2893+ header.flags |= MSG_MULTICAST;
2894+ header.tgtid = 0;
2895+ }
c783755a 2896+
bb1d8b11
AM
2897+ /* Loopback shortcut */
2898+ if (nodeid == us->node_id && nodeid != 0) {
c1c6733f 2899+
bb1d8b11
AM
2900+ up(&send_lock);
2901+ header.flags |= MSG_NOACK; /* Don't ack it! */
c1c6733f 2902+
bb1d8b11
AM
2903+ return send_to_user_port(NULL, &header, msg, vec, veclen, size);
2904+ }
c1c6733f 2905+
bb1d8b11
AM
2906+ /* Copy the existing kvecs into our array and add the header on at the
2907+ * beginning */
2908+ vectors[0].iov_base = &header;
2909+ vectors[0].iov_len = sizeof (header);
2910+ for (i = 0; i < veclen; i++) {
2911+ vectors[i + 1] = vec[i];
2912+ }
c1c6733f 2913+
c1c6733f 2914+
bb1d8b11
AM
2915+ /* Work out how many ACKS are wanted - *don't* reset acks_expected to
2916+ * zero if no acks are required as an ACK-needed message may still be
2917+ * outstanding */
2918+ if (!(msg->msg_flags & MSG_NOACK)) {
2919+ if (msg->msg_namelen)
2920+ acks_expected = 1; /* Unicast */
2921+ else
2922+ acks_expected = max(cluster_members - 1, 0);
c1c6733f 2923+
bb1d8b11 2924+ }
c1c6733f 2925+
bb1d8b11
AM
2926+ P_COMMS
2927+ ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
2928+ nodeid, header.port,
2929+ (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
2930+ le16_to_cpu(header.seq), header.flags);
c1c6733f 2931+
bb1d8b11
AM
2932+ /* Don't include temp nodeids in the message itself */
2933+ if (header.tgtid < 0)
2934+ header.tgtid = 0;
c1c6733f 2935+
bb1d8b11
AM
2936+ /* For non-member sends we use all the interfaces */
2937+ if ((nodeid < 0) || (flags & MSG_ALLINT)) {
c1c6733f 2938+
bb1d8b11
AM
2939+ result = send_to_all_ints(nodeid, &our_msg, vectors, veclen+1,
2940+ size, msg->msg_flags);
2941+ }
2942+ else {
2943+ /* Send to only the current socket - resends will use the
2944+ * others if necessary */
2945+ our_msg.msg_name = &current_interface->saddr;
2946+ our_msg.msg_namelen = current_interface->addr_len;
c1c6733f 2947+
bb1d8b11
AM
2948+ result =
2949+ __send_and_save(current_interface, &our_msg,
2950+ vectors, veclen+1,
2951+ size + sizeof (header),
2952+ !(msg->msg_flags & MSG_NOACK));
2953+ }
c1c6733f 2954+
bb1d8b11
AM
2955+ /* Make a note in each nodes' structure that it has been sent a message
2956+ * so we can see which ones went astray */
2957+ if (!(flags & MSG_NOACK) && nodeid >= 0) {
2958+ if (msg->msg_namelen) {
2959+ struct cluster_node *node;
2960+
2961+ node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
2962+ if (node)
2963+ node->last_seq_sent = cur_seq;
c1c6733f
AM
2964+ }
2965+ else {
bb1d8b11
AM
2966+ struct cluster_node *node;
2967+ struct list_head *nodelist;
2968+
2969+ list_for_each(nodelist, &cluster_members_list) {
2970+ node =
2971+ list_entry(nodelist, struct cluster_node,
2972+ list);
2973+ if (node->state == NODESTATE_MEMBER) {
2974+ node->last_seq_sent = cur_seq;
2975+ }
c1c6733f
AM
2976+ }
2977+ }
2978+ }
c1c6733f 2979+
bb1d8b11
AM
2980+ /* if the client wants a broadcast message sending back to itself
2981+ then loop it back */
2982+ if (nodeid == 0 && (flags & MSG_BCASTSELF)) {
2983+ header.flags |= MSG_NOACK; /* Don't ack it! */
c1c6733f 2984+
bb1d8b11 2985+ result = send_to_user_port(NULL, &header, msg, vec, veclen, size);
c1c6733f 2986+ }
bb1d8b11
AM
2987+
2988+ /* Save a copy of the message if we're expecting an ACK */
2989+ if (!(flags & MSG_NOACK) && acks_expected) {
2990+ struct cl_protheader *savhdr = (struct cl_protheader *) saved_msg_buffer;
2991+
2992+ memcpy_fromkvec(saved_msg_buffer, vectors,
2993+ size + sizeof (header));
2994+
2995+ saved_msg_len = size + sizeof (header);
2996+ retry_count = ack_count = 0;
2997+ clear_bit(RESEND_NEEDED, &mainloop_flags);
2998+
2999+ /* Clear the REPLYEXPected flag so we force a real ACK
3000+ if it's necessary to resend this packet */
3001+ savhdr->flags &= ~MSG_REPLYEXP;
3002+ start_ack_timer();
3003+ }
3004+
3005+ up(&send_lock);
3006+ return result;
c1c6733f
AM
3007+}
3008+
bb1d8b11
AM
3009+static int queue_message(struct socket *sock, void *buf, int len,
3010+ struct sockaddr_cl *caddr,
3011+ unsigned char port, int flags)
c1c6733f 3012+{
bb1d8b11 3013+ struct queued_message *qmsg;
c1c6733f 3014+
bb1d8b11
AM
3015+ qmsg = kmalloc(sizeof (struct queued_message),
3016+ (in_atomic()
3017+ || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
3018+ if (qmsg == NULL)
3019+ return -1;
c1c6733f 3020+
bb1d8b11
AM
3021+ memcpy(qmsg->msg_buffer, buf, len);
3022+ qmsg->msg_len = len;
3023+ if (caddr) {
3024+ memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
3025+ qmsg->addr_len = sizeof (struct sockaddr_cl);
c1c6733f 3026+ }
bb1d8b11
AM
3027+ else {
3028+ qmsg->addr_len = 0;
3029+ }
3030+ qmsg->flags = flags;
3031+ qmsg->port = port;
3032+ qmsg->socket = sock;
3033+
3034+ down(&messages_list_lock);
3035+ list_add_tail(&qmsg->list, &messages_list);
3036+ up(&messages_list_lock);
3037+
3038+ wake_up_interruptible(&cnxman_waitq);
3039+
3040+ return 0;
c1c6733f
AM
3041+}
3042+
bb1d8b11
AM
3043+static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
3044+ struct msghdr *msg, size_t size)
c1c6733f 3045+{
bb1d8b11
AM
3046+ struct cluster_sock *c = cluster_sk(sock->sk);
3047+ char *buffer;
3048+ int status;
3049+ uint8_t port;
3050+ struct kvec vec;
3051+ struct sockaddr_cl *caddr = msg->msg_name;
c1c6733f 3052+
bb1d8b11
AM
3053+ if (sock->sk->sk_protocol == CLPROTO_MASTER)
3054+ return -EOPNOTSUPP;
c1c6733f 3055+
bb1d8b11 3056+ port = c->port;
c1c6733f 3057+
bb1d8b11
AM
3058+ /* Only capable users can override the port number */
3059+ if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
3060+ port = caddr->scl_port;
c1c6733f 3061+
bb1d8b11
AM
3062+ if (port == 0)
3063+ return -EDESTADDRREQ;
3064+
3065+ /* Allocate a kernel buffer for the data so we can put it into a kvec */
3066+ buffer = kmalloc(size, GFP_KERNEL);
3067+ if (!buffer)
3068+ return -ENOMEM;
3069+
3070+ if (memcpy_fromiovec(buffer, msg->msg_iov, size)) {
3071+ status = -EFAULT;
3072+ goto end_send;
c1c6733f
AM
3073+ }
3074+
bb1d8b11
AM
3075+ vec.iov_len = size;
3076+ vec.iov_base = buffer;
c1c6733f 3077+
bb1d8b11 3078+ status = __sendmsg(sock, msg, &vec, 1, size, port);
c1c6733f 3079+
bb1d8b11
AM
3080+ end_send:
3081+ kfree(buffer);
c1c6733f 3082+
bb1d8b11
AM
3083+ return status;
3084+}
c1c6733f 3085+
bb1d8b11
AM
3086+/* Kernel call to sendmsg */
3087+int kcl_sendmsg(struct socket *sock, void *buf, int size,
3088+ struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
3089+{
3090+ struct kvec vecs[1];
3091+ struct msghdr msg;
3092+ struct cluster_sock *c = cluster_sk(sock->sk);
3093+ unsigned char port;
c1c6733f 3094+
bb1d8b11
AM
3095+ if (size > MAX_CLUSTER_MESSAGE)
3096+ return -EINVAL;
3097+ if (!atomic_read(&cnxman_running))
3098+ return -ENOTCONN;
3099+
3100+ port = c->port;
3101+ if (caddr && caddr->scl_port)
3102+ port = caddr->scl_port;
3103+
3104+ if (port == 0)
3105+ return -EDESTADDRREQ;
3106+
3107+ /* If we have no process context then queue it up for kclusterd to
3108+ * send. */
3109+ if (in_interrupt() || flags & MSG_QUEUE) {
3110+ return queue_message(sock, buf, size, caddr, port,
3111+ flags & ~MSG_QUEUE);
b7b72b66 3112+ }
b7b72b66 3113+
bb1d8b11
AM
3114+ vecs[0].iov_base = buf;
3115+ vecs[0].iov_len = size;
c1c6733f 3116+
bb1d8b11
AM
3117+ memset(&msg, 0, sizeof (msg));
3118+ msg.msg_name = caddr;
3119+ msg.msg_namelen = addr_len;
3120+ msg.msg_flags = flags;
c1c6733f 3121+
bb1d8b11 3122+ return __sendmsg(sock, &msg, vecs, 1, size, port);
c1c6733f
AM
3123+}
3124+
bb1d8b11 3125+static int send_queued_message(struct queued_message *qmsg)
c1c6733f 3126+{
bb1d8b11
AM
3127+ struct kvec vecs[1];
3128+ struct msghdr msg;
c1c6733f 3129+
bb1d8b11
AM
3130+ /* Don't send blocked messages */
3131+ if (qmsg->port > HIGH_PROTECTED_PORT
3132+ && (!cluster_is_quorate || in_transition()))
3133+ return -EAGAIN;
c1c6733f 3134+
bb1d8b11
AM
3135+ vecs[0].iov_base = qmsg->msg_buffer;
3136+ vecs[0].iov_len = qmsg->msg_len;
c1c6733f 3137+
bb1d8b11
AM
3138+ memset(&msg, 0, sizeof (msg));
3139+ msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
3140+ msg.msg_namelen = qmsg->addr_len;
3141+ msg.msg_flags = qmsg->flags;
c1c6733f 3142+
bb1d8b11
AM
3143+ return __sendmsg(qmsg->socket, &msg, vecs, 1,
3144+ qmsg->msg_len, qmsg->port);
c1c6733f
AM
3145+}
3146+
bb1d8b11
AM
3147+int kcl_register_read_callback(struct socket *sock,
3148+ int (*routine) (char *, int, char *, int,
3149+ unsigned int))
c1c6733f 3150+{
bb1d8b11 3151+ struct cluster_sock *c = cluster_sk(sock->sk);
c1c6733f 3152+
bb1d8b11 3153+ c->kernel_callback = routine;
c1c6733f 3154+
bb1d8b11 3155+ return 0;
c1c6733f
AM
3156+}
3157+
bb1d8b11
AM
3158+/* Used where we are in kclusterd context and we can't allow the task to wait
3159+ * as we are also responsible to processing the ACKs that do the wake up. Try
3160+ * to send the message immediately and queue it if that's not possible */
3161+static int send_or_queue_message(struct socket *sock, void *buf, int len,
3162+ struct sockaddr_cl *caddr,
3163+ unsigned int flags)
c1c6733f 3164+{
bb1d8b11
AM
3165+ struct kvec vecs[1];
3166+ struct msghdr msg;
3167+ int status;
c1c6733f 3168+
bb1d8b11
AM
3169+ vecs[0].iov_base = buf;
3170+ vecs[0].iov_len = len;
c1c6733f 3171+
bb1d8b11
AM
3172+ memset(&msg, 0, sizeof (msg));
3173+ msg.msg_name = caddr;
3174+ msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
3175+ msg.msg_flags = MSG_DONTWAIT | flags;
c1c6733f 3176+
bb1d8b11
AM
3177+ status = __sendmsg(NULL, &msg, vecs, 1, len, 0);
3178+
3179+ /* Did it work ? */
3180+ if (status > 0) {
3181+ return 0;
c1c6733f 3182+ }
c1c6733f 3183+
bb1d8b11
AM
3184+ /* Failure other than EAGAIN is fatal */
3185+ if (status != -EAGAIN) {
3186+ return status;
c1c6733f 3187+ }
bb1d8b11
AM
3188+
3189+ return queue_message(sock, buf, len, caddr, 0, flags);
c1c6733f
AM
3190+}
3191+
bb1d8b11
AM
3192+/* Send a listen request to a node */
3193+static void send_listen_request(int nodeid, unsigned char port)
c1c6733f 3194+{
bb1d8b11
AM
3195+ struct cl_listenmsg listenmsg;
3196+ struct sockaddr_cl caddr;
c1c6733f 3197+
bb1d8b11 3198+ memset(&caddr, 0, sizeof (caddr));
c1c6733f 3199+
bb1d8b11
AM
3200+ /* Build the header */
3201+ listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
3202+ listenmsg.target_port = port;
3203+ listenmsg.listening = 0;
3204+ listenmsg.tag = current->pid;
c1c6733f 3205+
bb1d8b11
AM
3206+ caddr.scl_family = AF_CLUSTER;
3207+ caddr.scl_port = 0;
3208+ caddr.scl_nodeid = nodeid;
c1c6733f 3209+
bb1d8b11
AM
3210+ send_or_queue_message(NULL, &listenmsg, sizeof(listenmsg), &caddr, MSG_REPLYEXP);
3211+ return;
c1c6733f
AM
3212+}
3213+
bb1d8b11
AM
3214+/* Return 1 or 0 to indicate if we have a listener on the requested port */
3215+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
3216+ unsigned char port, unsigned short tag)
c1c6733f 3217+{
bb1d8b11
AM
3218+ struct cl_listenmsg listenmsg;
3219+ struct sockaddr_cl caddr;
3220+ int status;
c1c6733f 3221+
bb1d8b11 3222+ memset(&caddr, 0, sizeof (caddr));
c1c6733f 3223+
bb1d8b11
AM
3224+ /* Build the message */
3225+ listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
3226+ listenmsg.target_port = port;
3227+ listenmsg.tag = tag;
3228+ listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
c1c6733f 3229+
bb1d8b11
AM
3230+ caddr.scl_family = AF_CLUSTER;
3231+ caddr.scl_port = 0;
3232+ caddr.scl_nodeid = nodeid;
c1c6733f 3233+
bb1d8b11
AM
3234+ status = send_or_queue_message(NULL, &listenmsg,
3235+ sizeof (listenmsg),
3236+ &caddr, 0);
c1c6733f 3237+
bb1d8b11 3238+ return;
c1c6733f
AM
3239+}
3240+
bb1d8b11
AM
3241+/* Send an ACK */
3242+static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
3243+ int addr_len, char *addr, unsigned char remport,
3244+ unsigned char flag)
c1c6733f 3245+{
bb1d8b11
AM
3246+ struct kvec vec;
3247+ struct cl_ackmsg ackmsg;
3248+ struct msghdr msg;
3249+ struct sockaddr_in6 daddr;
3250+ int result;
c1c6733f 3251+
bb1d8b11
AM
3252+#ifdef DEBUG_COMMS
3253+ char buf[MAX_ADDR_PRINTED_LEN];
c1c6733f 3254+
bb1d8b11
AM
3255+ P_COMMS("Sending ACK to %s, seq=%d\n",
3256+ print_addr(addr, address_length, buf), le16_to_cpu(seq));
3257+#endif
3258+
3259+ if (addr) {
3260+ memcpy(&daddr, addr, addr_len);
3261+ }
3262+ else {
3263+ memcpy(&daddr, &csock->saddr, csock->addr_len);
3264+ addr_len = csock->addr_len;
c1c6733f 3265+ }
c1c6733f 3266+
bb1d8b11
AM
3267+ /* Build the header */
3268+ ackmsg.header.tgtport = 0; /* Protocol port */
3269+ ackmsg.header.srcport = 0;
3270+ ackmsg.header.seq = 0;
3271+ ackmsg.header.flags = MSG_NOACK;
3272+ ackmsg.header.cluster = cpu_to_le16(cluster_id);
3273+ ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
3274+ ackmsg.header.ack = seq; /* already in LE order */
3275+ ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
3276+ * to look this up */
3277+ ackmsg.cmd = CLUSTER_CMD_ACK;
3278+ ackmsg.remport = remport;
3279+ ackmsg.aflags = flag;
3280+ vec.iov_base = &ackmsg;
3281+ vec.iov_len = sizeof (ackmsg);
c1c6733f 3282+
bb1d8b11
AM
3283+ memset(&msg, 0, sizeof (msg));
3284+ msg.msg_name = &daddr;
3285+ msg.msg_namelen = addr_len;
c1c6733f 3286+
bb1d8b11 3287+ result = kernel_sendmsg(csock->sock, &msg, &vec, 1, sizeof (ackmsg));
c1c6733f 3288+
bb1d8b11
AM
3289+ if (result < 0)
3290+ printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
3291+
3292+ return result;
c1c6733f 3293+
c1c6733f
AM
3294+}
3295+
bb1d8b11
AM
3296+/* Wait for all ACKS to be gathered */
3297+void kcl_wait_for_all_acks()
c1c6733f 3298+{
bb1d8b11 3299+ while (ack_count < acks_expected) {
c1c6733f 3300+
bb1d8b11
AM
3301+ DECLARE_WAITQUEUE(wq, current);
3302+ struct task_struct *tsk = current;
c1c6733f 3303+
bb1d8b11
AM
3304+ set_task_state(tsk, TASK_INTERRUPTIBLE);
3305+ add_wait_queue(&socket_waitq, &wq);
c1c6733f 3306+
bb1d8b11
AM
3307+ if (ack_count < acks_expected) {
3308+ schedule();
c1c6733f 3309+ }
bb1d8b11
AM
3310+
3311+ set_task_state(tsk, TASK_RUNNING);
3312+ remove_wait_queue(&socket_waitq, &wq);
c1c6733f 3313+ }
c1c6733f
AM
3314+}
3315+
bb1d8b11
AM
3316+/* Send a closedown OOB message to all cluster nodes - this tells them that a
3317+ * port listener has gone away */
3318+static void send_port_close_oob(unsigned char port)
c1c6733f 3319+{
bb1d8b11 3320+ struct cl_closemsg closemsg;
c1c6733f 3321+
bb1d8b11
AM
3322+ /* Build the header */
3323+ closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
3324+ closemsg.port = port;
c1c6733f 3325+
bb1d8b11
AM
3326+ send_or_queue_message(NULL, &closemsg, sizeof (closemsg), NULL, 0);
3327+ return;
c1c6733f
AM
3328+}
3329+
bb1d8b11
AM
3330+/* A remote port has been closed - post an OOB message to the local listen on
3331+ * that port (if there is one) */
3332+static void post_close_oob(unsigned char port, int nodeid)
c1c6733f 3333+{
bb1d8b11
AM
3334+ struct cl_portclosed_oob *oobmsg;
3335+ struct sk_buff *skb;
3336+ struct sock *sock = port_array[port];
3337+ struct cb_info *cbinfo;
c1c6733f 3338+
bb1d8b11
AM
3339+ if (!sock) {
3340+ return; /* No-one listening */
3341+ }
c1c6733f 3342+
bb1d8b11
AM
3343+ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3344+ if (!skb)
c1c6733f
AM
3345+ return;
3346+
bb1d8b11
AM
3347+ skb_put(skb, sizeof (*oobmsg));
3348+ oobmsg = (struct cl_portclosed_oob *) skb->data;
3349+ oobmsg->port = port;
3350+ oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
c1c6733f 3351+
bb1d8b11
AM
3352+ cbinfo = (struct cb_info *)skb->cb;
3353+ cbinfo->oob = 1;
3354+ cbinfo->orig_nodeid = nodeid;
3355+ cbinfo->orig_port = port;
c1c6733f 3356+
bb1d8b11 3357+ sock_queue_rcv_skb(sock, skb);
c1c6733f 3358+
bb1d8b11 3359+}
c1c6733f 3360+
bb1d8b11
AM
3361+/* Leave the cluster */
3362+static void node_shutdown()
3363+{
3364+ struct cl_barrier *barrier;
3365+ struct list_head *blist;
3366+ struct list_head *temp;
3367+ struct list_head *socklist;
3368+ struct cl_client_socket *csock;
3369+ struct sk_buff *null_skb;
c1c6733f 3370+
bb1d8b11
AM
3371+ if (we_are_a_cluster_member)
3372+ printk(KERN_INFO CMAN_NAME ": we are leaving the cluster. %s\n",
3373+ us->leave_reason?leave_string(us->leave_reason):"");
c1c6733f 3374+
bb1d8b11
AM
3375+ atomic_set(&cnxman_running, 0);
3376+ unjam();
c1c6733f 3377+
bb1d8b11
AM
3378+ /* Notify kernel listeners first */
3379+ notify_kernel_listeners(LEAVING, 0);
c1c6733f 3380+
bb1d8b11
AM
3381+ /* Notify client sockets */
3382+ down(&client_socket_lock);
3383+ list_for_each_safe(socklist, temp, &client_socket_list) {
3384+ csock = list_entry(socklist, struct cl_client_socket, list);
c1c6733f 3385+
bb1d8b11
AM
3386+ null_skb = alloc_skb(0, GFP_KERNEL);
3387+ if (null_skb)
3388+ sock_queue_rcv_skb(csock->sock->sk, null_skb);
3389+ list_del(&csock->list);
3390+ kfree(csock);
3391+ }
3392+ up(&client_socket_lock);
3393+ we_are_a_cluster_member = 0;
3394+ cluster_is_quorate = 0;
c1c6733f 3395+
bb1d8b11 3396+ sm_stop(1);
c1c6733f 3397+
bb1d8b11
AM
3398+ /* Wake up any processes waiting for barriers */
3399+ down(&barrier_list_lock);
3400+ list_for_each(blist, &barrier_list) {
3401+ barrier = list_entry(blist, struct cl_barrier, list);
c1c6733f 3402+
bb1d8b11
AM
3403+ /* Cancel any timers */
3404+ if (timer_pending(&barrier->timer))
3405+ del_timer(&barrier->timer);
3406+
3407+ /* Force it to be auto-delete so it discards itself */
3408+ if (barrier->state == BARRIER_STATE_WAITING) {
3409+ barrier->flags |= BARRIER_ATTR_AUTODELETE;
3410+ wake_up_interruptible(&barrier->waitq);
3411+ }
3412+ else {
3413+ if (barrier->callback) {
3414+ barrier->callback(barrier->name, -ENOTCONN);
3415+ barrier->callback = NULL;
3416+ }
c1c6733f
AM
3417+ }
3418+ }
bb1d8b11 3419+ up(&barrier_list_lock);
c1c6733f 3420+
bb1d8b11
AM
3421+ /* Wake up any processes waiting for ISLISTENING requests */
3422+ down(&listenreq_lock);
3423+ list_for_each(blist, &listenreq_list) {
3424+ struct cl_waiting_listen_request *lrequest =
3425+ list_entry(blist, struct cl_waiting_listen_request, list);
c1c6733f 3426+
bb1d8b11
AM
3427+ if (lrequest->waiting)
3428+ wake_up_interruptible(&lrequest->waitq);
3429+ }
3430+ up(&listenreq_lock);
c1c6733f
AM
3431+}
3432+
bb1d8b11 3433+static void free_cluster_sockets()
c1c6733f 3434+{
bb1d8b11
AM
3435+ struct list_head *socklist;
3436+ struct cl_comms_socket *sock;
3437+ struct list_head *temp;
c1c6733f 3438+
bb1d8b11
AM
3439+ list_for_each_safe(socklist, temp, &socket_list) {
3440+ sock = list_entry(socklist, struct cl_comms_socket, list);
c1c6733f 3441+
bb1d8b11
AM
3442+ list_del(&sock->list);
3443+ fput(sock->file);
3444+ kfree(sock);
c1c6733f 3445+ }
bb1d8b11
AM
3446+ num_interfaces = 0;
3447+ current_interface = NULL;
c1c6733f
AM
3448+}
3449+
bb1d8b11
AM
3450+/* Tidy up after all the rest of the cluster bits have shut down */
3451+static void node_cleanup()
c1c6733f 3452+{
bb1d8b11
AM
3453+ struct list_head *nodelist;
3454+ struct list_head *proclist;
3455+ struct list_head *temp;
3456+ struct list_head *socklist;
3457+ struct list_head *blist;
3458+ struct temp_node *tn;
3459+ struct temp_node *tmp;
3460+ struct cl_comms_socket *sock;
3461+ struct kernel_notify_struct *knotify;
c1c6733f 3462+
bb1d8b11
AM
3463+ /* Free list of kernel listeners */
3464+ list_for_each_safe(proclist, temp, &kernel_listener_list) {
3465+ knotify =
3466+ list_entry(proclist, struct kernel_notify_struct, list);
3467+ list_del(&knotify->list);
3468+ kfree(knotify);
c1c6733f
AM
3469+ }
3470+
bb1d8b11
AM
3471+ /* Mark the sockets as busy so they don't get added to the active
3472+ * sockets list in the next few lines of code before we free them */
3473+ list_for_each_safe(socklist, temp, &socket_list) {
3474+ sock = list_entry(socklist, struct cl_comms_socket, list);
c1c6733f 3475+
bb1d8b11 3476+ set_bit(1, &sock->active);
c1c6733f 3477+ }
bb1d8b11
AM
3478+
3479+ /* Tidy the active sockets list */
3480+ list_for_each_safe(socklist, temp, &active_socket_list) {
3481+ sock =
3482+ list_entry(socklist, struct cl_comms_socket, active_list);
3483+ list_del(&sock->active_list);
c1c6733f
AM
3484+ }
3485+
bb1d8b11
AM
3486+ /* Free the memory allocated to cluster nodes */
3487+ free_nodeid_array();
3488+ down(&cluster_members_lock);
3489+ us = NULL;
3490+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
c1c6733f 3491+
bb1d8b11
AM
3492+ struct list_head *addrlist;
3493+ struct list_head *addrtemp;
3494+ struct cluster_node *node;
3495+ struct cluster_node_addr *nodeaddr;
c1c6733f 3496+
bb1d8b11
AM
3497+ node = list_entry(nodelist, struct cluster_node, list);
3498+
3499+ list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
3500+ nodeaddr =
3501+ list_entry(addrlist, struct cluster_node_addr,
3502+ list);
3503+
3504+ list_del(&nodeaddr->list);
3505+ kfree(nodeaddr);
c1c6733f 3506+ }
bb1d8b11
AM
3507+ list_del(&node->list);
3508+ kfree(node->name);
3509+ kfree(node);
c1c6733f 3510+ }
bb1d8b11 3511+ cluster_members = 0;
c1c6733f
AM
3512+ up(&cluster_members_lock);
3513+
bb1d8b11
AM
3514+ /* Clean the temp node IDs list. */
3515+ down(&tempnode_lock);
3516+ list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
3517+ list_del(&tn->list);
3518+ kfree(tn);
3519+ }
3520+ up(&tempnode_lock);
3521+
3522+ /* Free the memory allocated to the outgoing sockets */
3523+ free_cluster_sockets();
3524+
3525+ /* Make sure that all the barriers are deleted */
3526+ down(&barrier_list_lock);
3527+ list_for_each_safe(blist, temp, &barrier_list) {
3528+ struct cl_barrier *barrier =
3529+ list_entry(blist, struct cl_barrier, list);
3530+
3531+ list_del(&barrier->list);
3532+ kfree(barrier);
3533+ }
3534+ up(&barrier_list_lock);
3535+
3536+ kcluster_pid = 0;
3537+ clear_bit(RESEND_NEEDED, &mainloop_flags);
3538+ acks_expected = 0;
3539+ wanted_nodeid = 0;
c1c6733f
AM
3540+}
3541+
bb1d8b11
AM
3542+/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
3543+ * blocked. */
3544+void set_quorate(int total_votes)
c1c6733f 3545+{
bb1d8b11 3546+ int quorate;
c1c6733f 3547+
bb1d8b11
AM
3548+ if (get_quorum() > total_votes) {
3549+ quorate = 0;
3550+ }
3551+ else {
3552+ quorate = 1;
3553+ }
c1c6733f 3554+
bb1d8b11
AM
3555+ /* Hide messages during startup state transition */
3556+ if (we_are_a_cluster_member) {
3557+ if (cluster_is_quorate && !quorate)
3558+ printk(KERN_CRIT CMAN_NAME
3559+ ": quorum lost, blocking activity\n");
3560+ if (!cluster_is_quorate && quorate)
3561+ printk(KERN_CRIT CMAN_NAME
3562+ ": quorum regained, resuming activity\n");
3563+ }
3564+ cluster_is_quorate = quorate;
3565+
3566+ /* Wake up any sleeping processes */
3567+ if (cluster_is_quorate) {
3568+ unjam();
c1c6733f 3569+ }
c1c6733f 3570+
c1c6733f
AM
3571+}
3572+
bb1d8b11 3573+void queue_oob_skb(struct socket *sock, int cmd)
c1c6733f 3574+{
bb1d8b11
AM
3575+ struct sk_buff *skb;
3576+ struct cb_info *cbinfo;
3577+ struct cl_portclosed_oob *oobmsg;
c1c6733f 3578+
bb1d8b11
AM
3579+ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3580+ if (!skb)
3581+ return;
c1c6733f 3582+
bb1d8b11
AM
3583+ skb_put(skb, sizeof (*oobmsg));
3584+ oobmsg = (struct cl_portclosed_oob *) skb->data;
3585+ oobmsg->port = 0;
3586+ oobmsg->cmd = cmd;
3587+
3588+ /* There is no remote node associated with this so
3589+ clear out the field to avoid any accidents */
3590+ cbinfo = (struct cb_info *)skb->cb;
3591+ cbinfo->oob = 1;
3592+ cbinfo->orig_nodeid = 0;
3593+ cbinfo->orig_port = 0;
c1c6733f 3594+
bb1d8b11 3595+ sock_queue_rcv_skb(sock->sk, skb);
c1c6733f
AM
3596+}
3597+
bb1d8b11
AM
3598+/* Notify interested parties that the cluster configuration has changed */
3599+void notify_listeners()
c1c6733f 3600+{
bb1d8b11
AM
3601+ struct notify_struct *notify;
3602+ struct list_head *proclist;
3603+ struct list_head *socklist;
3604+ struct list_head *temp;
c1c6733f 3605+
bb1d8b11
AM
3606+ /* Do kernel listeners first */
3607+ notify_kernel_listeners(CLUSTER_RECONFIG, 0);
c1c6733f 3608+
bb1d8b11
AM
3609+ /* Now we deign to tell userspace */
3610+ down(&event_listener_lock);
3611+ list_for_each_safe(proclist, temp, &event_listener_list) {
3612+ notify = list_entry(proclist, struct notify_struct, list);
c1c6733f 3613+
bb1d8b11
AM
3614+ /* If the kill fails then remove the process from the list */
3615+ if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
3616+ list_del(&notify->list);
3617+ kfree(notify);
c1c6733f 3618+ }
c1c6733f 3619+ }
bb1d8b11 3620+ up(&event_listener_lock);
c1c6733f 3621+
bb1d8b11
AM
3622+ /* Tell userspace processes which want OOB messages */
3623+ down(&client_socket_lock);
3624+ list_for_each(socklist, &client_socket_list) {
3625+ struct cl_client_socket *csock;
3626+ csock = list_entry(socklist, struct cl_client_socket, list);
3627+ queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
c1c6733f 3628+ }
bb1d8b11
AM
3629+ up(&client_socket_lock);
3630+}
c1c6733f 3631+
bb1d8b11
AM
3632+/* This fills in the list of all addresses for the local node */
3633+void get_local_addresses(struct cluster_node *node)
3634+{
3635+ struct list_head *socklist;
3636+ struct cl_comms_socket *sock;
c1c6733f 3637+
bb1d8b11
AM
3638+ list_for_each(socklist, &socket_list) {
3639+ sock = list_entry(socklist, struct cl_comms_socket, list);
c1c6733f 3640+
bb1d8b11
AM
3641+ if (sock->recv_only) {
3642+ add_node_address(node, (char *) &sock->saddr, address_length);
3643+ }
3644+ }
c1c6733f
AM
3645+}
3646+
bb1d8b11
AM
3647+
3648+static uint16_t generate_cluster_id(char *name)
c1c6733f 3649+{
bb1d8b11
AM
3650+ int i;
3651+ int value = 0;
c1c6733f 3652+
bb1d8b11
AM
3653+ for (i=0; i<strlen(name); i++) {
3654+ value <<= 1;
3655+ value += name[i];
c1c6733f 3656+ }
bb1d8b11
AM
3657+ return value & 0xFFFF;
3658+}
c1c6733f 3659+
bb1d8b11
AM
3660+/* Return the next comms socket we can use. */
3661+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
3662+{
3663+ int next;
3664+ struct list_head *socklist;
c1c6733f 3665+
bb1d8b11
AM
3666+ /* Fast path for single interface systems */
3667+ if (num_interfaces <= 1)
3668+ return cur;
c1c6733f 3669+
bb1d8b11
AM
3670+ /* Next number */
3671+ next = cur->number + 1;
3672+ if (next > num_interfaces)
3673+ next = 1;
c1c6733f 3674+
bb1d8b11
AM
3675+ /* Find the socket with this number, I could optimise this by starting
3676+ * at the current i/f but most systems are going to have a small number
3677+ * of them anyway */
3678+ list_for_each(socklist, &socket_list) {
3679+ struct cl_comms_socket *sock;
3680+ sock = list_entry(socklist, struct cl_comms_socket, list);
c1c6733f 3681+
bb1d8b11
AM
3682+ if (!sock->recv_only && sock->number == next)
3683+ return sock;
c1c6733f 3684+ }
bb1d8b11
AM
3685+
3686+ BUG();
3687+ return NULL;
c1c6733f
AM
3688+}
3689+
bb1d8b11
AM
3690+/* MUST be called with the barrier list lock held */
3691+static struct cl_barrier *find_barrier(char *name)
c1c6733f 3692+{
bb1d8b11
AM
3693+ struct list_head *blist;
3694+ struct cl_barrier *bar;
c1c6733f 3695+
bb1d8b11
AM
3696+ list_for_each(blist, &barrier_list) {
3697+ bar = list_entry(blist, struct cl_barrier, list);
c1c6733f 3698+
bb1d8b11
AM
3699+ if (strcmp(name, bar->name) == 0)
3700+ return bar;
c1c6733f 3701+ }
bb1d8b11
AM
3702+ return NULL;
3703+}
c1c6733f 3704+
bb1d8b11
AM
3705+/* Do the stuff we need to do when the barrier has completed phase 1 */
3706+static void check_barrier_complete_phase1(struct cl_barrier *barrier)
3707+{
3708+ if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
3709+ ? barrier->expected_nodes :
3710+ cluster_members)) {
c1c6733f 3711+
bb1d8b11 3712+ struct cl_barriermsg bmsg;
c1c6733f 3713+
bb1d8b11
AM
3714+ atomic_inc(&barrier->completed_nodes); /* We have completed */
3715+ barrier->phase = 2; /* Wait for complete phase II */
c1c6733f 3716+
bb1d8b11
AM
3717+ /* Send completion message, remember: we are in cnxman context
3718+ * and must not block */
3719+ bmsg.cmd = CLUSTER_CMD_BARRIER;
3720+ bmsg.subcmd = BARRIER_COMPLETE;
3721+ bmsg.flags = 0;
3722+ strcpy(bmsg.name, barrier->name);
c1c6733f 3723+
bb1d8b11
AM
3724+ P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
3725+ queue_message(NULL, (char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
c1c6733f 3726+ }
c1c6733f
AM
3727+}
3728+
bb1d8b11
AM
3729+/* Do the stuff we need to do when the barrier has been reached */
3730+/* Return 1 if we deleted the barrier */
3731+static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
c1c6733f 3732+{
bb1d8b11 3733+ spin_lock_irq(&barrier->phase2_spinlock);
c1c6733f 3734+
bb1d8b11
AM
3735+ if (barrier->state != BARRIER_STATE_COMPLETE &&
3736+ (status == -ETIMEDOUT ||
3737+ atomic_read(&barrier->completed_nodes) ==
3738+ ((barrier->expected_nodes != 0)
3739+ ? barrier->expected_nodes : cluster_members))) {
3740+
3741+ if (status == 0 && barrier->timeout)
3742+ del_timer(&barrier->timer);
3743+ barrier->endreason = status;
3744+
3745+ /* Wake up listener */
3746+ if (barrier->state == BARRIER_STATE_WAITING) {
3747+ wake_up_interruptible(&barrier->waitq);
3748+ }
3749+ else {
3750+ /* Additional tasks we have to do if the user was not
3751+ * waiting... */
3752+ /* Call the callback */
3753+ if (barrier->callback) {
3754+ barrier->callback(barrier->name, 0);
3755+ barrier->callback = NULL;
3756+ }
3757+ /* Remove it if it's AUTO-DELETE */
3758+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
3759+ list_del(&barrier->list);
3760+ spin_unlock_irq(&barrier->phase2_spinlock);
3761+ kfree(barrier);
3762+ return 1;
3763+ }
3764+ }
3765+ barrier->state = BARRIER_STATE_COMPLETE;
c1c6733f 3766+ }
bb1d8b11
AM
3767+ spin_unlock_irq(&barrier->phase2_spinlock);
3768+ return 0;
3769+}
c1c6733f 3770+
bb1d8b11
AM
3771+/* Called if a barrier timeout happens */
3772+static void barrier_timer_fn(unsigned long arg)
3773+{
3774+ struct cl_barrier *barrier = (struct cl_barrier *) arg;
c1c6733f 3775+
bb1d8b11
AM
3776+ /* Ignore any futher messages, they are too late. */
3777+ barrier->phase = 0;
c1c6733f 3778+
bb1d8b11
AM
3779+ /* and cause it to timeout */
3780+ check_barrier_complete_phase2(barrier, -ETIMEDOUT);
c1c6733f
AM
3781+}
3782+
bb1d8b11
AM
3783+/* Process BARRIER messages from other nodes */
3784+static void process_barrier_msg(struct cl_barriermsg *msg,
3785+ struct cluster_node *node)
c1c6733f
AM
3786+{
3787+ struct cl_barrier *barrier;
3788+
c1c6733f 3789+ down(&barrier_list_lock);
bb1d8b11
AM
3790+ barrier = find_barrier(msg->name);
3791+ up(&barrier_list_lock);
c1c6733f 3792+
bb1d8b11
AM
3793+ /* Ignore other peoples messages, in_transition() is needed here so
3794+ * that joining nodes will see their barrier messages before the
3795+ * we_are_a_cluster_member is set */
3796+ if (!we_are_a_cluster_member && !in_transition())
3797+ return;
3798+ if (!barrier)
3799+ return;
c1c6733f 3800+
bb1d8b11
AM
3801+ P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
3802+ node ? node->name : "unknown");
c1c6733f 3803+
bb1d8b11
AM
3804+ switch (msg->subcmd) {
3805+ case BARRIER_WAIT:
3806+ down(&barrier->lock);
3807+ if (barrier->phase == 0)
3808+ barrier->phase = 1;
c1c6733f 3809+
bb1d8b11
AM
3810+ if (barrier->phase == 1) {
3811+ atomic_inc(&barrier->got_nodes);
3812+ check_barrier_complete_phase1(barrier);
3813+ }
3814+ else {
3815+ printk(KERN_WARNING CMAN_NAME
3816+ ": got WAIT barrier not in phase 1 %s (%d)\n",
3817+ msg->name, barrier->phase);
3818+
3819+ }
c1c6733f 3820+ up(&barrier->lock);
bb1d8b11 3821+ break;
c1c6733f 3822+
bb1d8b11
AM
3823+ case BARRIER_COMPLETE:
3824+ down(&barrier->lock);
3825+ atomic_inc(&barrier->completed_nodes);
c1c6733f 3826+
bb1d8b11
AM
3827+ /* First node to get all the WAIT messages sends COMPLETE, so
3828+ * we all complete */
3829+ if (barrier->phase == 1) {
3830+ atomic_set(&barrier->got_nodes,
3831+ barrier->expected_nodes);
3832+ check_barrier_complete_phase1(barrier);
3833+ }
3834+
3835+ if (barrier->phase == 2) {
3836+ /* If it was deleted (ret==1) then no need to unlock
3837+ * the mutex */
3838+ if (check_barrier_complete_phase2(barrier, 0) == 1)
3839+ return;
3840+ }
3841+ up(&barrier->lock);
3842+ break;
3843+ }
c1c6733f
AM
3844+}
3845+
bb1d8b11
AM
3846+/* In-kernel membership API */
3847+int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
c1c6733f 3848+{
bb1d8b11 3849+ struct kernel_notify_struct *notify;
c1c6733f 3850+
bb1d8b11
AM
3851+ notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
3852+ if (!notify)
3853+ return -ENOMEM;
3854+ notify->callback = callback;
c1c6733f 3855+
bb1d8b11
AM
3856+ down(&kernel_listener_lock);
3857+ list_add(&notify->list, &kernel_listener_list);
3858+ up(&kernel_listener_lock);
c1c6733f 3859+
bb1d8b11
AM
3860+ return 0;
3861+}
c1c6733f 3862+
bb1d8b11
AM
3863+int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
3864+{
3865+ struct list_head *calllist;
3866+ struct list_head *temp;
3867+ struct kernel_notify_struct *notify;
c1c6733f 3868+
bb1d8b11
AM
3869+ down(&kernel_listener_lock);
3870+ list_for_each_safe(calllist, temp, &kernel_listener_list) {
3871+ notify = list_entry(calllist, struct kernel_notify_struct, list);
3872+ if (notify->callback == callback){
3873+ list_del(&notify->list);
3874+ kfree(notify);
3875+ up(&kernel_listener_lock);
3876+ return 0;
3877+ }
c1c6733f 3878+ }
bb1d8b11
AM
3879+ up(&kernel_listener_lock);
3880+ return -EINVAL;
3881+}
c1c6733f 3882+
bb1d8b11
AM
3883+/* Return quorate status */
3884+int kcl_is_quorate()
3885+{
3886+ return cluster_is_quorate;
3887+}
c1c6733f 3888+
bb1d8b11
AM
3889+/* Return the address list for a node */
3890+struct list_head *kcl_get_node_addresses(int nodeid)
3891+{
3892+ struct cluster_node *node = find_node_by_nodeid(nodeid);
c1c6733f 3893+
bb1d8b11
AM
3894+ if (node)
3895+ return &node->addr_list;
3896+ else
3897+ return NULL;
3898+}
c1c6733f 3899+
bb1d8b11
AM
3900+static void copy_to_kclnode(struct cluster_node *node,
3901+ struct kcl_cluster_node *knode)
3902+{
3903+ strcpy(knode->name, node->name);
3904+ knode->size = sizeof (struct kcl_cluster_node);
3905+ knode->votes = node->votes;
3906+ knode->state = node->state;
3907+ knode->node_id = node->node_id;
3908+ knode->us = node->us;
3909+ knode->leave_reason = node->leave_reason;
3910+ knode->incarnation = node->incarnation;
3911+}
c1c6733f 3912+
bb1d8b11
AM
3913+/* Return the info for a node given it's address. if addr is NULL then return
3914+ * OUR info */
3915+int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
3916+ struct kcl_cluster_node *n)
3917+{
3918+ struct cluster_node *node;
c1c6733f 3919+
bb1d8b11
AM
3920+ /* They want us */
3921+ if (addr == NULL) {
3922+ node = us;
c1c6733f 3923+ }
bb1d8b11
AM
3924+ else {
3925+ node = find_node_by_addr(addr, addr_len);
3926+ if (!node)
3927+ return -1;
3928+ }
3929+
3930+ /* Copy to user's buffer */
3931+ copy_to_kclnode(node, n);
3932+ return 0;
3933+}
c1c6733f 3934+
bb1d8b11
AM
3935+int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
3936+{
3937+ struct cluster_node *node;
c1c6733f 3938+
bb1d8b11
AM
3939+ /* They want us */
3940+ if (name == NULL) {
3941+ node = us;
3942+ if (node == NULL)
3943+ return -1;
3944+ }
3945+ else {
3946+ node = find_node_by_name(name);
3947+ if (!node)
3948+ return -1;
c1c6733f
AM
3949+ }
3950+
bb1d8b11
AM
3951+ /* Copy to user's buffer */
3952+ copy_to_kclnode(node, n);
3953+ return 0;
3954+}
c1c6733f 3955+
bb1d8b11
AM
3956+/* As above but by node id. MUCH faster */
3957+int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
3958+{
3959+ struct cluster_node *node;
c1c6733f 3960+
bb1d8b11
AM
3961+ /* They want us */
3962+ if (nodeid == 0) {
3963+ node = us;
3964+ if (node == NULL)
3965+ return -1;
c1c6733f
AM
3966+ }
3967+ else {
bb1d8b11
AM
3968+ node = find_node_by_nodeid(nodeid);
3969+ if (!node)
3970+ return -1;
c1c6733f
AM
3971+ }
3972+
bb1d8b11
AM
3973+ /* Copy to user's buffer */
3974+ copy_to_kclnode(node, n);
3975+ return 0;
c1c6733f
AM
3976+}
3977+
bb1d8b11
AM
3978+/* Return a list of all cluster members ever */
3979+int kcl_get_all_members(struct list_head *list)
c1c6733f 3980+{
bb1d8b11
AM
3981+ struct list_head *nodelist;
3982+ struct cluster_node *node;
3983+ struct kcl_cluster_node *newnode;
3984+ int num_nodes = 0;
c1c6733f 3985+
bb1d8b11
AM
3986+ down(&cluster_members_lock);
3987+ list_for_each(nodelist, &cluster_members_list) {
3988+ if (list) {
3989+ node = list_entry(nodelist, struct cluster_node, list);
3990+ newnode =
3991+ kmalloc(sizeof (struct kcl_cluster_node),
3992+ GFP_KERNEL);
3993+ if (newnode) {
3994+ copy_to_kclnode(node, newnode);
3995+ list_add(&newnode->list, list);
3996+ num_nodes++;
3997+ }
3998+ }
3999+ else {
4000+ num_nodes++;
4001+ }
4002+ }
4003+ up(&cluster_members_lock);
c1c6733f 4004+
bb1d8b11
AM
4005+ return num_nodes;
4006+}
c1c6733f 4007+
bb1d8b11
AM
4008+/* Return a list of cluster members */
4009+int kcl_get_members(struct list_head *list)
4010+{
4011+ struct list_head *nodelist;
4012+ struct cluster_node *node;
4013+ struct kcl_cluster_node *newnode;
4014+ int num_nodes = 0;
4015+
4016+ down(&cluster_members_lock);
4017+ list_for_each(nodelist, &cluster_members_list) {
4018+ node = list_entry(nodelist, struct cluster_node, list);
4019+
4020+ if (node->state == NODESTATE_MEMBER) {
4021+ if (list) {
4022+ newnode =
4023+ kmalloc(sizeof (struct kcl_cluster_node),
4024+ GFP_KERNEL);
4025+ if (newnode) {
4026+ copy_to_kclnode(node, newnode);
4027+ list_add(&newnode->list, list);
4028+ num_nodes++;
c1c6733f
AM
4029+ }
4030+ }
4031+ else {
bb1d8b11 4032+ num_nodes++;
c1c6733f
AM
4033+ }
4034+ }
4035+ }
bb1d8b11 4036+ up(&cluster_members_lock);
c1c6733f 4037+
bb1d8b11
AM
4038+ return num_nodes;
4039+}
c1c6733f 4040+
bb1d8b11
AM
4041+/* Copy current member's nodeids into buffer */
4042+int kcl_get_member_ids(uint32_t *idbuf, int size)
4043+{
4044+ struct list_head *nodelist;
4045+ struct cluster_node *node;
4046+ int num_nodes = 0;
4047+
4048+ down(&cluster_members_lock);
4049+ list_for_each(nodelist, &cluster_members_list) {
4050+ node = list_entry(nodelist, struct cluster_node, list);
4051+
4052+ if (node->state == NODESTATE_MEMBER) {
4053+ if (idbuf && size) {
4054+ idbuf[num_nodes] = node->node_id;
4055+ num_nodes++;
4056+ size--;
4057+ }
4058+ else {
4059+ num_nodes++;
4060+ }
c1c6733f
AM
4061+ }
4062+ }
bb1d8b11
AM
4063+ up(&cluster_members_lock);
4064+
4065+ return num_nodes;
c1c6733f
AM
4066+}
4067+
bb1d8b11
AM
4068+/* Barrier API */
4069+int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
c1c6733f 4070+{
bb1d8b11 4071+ struct cl_barrier *barrier;
c1c6733f 4072+
bb1d8b11
AM
4073+ /* We are not joined to a cluster */
4074+ if (!we_are_a_cluster_member)
4075+ return -ENOTCONN;
c1c6733f 4076+
bb1d8b11
AM
4077+ /* Must have a valid name */
4078+ if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
4079+ return -EINVAL;
c1c6733f 4080+
bb1d8b11
AM
4081+ /* We don't do this yet */
4082+ if (flags & BARRIER_ATTR_MULTISTEP)
4083+ return -ENOTSUPP;
4084+
4085+ down(&barrier_list_lock);
4086+
4087+ /* See if it already exists */
4088+ if ((barrier = find_barrier(name))) {
4089+ up(&barrier_list_lock);
4090+ if (nodes != barrier->expected_nodes) {
4091+ printk(KERN_WARNING CMAN_NAME
4092+ ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
4093+ name, barrier->expected_nodes, nodes);
4094+ up(&barrier_list_lock);
4095+ return -EINVAL;
c1c6733f 4096+ }
bb1d8b11
AM
4097+ else
4098+ return 0;
4099+ }
4100+
4101+ /* Build a new struct and add it to the list */
4102+ barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
4103+ if (barrier == NULL) {
4104+ up(&barrier_list_lock);
4105+ return -ENOMEM;
4106+ }
4107+ memset(barrier, 0, sizeof (*barrier));
4108+
4109+ strcpy(barrier->name, name);
4110+ barrier->flags = flags;
4111+ barrier->expected_nodes = nodes;
4112+ atomic_set(&barrier->got_nodes, 0);
4113+ atomic_set(&barrier->completed_nodes, 0);
4114+ barrier->endreason = 0;
4115+ barrier->registered_nodes = 1;
4116+ spin_lock_init(&barrier->phase2_spinlock);
4117+ barrier->state = BARRIER_STATE_INACTIVE;
4118+ init_MUTEX(&barrier->lock);
4119+
4120+ list_add(&barrier->list, &barrier_list);
4121+ up(&barrier_list_lock);
4122+
4123+ return 0;
4124+}
4125+
4126+static int barrier_setattr_enabled(struct cl_barrier *barrier,
4127+ unsigned int attr, unsigned long arg)
4128+{
4129+ int status;
4130+
4131+ /* Can't disable a barrier */
4132+ if (!arg) {
4133+ up(&barrier->lock);
4134+ return -EINVAL;
c1c6733f 4135+ }
c1c6733f 4136+
bb1d8b11
AM
4137+ /* We need to send WAIT now because the user may not
4138+ * actually call kcl_barrier_wait() */
4139+ if (!barrier->waitsent) {
4140+ struct cl_barriermsg bmsg;
c1c6733f 4141+
bb1d8b11
AM
4142+ /* Send it to the rest of the cluster */
4143+ bmsg.cmd = CLUSTER_CMD_BARRIER;
4144+ bmsg.subcmd = BARRIER_WAIT;
4145+ strcpy(bmsg.name, barrier->name);
c1c6733f 4146+
bb1d8b11
AM
4147+ barrier->waitsent = 1;
4148+ barrier->phase = 1;
c1c6733f 4149+
bb1d8b11 4150+ atomic_inc(&barrier->got_nodes);
c1c6733f 4151+
bb1d8b11
AM
4152+ /* Start the timer if one was wanted */
4153+ if (barrier->timeout) {
4154+ init_timer(&barrier->timer);
4155+ barrier->timer.function = barrier_timer_fn;
4156+ barrier->timer.data = (long) barrier;
4157+ mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
4158+ }
c1c6733f 4159+
bb1d8b11
AM
4160+ /* Barrier WAIT and COMPLETE messages are
4161+ * always queued - that way they always get
4162+ * sent out in the right order. If we don't do
4163+ * this then one can get sent out in the
4164+ * context of the user process and the other in
4165+ * cnxman and COMPLETE may /just/ slide in
4166+ * before WAIT if its in the queue
4167+ */
4168+ P_BARRIER("Sending WAIT for %s\n", barrier->name);
4169+ status = queue_message(NULL, &bmsg, sizeof (bmsg), NULL, 0, 0);
4170+ if (status < 0) {
4171+ up(&barrier->lock);
4172+ return status;
c1c6733f 4173+ }
bb1d8b11
AM
4174+
4175+ /* It might have been reached now */
4176+ if (barrier
4177+ && barrier->state != BARRIER_STATE_COMPLETE
4178+ && barrier->phase == 1)
4179+ check_barrier_complete_phase1(barrier);
4180+ }
4181+ if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
4182+ up(&barrier->lock);
4183+ return barrier->endreason;
c1c6733f 4184+ }
bb1d8b11
AM
4185+ up(&barrier->lock);
4186+ return 0; /* Nothing to propogate */
4187+}
c1c6733f 4188+
bb1d8b11
AM
4189+int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
4190+{
4191+ struct cl_barrier *barrier;
c1c6733f 4192+
bb1d8b11
AM
4193+ /* See if it already exists */
4194+ down(&barrier_list_lock);
4195+ if (!(barrier = find_barrier(name))) {
4196+ up(&barrier_list_lock);
4197+ return -ENOENT;
c1c6733f 4198+ }
bb1d8b11 4199+ up(&barrier_list_lock);
c1c6733f 4200+
bb1d8b11
AM
4201+ down(&barrier->lock);
4202+ if (barrier->state == BARRIER_STATE_COMPLETE) {
4203+ up(&barrier->lock);
4204+ return 0;
4205+ }
c1c6733f 4206+
bb1d8b11
AM
4207+ switch (attr) {
4208+ case BARRIER_SETATTR_AUTODELETE:
4209+ if (arg)
4210+ barrier->flags |= BARRIER_ATTR_AUTODELETE;
4211+ else
4212+ barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
4213+ up(&barrier->lock);
4214+ return 0;
4215+ break;
c1c6733f 4216+
bb1d8b11
AM
4217+ case BARRIER_SETATTR_TIMEOUT:
4218+ /* Can only change the timout of an inactive barrier */
4219+ if (barrier->state == BARRIER_STATE_WAITING
4220+ || barrier->waitsent) {
4221+ up(&barrier->lock);
4222+ return -EINVAL;
4223+ }
4224+ barrier->timeout = arg;
4225+ up(&barrier->lock);
4226+ return 0;
c1c6733f 4227+
bb1d8b11
AM
4228+ case BARRIER_SETATTR_MULTISTEP:
4229+ up(&barrier->lock);
4230+ return -ENOTSUPP;
c1c6733f 4231+
bb1d8b11
AM
4232+ case BARRIER_SETATTR_ENABLED:
4233+ return barrier_setattr_enabled(barrier, attr, arg);
4234+
4235+ case BARRIER_SETATTR_NODES:
4236+ /* Can only change the expected node count of an inactive
4237+ * barrier */
4238+ if (barrier->state == BARRIER_STATE_WAITING
4239+ || barrier->waitsent)
4240+ return -EINVAL;
4241+ barrier->expected_nodes = arg;
4242+ break;
4243+
4244+ case BARRIER_SETATTR_CALLBACK:
4245+ if (barrier->state == BARRIER_STATE_WAITING
4246+ || barrier->waitsent)
4247+ return -EINVAL;
4248+ barrier->callback = (void (*)(char *, int)) arg;
4249+ up(&barrier->lock);
4250+ return 0; /* Don't propgate this to other nodes */
c1c6733f 4251+ }
c1c6733f 4252+
bb1d8b11
AM
4253+ up(&barrier->lock);
4254+ return 0;
c1c6733f
AM
4255+}
4256+
bb1d8b11 4257+int kcl_barrier_delete(char *name)
c1c6733f 4258+{
bb1d8b11 4259+ struct cl_barrier *barrier;
c1c6733f 4260+
bb1d8b11
AM
4261+ down(&barrier_list_lock);
4262+ /* See if it exists */
4263+ if (!(barrier = find_barrier(name))) {
4264+ up(&barrier_list_lock);
4265+ return -ENOENT;
4266+ }
b7b72b66 4267+
bb1d8b11
AM
4268+ /* Delete it */
4269+ list_del(&barrier->list);
4270+ kfree(barrier);
b7b72b66 4271+
bb1d8b11 4272+ up(&barrier_list_lock);
b7b72b66 4273+
bb1d8b11 4274+ return 0;
c1c6733f
AM
4275+}
4276+
bb1d8b11 4277+int kcl_barrier_cancel(char *name)
c1c6733f 4278+{
bb1d8b11 4279+ struct cl_barrier *barrier;
c1c6733f 4280+
bb1d8b11
AM
4281+ /* See if it exists */
4282+ down(&barrier_list_lock);
4283+ if (!(barrier = find_barrier(name))) {
4284+ up(&barrier_list_lock);
4285+ return -ENOENT;
4286+ }
4287+ down(&barrier->lock);
c1c6733f 4288+
bb1d8b11 4289+ barrier->endreason = -ENOTCONN;
c1c6733f 4290+
bb1d8b11
AM
4291+ if (barrier->callback) {
4292+ barrier->callback(barrier->name, -ECONNRESET);
4293+ barrier->callback = NULL;
c1c6733f
AM
4294+ }
4295+
bb1d8b11
AM
4296+ if (barrier->timeout)
4297+ del_timer(&barrier->timer);
c1c6733f 4298+
bb1d8b11
AM
4299+ /* Remove it if it's AUTO-DELETE */
4300+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4301+ list_del(&barrier->list);
4302+ up(&barrier->lock);
4303+ kfree(barrier);
4304+ up(&barrier_list_lock);
4305+ return 0;
4306+ }
4307+
4308+ if (barrier->state == BARRIER_STATE_WAITING)
4309+ wake_up_interruptible(&barrier->waitq);
c1c6733f 4310+
bb1d8b11
AM
4311+ up(&barrier->lock);
4312+ up(&barrier_list_lock);
c1c6733f
AM
4313+ return 0;
4314+}
4315+
bb1d8b11 4316+int kcl_barrier_wait(char *name)
c1c6733f 4317+{
bb1d8b11
AM
4318+ struct cl_barrier *barrier;
4319+ int ret;
c1c6733f 4320+
bb1d8b11
AM
4321+ if (!atomic_read(&cnxman_running))
4322+ return -ENOTCONN;
c1c6733f 4323+
bb1d8b11
AM
4324+ /* Enable it */
4325+ kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
c1c6733f 4326+
bb1d8b11 4327+ down(&barrier_list_lock);
c1c6733f 4328+
bb1d8b11
AM
4329+ /* See if it still exists - enable may have deleted it! */
4330+ if (!(barrier = find_barrier(name))) {
4331+ up(&barrier_list_lock);
4332+ return -ENOENT;
c1c6733f 4333+ }
bb1d8b11
AM
4334+
4335+ down(&barrier->lock);
4336+
4337+ up(&barrier_list_lock);
4338+
4339+ /* If it has already completed then return the status */
4340+ if (barrier->state == BARRIER_STATE_COMPLETE) {
4341+ up(&barrier->lock);
4342+ return barrier->endreason;
c1c6733f
AM
4343+ }
4344+
bb1d8b11 4345+ barrier->state = BARRIER_STATE_WAITING;
c1c6733f 4346+
bb1d8b11
AM
4347+ /* Have we all reached the barrier? */
4348+ while (atomic_read(&barrier->completed_nodes) !=
4349+ ((barrier->expected_nodes == 0)
4350+ ? cluster_members : barrier->expected_nodes)
4351+ && barrier->endreason == 0) {
c1c6733f 4352+
bb1d8b11 4353+ wait_queue_t wq;
c1c6733f 4354+
bb1d8b11
AM
4355+ init_waitqueue_entry(&wq, current);
4356+ init_waitqueue_head(&barrier->waitq);
c1c6733f 4357+
bb1d8b11
AM
4358+ /* Wait for em all */
4359+ set_task_state(current, TASK_INTERRUPTIBLE);
4360+ add_wait_queue(&barrier->waitq, &wq);
c1c6733f 4361+
bb1d8b11
AM
4362+ if (atomic_read(&barrier->completed_nodes) !=
4363+ ((barrier->expected_nodes ==
4364+ 0) ? cluster_members : barrier->expected_nodes)
4365+ && barrier->endreason == 0) {
4366+ up(&barrier->lock);
4367+ schedule();
4368+ down(&barrier->lock);
4369+ }
c1c6733f 4370+
bb1d8b11
AM
4371+ remove_wait_queue(&barrier->waitq, &wq);
4372+ set_task_state(current, TASK_RUNNING);
c1c6733f 4373+
bb1d8b11
AM
4374+ if (signal_pending(current)) {
4375+ barrier->endreason = -EINTR;
4376+ break;
4377+ }
4378+ }
4379+ barrier->state = BARRIER_STATE_INACTIVE;
c1c6733f 4380+
bb1d8b11
AM
4381+ if (barrier->timeout)
4382+ del_timer(&barrier->timer);
c1c6733f 4383+
bb1d8b11
AM
4384+ /* Barrier has been reached on all nodes, call the callback */
4385+ if (barrier->callback) {
4386+ barrier->callback(barrier->name, barrier->endreason);
4387+ barrier->callback = NULL;
4388+ }
c1c6733f 4389+
bb1d8b11 4390+ atomic_set(&barrier->got_nodes, 0);
c1c6733f 4391+
bb1d8b11
AM
4392+ /* Return the reason we were woken */
4393+ ret = barrier->endreason;
c1c6733f 4394+
bb1d8b11
AM
4395+ /* Remove it if it's AUTO-DELETE */
4396+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4397+ down(&barrier_list_lock);
4398+ list_del(&barrier->list);
4399+ up(&barrier_list_lock);
4400+ up(&barrier->lock);
4401+ kfree(barrier);
4402+ }
4403+ else {
4404+ up(&barrier->lock);
4405+ }
c1c6733f 4406+
bb1d8b11
AM
4407+ /* We were woken up because the node left the cluster ? */
4408+ if (!atomic_read(&cnxman_running))
4409+ ret = -ENOTCONN;
c1c6733f 4410+
bb1d8b11
AM
4411+ return ret;
4412+}
4413+
4414+/* This is called from membership services when a node has left the cluster -
4415+ * we signal all waiting barriers with -ESRCH so they know to do something
4416+ * else, if the number of nodes is left at 0 then we compare the new number of
4417+ * nodes in the cluster with that at the barrier and return 0 (success) in that
4418+ * case */
4419+void check_barrier_returns()
c1c6733f 4420+{
bb1d8b11
AM
4421+ struct list_head *blist;
4422+ struct list_head *llist;
4423+ struct cl_barrier *barrier;
4424+ int status = 0;
c1c6733f 4425+
bb1d8b11
AM
4426+ down(&barrier_list_lock);
4427+ list_for_each(blist, &barrier_list) {
4428+ barrier = list_entry(blist, struct cl_barrier, list);
b7b72b66 4429+
bb1d8b11
AM
4430+ if (barrier->waitsent) {
4431+ int wakeit = 0;
4432+
4433+ /* Check for a dynamic member barrier */
4434+ if (barrier->expected_nodes == 0) {
4435+ if (barrier->registered_nodes ==
4436+ cluster_members) {
4437+ status = 0;
4438+ wakeit = 1;
4439+ }
4440+ }
4441+ else {
4442+ status = -ESRCH;
4443+ wakeit = 1;
4444+ }
4445+
4446+ /* Do we need to tell the barrier? */
4447+ if (wakeit) {
4448+ if (barrier->state == BARRIER_STATE_WAITING) {
4449+ barrier->endreason = status;
4450+ wake_up_interruptible(&barrier->waitq);
4451+ }
4452+ else {
4453+ if (barrier->callback) {
4454+ barrier->callback(barrier->name,
4455+ status);
4456+ }
4457+ }
4458+ }
4459+ }
c1c6733f 4460+ }
bb1d8b11 4461+ up(&barrier_list_lock);
c1c6733f 4462+
bb1d8b11
AM
4463+ /* Part 2 check for outstanding listen requests for dead nodes and
4464+ * cancel them */
4465+ down(&listenreq_lock);
4466+ list_for_each(llist, &listenreq_list) {
4467+ struct cl_waiting_listen_request *lrequest =
4468+ list_entry(llist, struct cl_waiting_listen_request, list);
4469+ struct cluster_node *node =
4470+ find_node_by_nodeid(lrequest->nodeid);
c783755a 4471+
bb1d8b11
AM
4472+ if (node && node->state != NODESTATE_MEMBER) {
4473+ lrequest->result = -ENOTCONN;
4474+ lrequest->waiting = 0;
4475+ wake_up_interruptible(&lrequest->waitq);
4476+ }
4477+ }
4478+ up(&listenreq_lock);
4479+}
c783755a 4480+
bb1d8b11
AM
4481+int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
4482+{
4483+ struct temp_node *tn;
4484+ int err = 1; /* true */
4485+#ifdef DEBUG_COMMS
4486+ char buf[MAX_ADDR_PRINTED_LEN];
4487+#endif
c783755a 4488+
bb1d8b11 4489+ down(&tempnode_lock);
c783755a 4490+
bb1d8b11
AM
4491+ list_for_each_entry(tn, &tempnode_list, list) {
4492+ if (tn->nodeid == nodeid) {
4493+ memcpy(addr, tn->addr, tn->addrlen);
4494+ *addrlen = tn->addrlen;
4495+ P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
4496+ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
c783755a 4497+
bb1d8b11
AM
4498+ goto out;
4499+ }
4500+ }
4501+ err = 0;
4502+
4503+ out:
4504+ up(&tempnode_lock);
4505+ return err;
c783755a
AM
4506+}
4507+
bb1d8b11
AM
4508+/* Create a new temporary node ID. This list will only ever be very small
4509+ (usaully only 1 item) but I can't take the risk that someone won't try to
4510+ boot 128 nodes all at exactly the same time. */
4511+int new_temp_nodeid(char *addr, int addrlen)
c783755a 4512+{
bb1d8b11
AM
4513+ struct temp_node *tn;
4514+ int err = -1;
4515+ int try_nodeid = 0;
4516+#ifdef DEBUG_COMMS
4517+ char buf[MAX_ADDR_PRINTED_LEN];
c783755a
AM
4518+#endif
4519+
bb1d8b11
AM
4520+ P_COMMS("new_temp_nodeid needed for\n: %s\n",
4521+ print_addr(addr, addrlen, buf));
c783755a 4522+
bb1d8b11 4523+ down(&tempnode_lock);
c783755a 4524+
bb1d8b11
AM
4525+ /* First see if we already know about this node */
4526+ list_for_each_entry(tn, &tempnode_list, list) {
c783755a 4527+
bb1d8b11
AM
4528+ P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
4529+ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
c783755a 4530+
bb1d8b11
AM
4531+ /* We're already in here... */
4532+ if (tn->addrlen == addrlen &&
4533+ memcmp(tn->addr, addr, addrlen) == 0) {
4534+ P_COMMS("reused temp node ID %d\n", tn->nodeid);
4535+ err = tn->nodeid;
4536+ goto out;
4537+ }
4538+ }
c783755a 4539+
bb1d8b11
AM
4540+ /* Nope, OK, invent a suitable number */
4541+ retry:
4542+ try_nodeid -= 1;
4543+ list_for_each_entry(tn, &tempnode_list, list) {
c783755a 4544+
bb1d8b11
AM
4545+ if (tn->nodeid == try_nodeid)
4546+ goto retry;
4547+ }
c783755a 4548+
bb1d8b11
AM
4549+ tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
4550+ if (!tn)
4551+ goto out;
c783755a 4552+
bb1d8b11
AM
4553+ memcpy(tn->addr, addr, addrlen);
4554+ tn->addrlen = addrlen;
4555+ tn->nodeid = try_nodeid;
4556+ list_add_tail(&tn->list, &tempnode_list);
4557+ err = try_nodeid;
4558+ P_COMMS("new temp nodeid = %d\n", try_nodeid);
4559+ out:
4560+ up(&tempnode_lock);
4561+ return err;
4562+}
c783755a 4563+
bb1d8b11
AM
4564+static int is_valid_temp_nodeid(int nodeid)
4565+{
4566+ struct temp_node *tn;
4567+ int err = 1; /* true */
c783755a 4568+
bb1d8b11 4569+ down(&tempnode_lock);
c783755a 4570+
bb1d8b11
AM
4571+ list_for_each_entry(tn, &tempnode_list, list) {
4572+ if (tn->nodeid == nodeid)
4573+ goto out;
4574+ }
4575+ err = 0;
c783755a 4576+
bb1d8b11
AM
4577+ out:
4578+ P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
4579+ up(&tempnode_lock);
4580+ return err;
4581+}
c783755a 4582+
bb1d8b11
AM
4583+/*
4584+ * Remove any temp nodeIDs that refer to now-valid cluster members.
4585+ */
4586+void purge_temp_nodeids()
4587+{
4588+ struct temp_node *tn;
4589+ struct temp_node *tmp;
4590+ struct cluster_node *node;
4591+ struct cluster_node_addr *nodeaddr;
c783755a 4592+
c783755a 4593+
bb1d8b11
AM
4594+ down(&tempnode_lock);
4595+ down(&cluster_members_lock);
c783755a 4596+
bb1d8b11
AM
4597+ /*
4598+ * The ordering of these nested lists is deliberately
4599+ * arranged for the fewest list traversals overall
4600+ */
c783755a 4601+
bb1d8b11
AM
4602+ /* For each node... */
4603+ list_for_each_entry(node, &cluster_members_list, list) {
4604+ if (node->state == NODESTATE_MEMBER) {
4605+ /* ...We check the temp node ID list... */
4606+ list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
c783755a 4607+
bb1d8b11
AM
4608+ /* ...against that node's address */
4609+ list_for_each_entry(nodeaddr, &node->addr_list, list) {
c783755a 4610+
bb1d8b11
AM
4611+ if (memcmp(nodeaddr->addr, tn->addr, tn->addrlen) == 0) {
4612+ list_del(&tn->list);
4613+ kfree(tn);
4614+ }
4615+ }
4616+ }
4617+ }
4618+ }
4619+ up(&cluster_members_lock);
4620+ up(&tempnode_lock);
4621+}
c783755a 4622+
c783755a 4623+
bb1d8b11
AM
4624+/* Quorum device functions */
4625+int kcl_register_quorum_device(char *name, int votes)
4626+{
4627+ if (quorum_device)
4628+ return -EBUSY;
c783755a 4629+
bb1d8b11
AM
4630+ if (find_node_by_name(name))
4631+ return -EINVAL;
c783755a 4632+
bb1d8b11
AM
4633+ quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
4634+ if (!quorum_device)
4635+ return -ENOMEM;
4636+ memset(quorum_device, 0, sizeof (struct cluster_node));
c783755a 4637+
bb1d8b11
AM
4638+ quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
4639+ if (!quorum_device->name) {
4640+ kfree(quorum_device);
4641+ quorum_device = NULL;
4642+ return -ENOMEM;
4643+ }
c783755a 4644+
bb1d8b11
AM
4645+ strcpy(quorum_device->name, name);
4646+ quorum_device->votes = votes;
4647+ quorum_device->state = NODESTATE_DEAD;
c783755a 4648+
bb1d8b11
AM
4649+ /* Keep this list valid so it doesn't confuse other code */
4650+ INIT_LIST_HEAD(&quorum_device->addr_list);
c783755a 4651+
bb1d8b11
AM
4652+ return 0;
4653+}
c783755a 4654+
bb1d8b11
AM
4655+int kcl_unregister_quorum_device(void)
4656+{
4657+ if (!quorum_device)
4658+ return -EINVAL;
4659+ if (quorum_device->state == NODESTATE_MEMBER)
4660+ return -EINVAL;
c783755a 4661+
bb1d8b11 4662+ quorum_device = NULL;
c783755a 4663+
bb1d8b11
AM
4664+ return 0;
4665+}
c783755a 4666+
bb1d8b11
AM
4667+int kcl_quorum_device_available(int yesno)
4668+{
4669+ if (!quorum_device)
4670+ return -EINVAL;
c783755a 4671+
bb1d8b11
AM
4672+ if (yesno) {
4673+ quorum_device->last_hello = jiffies;
4674+ if (quorum_device->state == NODESTATE_DEAD) {
4675+ quorum_device->state = NODESTATE_MEMBER;
4676+ recalculate_quorum(0);
4677+ }
4678+ }
4679+ else {
4680+ if (quorum_device->state == NODESTATE_MEMBER) {
4681+ quorum_device->state = NODESTATE_DEAD;
4682+ recalculate_quorum(0);
4683+ }
4684+ }
c783755a 4685+
bb1d8b11
AM
4686+ return 0;
4687+}
c783755a 4688+
bb1d8b11
AM
4689+/* APIs for cluster ref counting. */
4690+int kcl_addref_cluster()
4691+{
4692+ int ret = -ENOTCONN;
c783755a 4693+
bb1d8b11
AM
4694+ if (!atomic_read(&cnxman_running))
4695+ goto addref_ret;
c783755a 4696+
bb1d8b11
AM
4697+ if (try_module_get(THIS_MODULE)) {
4698+ atomic_inc(&use_count);
4699+ ret = 0;
4700+ }
c783755a 4701+
bb1d8b11
AM
4702+ addref_ret:
4703+ return ret;
4704+}
c783755a 4705+
bb1d8b11
AM
4706+int kcl_releaseref_cluster()
4707+{
4708+ if (!atomic_read(&cnxman_running))
4709+ return -ENOTCONN;
4710+ atomic_dec(&use_count);
4711+ module_put(THIS_MODULE);
4712+ return 0;
4713+}
c783755a 4714+
bb1d8b11
AM
4715+int kcl_cluster_name(char **cname)
4716+{
4717+ char *name;
c783755a 4718+
bb1d8b11
AM
4719+ name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
4720+ if (!name)
4721+ return -ENOMEM;
4722+
4723+ strncpy(name, cluster_name, strlen(cluster_name)+1);
4724+ *cname = name;
4725+ return 0;
4726+}
4727+
4728+int kcl_get_current_interface(void)
4729+{
4730+ return current_interface->number;
4731+}
c783755a 4732+
bb1d8b11
AM
4733+/* Socket registration stuff */
4734+static struct net_proto_family cl_family_ops = {
4735+ .family = AF_CLUSTER,
4736+ .create = cl_create,
4737+ .owner = THIS_MODULE,
c783755a
AM
4738+};
4739+
bb1d8b11
AM
4740+static struct proto_ops cl_proto_ops = {
4741+ .family = AF_CLUSTER,
c783755a 4742+
bb1d8b11
AM
4743+ .release = cl_release,
4744+ .bind = cl_bind,
4745+ .connect = sock_no_connect,
4746+ .socketpair = sock_no_socketpair,
4747+ .accept = sock_no_accept,
4748+ .getname = cl_getname,
4749+ .poll = cl_poll,
4750+ .ioctl = cl_ioctl,
4751+ .listen = sock_no_listen,
4752+ .shutdown = cl_shutdown,
4753+ .setsockopt = sock_no_setsockopt,
4754+ .getsockopt = sock_no_getsockopt,
4755+ .sendmsg = cl_sendmsg,
4756+ .recvmsg = cl_recvmsg,
4757+ .mmap = sock_no_mmap,
4758+ .sendpage = sock_no_sendpage,
4759+ .owner = THIS_MODULE,
c783755a
AM
4760+};
4761+
bb1d8b11
AM
4762+#ifdef MODULE
4763+MODULE_DESCRIPTION("Cluster Connection and Service Manager");
4764+MODULE_AUTHOR("Red Hat, Inc");
4765+MODULE_LICENSE("GPL");
4766+#endif
c783755a 4767+
bb1d8b11
AM
4768+static int __init cluster_init(void)
4769+{
4770+ printk("CMAN %s (built %s %s) installed\n",
4771+ CMAN_RELEASE_NAME, __DATE__, __TIME__);
c783755a 4772+
bb1d8b11
AM
4773+ if (sock_register(&cl_family_ops)) {
4774+ printk(KERN_INFO "Unable to register cluster socket type\n");
4775+ return -1;
4776+ }
c783755a 4777+
bb1d8b11
AM
4778+ /* allocate our sock slab cache */
4779+ cluster_sk_cachep = kmem_cache_create("cluster_sock",
4780+ sizeof (struct cluster_sock), 0,
4781+ SLAB_HWCACHE_ALIGN, 0, 0);
4782+ if (!cluster_sk_cachep) {
4783+ printk(KERN_CRIT
4784+ "cluster_init: Cannot create cluster_sock SLAB cache\n");
4785+ sock_unregister(AF_CLUSTER);
4786+ return -1;
4787+ }
c1c6733f 4788+
bb1d8b11
AM
4789+#ifdef CONFIG_PROC_FS
4790+ create_proc_entries();
4791+#endif
c1c6733f 4792+
bb1d8b11
AM
4793+ init_MUTEX(&start_thread_sem);
4794+ init_MUTEX(&send_lock);
4795+ init_MUTEX(&barrier_list_lock);
4796+ init_MUTEX(&cluster_members_lock);
4797+ init_MUTEX(&port_array_lock);
4798+ init_MUTEX(&messages_list_lock);
4799+ init_MUTEX(&listenreq_lock);
4800+ init_MUTEX(&client_socket_lock);
4801+ init_MUTEX(&new_dead_node_lock);
4802+ init_MUTEX(&event_listener_lock);
4803+ init_MUTEX(&kernel_listener_lock);
4804+ init_MUTEX(&tempnode_lock);
4805+ spin_lock_init(&active_socket_lock);
4806+ init_timer(&ack_timer);
c1c6733f 4807+
bb1d8b11
AM
4808+ INIT_LIST_HEAD(&event_listener_list);
4809+ INIT_LIST_HEAD(&kernel_listener_list);
4810+ INIT_LIST_HEAD(&socket_list);
4811+ INIT_LIST_HEAD(&client_socket_list);
4812+ INIT_LIST_HEAD(&active_socket_list);
4813+ INIT_LIST_HEAD(&barrier_list);
4814+ INIT_LIST_HEAD(&messages_list);
4815+ INIT_LIST_HEAD(&listenreq_list);
4816+ INIT_LIST_HEAD(&cluster_members_list);
4817+ INIT_LIST_HEAD(&new_dead_node_list);
4818+ INIT_LIST_HEAD(&tempnode_list);
c1c6733f 4819+
bb1d8b11 4820+ atomic_set(&cnxman_running, 0);
c1c6733f 4821+
bb1d8b11
AM
4822+ sm_init();
4823+
4824+ return 0;
c1c6733f
AM
4825+}
4826+
bb1d8b11
AM
4827+static void __exit cluster_exit(void)
4828+{
4829+#ifdef CONFIG_PROC_FS
4830+ cleanup_proc_entries();
4831+#endif
c1c6733f 4832+
bb1d8b11
AM
4833+ sock_unregister(AF_CLUSTER);
4834+ kmem_cache_destroy(cluster_sk_cachep);
4835+}
c1c6733f 4836+
bb1d8b11
AM
4837+module_init(cluster_init);
4838+module_exit(cluster_exit);
c1c6733f 4839+
bb1d8b11
AM
4840+EXPORT_SYMBOL(kcl_sendmsg);
4841+EXPORT_SYMBOL(kcl_register_read_callback);
4842+EXPORT_SYMBOL(kcl_add_callback);
4843+EXPORT_SYMBOL(kcl_remove_callback);
4844+EXPORT_SYMBOL(kcl_get_members);
4845+EXPORT_SYMBOL(kcl_get_member_ids);
4846+EXPORT_SYMBOL(kcl_get_all_members);
4847+EXPORT_SYMBOL(kcl_is_quorate);
4848+EXPORT_SYMBOL(kcl_get_node_by_addr);
4849+EXPORT_SYMBOL(kcl_get_node_by_name);
4850+EXPORT_SYMBOL(kcl_get_node_by_nodeid);
4851+EXPORT_SYMBOL(kcl_get_node_addresses);
4852+EXPORT_SYMBOL(kcl_addref_cluster);
4853+EXPORT_SYMBOL(kcl_releaseref_cluster);
4854+EXPORT_SYMBOL(kcl_cluster_name);
c1c6733f 4855+
bb1d8b11
AM
4856+EXPORT_SYMBOL(kcl_barrier_register);
4857+EXPORT_SYMBOL(kcl_barrier_setattr);
4858+EXPORT_SYMBOL(kcl_barrier_delete);
4859+EXPORT_SYMBOL(kcl_barrier_wait);
4860+EXPORT_SYMBOL(kcl_barrier_cancel);
c1c6733f 4861+
bb1d8b11
AM
4862+EXPORT_SYMBOL(kcl_register_quorum_device);
4863+EXPORT_SYMBOL(kcl_unregister_quorum_device);
4864+EXPORT_SYMBOL(kcl_quorum_device_available);
c1c6733f 4865+
bb1d8b11
AM
4866+EXPORT_SYMBOL(kcl_register_service);
4867+EXPORT_SYMBOL(kcl_unregister_service);
4868+EXPORT_SYMBOL(kcl_join_service);
4869+EXPORT_SYMBOL(kcl_leave_service);
4870+EXPORT_SYMBOL(kcl_global_service_id);
4871+EXPORT_SYMBOL(kcl_start_done);
4872+EXPORT_SYMBOL(kcl_get_services);
4873+EXPORT_SYMBOL(kcl_get_current_interface);
c1c6733f 4874+
bb1d8b11
AM
4875+/*
4876+ * Overrides for Emacs so that we follow Linus's tabbing style.
4877+ * Emacs will notice this stuff at the end of the file and automatically
4878+ * adjust the settings for this buffer only. This must remain at the end
4879+ * of the file.
4880+ * ---------------------------------------------------------------------------
4881+ * Local variables:
4882+ * c-file-style: "linux"
4883+ * End:
4884+ */
c1c6733f 4885diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
bb1d8b11
AM
4886--- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
4887+++ linux-patched/cluster/cman/config.c 2004-11-03 11:37:37.000000000 +0800
c783755a 4888@@ -0,0 +1,49 @@
c1c6733f
AM
4889+/******************************************************************************
4890+*******************************************************************************
4891+**
4892+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4893+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4894+**
4895+** This copyrighted material is made available to anyone wishing to use,
4896+** modify, copy, or redistribute it subject to the terms and conditions
4897+** of the GNU General Public License v.2.
4898+**
4899+*******************************************************************************
4900+******************************************************************************/
4901+
4902+#include "config.h"
4903+
4904+/* Config file defaults */
4905+
b7b72b66 4906+#define DEFAULT_JOIN_WAIT_TIME 16 /* Time to wait while sending JOINREQ
c1c6733f 4907+ * messages. Should be at least twice
b7b72b66 4908+ * the HELLO timer, probably 3x */
c1c6733f
AM
4909+#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
4910+ * JOINACK to regarding that node as
4911+ * dead */
4912+#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
4913+#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
4914+ * node in this period kill it */
4915+#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
4916+ * should take */
4917+#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
4918+ * a JOINCONF message */
4919+#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
4920+#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
4921+ * restarts before we die */
4922+#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
4923+
c783755a
AM
4924+#define DEFAULT_NEWCLUSTER_TIMEOUT 16 /* Time to send NEWCLUSTER messages */
4925+
c1c6733f
AM
4926+struct config_info cman_config = {
4927+ .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
4928+ .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
4929+ .join_timeout = DEFAULT_JOIN_TIMEOUT,
4930+ .hello_timer = DEFAULT_HELLO_TIMER,
4931+ .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
4932+ .transition_timeout = DEFAULT_TRANSITION_TIMER,
4933+ .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
4934+ .max_nodes = DEFAULT_MAX_NODES,
4935+ .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
c783755a 4936+ .newcluster_timeout = DEFAULT_NEWCLUSTER_TIMEOUT,
c1c6733f
AM
4937+};
4938diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
bb1d8b11
AM
4939--- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
4940+++ linux-patched/cluster/cman/config.h 2004-11-03 11:37:37.000000000 +0800
c783755a 4941@@ -0,0 +1,32 @@
c1c6733f
AM
4942+/******************************************************************************
4943+*******************************************************************************
4944+**
4945+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4946+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4947+**
4948+** This copyrighted material is made available to anyone wishing to use,
4949+** modify, copy, or redistribute it subject to the terms and conditions
4950+** of the GNU General Public License v.2.
4951+**
4952+*******************************************************************************
4953+******************************************************************************/
4954+
4955+#ifndef __CONFIG_DOT_H__
4956+#define __CONFIG_DOT_H__
4957+
4958+struct config_info {
4959+ int joinwait_timeout;
4960+ int joinconf_timeout;
4961+ int join_timeout;
4962+ int hello_timer;
4963+ int deadnode_timeout;
4964+ int transition_timeout;
4965+ int transition_restarts;
4966+ int max_nodes;
4967+ int sm_debug_size;
c783755a 4968+ int newcluster_timeout;
c1c6733f
AM
4969+};
4970+
4971+extern struct config_info cman_config;
4972+
4973+#endif /* __CONFIG_DOT_H__ */
4974diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
bb1d8b11
AM
4975--- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
4976+++ linux-patched/cluster/cman/kjoin.c 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
4977@@ -0,0 +1,238 @@
4978+/******************************************************************************
4979+*******************************************************************************
4980+**
4981+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4982+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4983+**
4984+** This copyrighted material is made available to anyone wishing to use,
4985+** modify, copy, or redistribute it subject to the terms and conditions
4986+** of the GNU General Public License v.2.
4987+**
4988+*******************************************************************************
4989+******************************************************************************/
4990+
4991+#include <linux/socket.h>
4992+#include <net/sock.h>
4993+#include <linux/list.h>
4994+#include <cluster/cnxman.h>
4995+#include <linux/in.h>
4996+
4997+#include "cnxman-private.h"
4998+
4999+static struct socket *mcast_sock;
5000+static struct socket *recv_sock;
5001+static struct socket *cluster_sock;
5002+
5003+extern short cluster_id;
5004+extern int join_count;
5005+extern struct semaphore join_count_lock;
5006+extern atomic_t cnxman_running;
5007+
5008+int kcl_join_cluster(struct cl_join_cluster_info *join_info)
5009+{
5010+ int result;
5011+ int one = 1, error;
5012+ unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
5013+ unsigned short port = join_info->port;
5014+ mm_segment_t fs;
5015+ struct sockaddr_in saddr;
5016+ struct kcl_multicast_sock mcast_info;
5017+
5018+ down(&join_count_lock);
5019+ if (atomic_read(&cnxman_running))
5020+ {
5021+ error = 0;
5022+ if (join_info->cluster_id == cluster_id)
5023+ join_count++;
5024+ else
5025+ error = -EINVAL;
5026+ up(&join_count_lock);
5027+ return error;
5028+ }
5029+ up(&join_count_lock);
5030+
5031+ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
5032+ if (result < 0)
5033+ {
5034+ printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
5035+ return result;
5036+ }
5037+
5038+ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
5039+ if (result < 0)
5040+ {
5041+ printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
5042+ return result;
5043+ }
5044+
5045+ fs = get_fs();
5046+ set_fs(get_ds());
5047+
5048+ if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
5049+ (void *) &one, sizeof (int))))
5050+ {
5051+ set_fs(fs);
5052+ printk("Error %d Setting master socket to SO_BROADCAST\n",
5053+ error);
5054+ sock_release(mcast_sock);
5055+ return -1;
5056+ }
5057+ set_fs(fs);
5058+
5059+ /* Bind the multicast socket */
5060+ saddr.sin_family = AF_INET;
5061+ saddr.sin_port = htons(port);
5062+ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
5063+ result =
5064+ mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
5065+ sizeof (saddr));
5066+ if (result < 0)
5067+ {
5068+ printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
5069+ sock_release(mcast_sock);
5070+ sock_release(recv_sock);
5071+ return result;
5072+ }
5073+
5074+ /* Bind the receive socket to our IP address */
5075+ saddr.sin_family = AF_INET;
5076+ saddr.sin_port = htons(port);
5077+ saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
5078+ result =
5079+ recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
5080+ sizeof (saddr));
5081+ if (result < 0)
5082+ {
5083+ printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
5084+ sock_release(mcast_sock);
5085+ sock_release(recv_sock);
5086+ return result;
5087+ }
5088+
5089+ /* Create the cluster master socket */
5090+ result =
5091+ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
5092+ if (result < 0)
5093+ {
5094+ printk(KERN_ERR CMAN_NAME
5095+ ": Can't create cluster master socket\n");
5096+ sock_release(mcast_sock);
5097+ sock_release(recv_sock);
5098+ return result;
5099+ }
5100+
5101+ /* This is the broadcast transmit address */
5102+ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
5103+
5104+ /* Pass the multicast socket to kernel space */
5105+ mcast_info.sock = mcast_sock;
5106+ mcast_info.number = 1;
5107+
5108+ fs = get_fs();
5109+ set_fs(get_ds());
5110+
5111+ if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
5112+ KCL_SET_MULTICAST,
5113+ (void *) &mcast_info,
5114+ sizeof (mcast_info))))
5115+ {
5116+ set_fs(fs);
5117+ printk(CMAN_NAME
5118+ ": Unable to pass multicast socket to cnxman, %d\n",
5119+ error);
5120+ sock_release(mcast_sock);
5121+ sock_release(recv_sock);
5122+ sock_release(cluster_sock);
5123+ return -1;
5124+ }
5125+
5126+ mcast_info.sock = recv_sock;
5127+ if ((error =
5128+ cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
5129+ KCL_SET_RCVONLY,
5130+ (void *) &mcast_info,
5131+ sizeof (mcast_info))))
5132+ {
5133+ set_fs(fs);
5134+ printk(CMAN_NAME
5135+ ": Unable to pass receive socket to cnxman, %d\n",
5136+ error);
5137+ sock_release(mcast_sock);
5138+ sock_release(recv_sock);
5139+ sock_release(cluster_sock);
5140+ return -1;
5141+ }
5142+
5143+ /* This setsockopt expects usermode variables */
5144+
5145+ if (cluster_sock->ops->
5146+ setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
5147+ (void *) join_info,
5148+ sizeof (struct cl_join_cluster_info)))
5149+
5150+ {
5151+ set_fs(fs);
5152+ printk(CMAN_NAME ": Unable to join cluster\n");
5153+ sock_release(mcast_sock);
5154+ sock_release(recv_sock);
5155+ sock_release(cluster_sock);
5156+ return -1;
5157+ }
5158+ set_fs(fs);
5159+
5160+ return 0;
5161+}
5162+
5163+int kcl_leave_cluster(int remove)
5164+{
5165+ mm_segment_t fs;
5166+ int rem = remove;
5167+ int ret = 0;
5168+ struct socket *shutdown_sock = cluster_sock;
5169+
5170+ cluster_sock = NULL;
5171+
5172+ if (!shutdown_sock)
5173+ {
5174+ /* Create the cluster master socket */
5175+ int result =
5176+ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
5177+ &shutdown_sock);
5178+ if (result < 0)
5179+ {
5180+ printk(KERN_ERR CMAN_NAME
5181+ ": Can't create cluster master socket\n");
5182+ sock_release(mcast_sock);
5183+ sock_release(recv_sock);
5184+ return result;
5185+ }
5186+ }
5187+
5188+ fs = get_fs();
5189+ set_fs(get_ds());
5190+
5191+ if ((ret =
5192+ shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
5193+ CLU_LEAVE_CLUSTER, (void *) &rem,
5194+ sizeof (int))))
5195+ {
5196+ printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
5197+ ret);
5198+ }
5199+ set_fs(fs);
5200+
5201+ sock_release(shutdown_sock);
5202+
5203+ return ret;
5204+}
5205+
5206+/*
5207+ * Overrides for Emacs so that we follow Linus's tabbing style.
5208+ * Emacs will notice this stuff at the end of the file and automatically
5209+ * adjust the settings for this buffer only. This must remain at the end
5210+ * of the file.
5211+ * ---------------------------------------------------------------------------
5212+ * Local variables:
5213+ * c-file-style: "linux"
5214+ * End:
5215+ */
5216diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
bb1d8b11
AM
5217--- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
5218+++ linux-patched/cluster/cman/membership.c 2004-11-03 11:37:37.000000000 +0800
5219@@ -0,0 +1,3160 @@
c1c6733f
AM
5220+/******************************************************************************
5221+*******************************************************************************
5222+**
5223+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5224+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5225+**
5226+** This copyrighted material is made available to anyone wishing to use,
5227+** modify, copy, or redistribute it subject to the terms and conditions
5228+** of the GNU General Public License v.2.
5229+**
5230+*******************************************************************************
5231+******************************************************************************/
5232+
5233+#include <linux/socket.h>
5234+#include <net/sock.h>
5235+#include <linux/slab.h>
5236+#include <linux/spinlock.h>
5237+#include <linux/vmalloc.h>
5238+#include <asm/uaccess.h>
5239+#include <linux/list.h>
5240+#include <cluster/cnxman.h>
5241+
5242+#include "cnxman-private.h"
5243+#include "config.h"
5244+#include "sm_control.h"
5245+
5246+#ifndef TRUE
5247+#define TRUE 1
5248+#endif
5249+
5250+/* Barrier name for membership transitions. %d is the cluster generation number
5251+ */
5252+#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
5253+
5254+/* Variables also used by connection manager */
5255+struct list_head cluster_members_list;
5256+struct semaphore cluster_members_lock;
5257+int cluster_members; /* Number of ACTIVE members, not a count of
5258+ * nodes in the list */
b7b72b66 5259+int we_are_a_cluster_member;
c1c6733f 5260+int cluster_is_quorate;
b7b72b66 5261+int quit_threads;
c1c6733f
AM
5262+struct task_struct *membership_task;
5263+struct cluster_node *us;
5264+
5265+static struct task_struct *hello_task;
5266+static struct semaphore hello_task_lock;
5267+
5268+/* Variables that belong to the connection manager */
5269+extern wait_queue_head_t cnxman_waitq;
5270+extern struct completion member_thread_comp;
5271+extern struct cluster_node *quorum_device;
5272+extern unsigned short two_node;
5273+extern char cluster_name[];
5274+extern unsigned int config_version;
5275+extern unsigned int address_length;
5276+
5277+static struct socket *mem_socket;
5278+static pid_t kcluster_pid;
5279+
5280+static char iobuf[MAX_CLUSTER_MESSAGE];
5281+static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
5282+
5283+/* Our node name, usually system_utsname.nodename, but can be overridden */
5284+char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
5285+
c783755a
AM
5286+/* Node ID that we want. defaults of zero means
5287+ * it will be allocated by the cluster join mechanism
5288+ */
5289+int wanted_nodeid;
5290+
c1c6733f 5291+static spinlock_t members_by_nodeid_lock;
b7b72b66 5292+static int sizeof_members_array; /* Can dynamically increase (vmalloc
c1c6733f
AM
5293+ * permitting) */
5294+static struct cluster_node **members_by_nodeid;
5295+
5296+#define MEMBER_INCREMENT_SIZE 10
5297+
5298+static int votes = 1; /* Votes this node has */
5299+static int expected_votes = 1; /* Total expected votes in the cluster */
5300+static unsigned int quorum; /* Quorum, fewer votes than this and we stop
5301+ * work */
5302+static int leavereason; /* Saved for the duration of a state transition */
5303+static int transitionreason; /* Reason this transition was initiated */
5304+static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
5305+static struct timer_list transition_timer; /* Kicks in if the transition
5306+ * doesn't complete in a
5307+ * reasonable time */
5308+static struct timer_list hello_timer; /* Timer to send HELLOs on */
5309+static unsigned long join_time; /* The time that we got our JOIN-ACK */
5310+static unsigned long start_time; /* The time that we were started */
5311+static int joinconf_count; /* Number of JOINCONF messages we have sent to
5312+ * a new node */
5313+static unsigned long wake_flags;/* Reason we were woken */
5314+
5315+/* Flags in above */
5316+#define WAKE_FLAG_DEADNODE 1
5317+#define WAKE_FLAG_TRANSTIMER 2
5318+
5319+/* The time the transition finished */
5320+static unsigned long transition_end_time;
5321+
5322+/* A list of nodes that cnxman tells us are dead. I hope this never has more
5323+ * than one element in it but I can't take that chance. only non-static so it
5324+ * can be initialised in module_load. */
5325+struct list_head new_dead_node_list;
5326+struct semaphore new_dead_node_lock;
5327+
bb1d8b11
AM
5328+static int do_membership_packet(struct msghdr *msg, char *buf, int len);
5329+static int do_process_joinreq(struct msghdr *msg, char *buf, int len);
5330+static int do_process_joinack(struct msghdr *msg, char *buf, int len);
5331+static int do_process_joinconf(struct msghdr *msg, char *buf, int len);
5332+static int do_process_leave(struct msghdr *msg, char *buf, int len);
5333+static int do_process_hello(struct msghdr *msg, char *buf, int len);
5334+static int do_process_kill(struct msghdr *msg, char *buf, int len);
5335+static int do_process_reconfig(struct msghdr *msg, char *buf, int len);
5336+static int do_process_starttrans(struct msghdr *msg, char *buf, int len);
5337+static int do_process_masterview(struct msghdr *msg, char *buf, int len);
5338+static int do_process_endtrans(struct msghdr *msg, char *buf, int len);
5339+static int do_process_viewack(struct msghdr *msg, char *buf, int len);
5340+static int do_process_startack(struct msghdr *msg, char *buf, int len);
5341+static int do_process_newcluster(struct msghdr *msg, char *buf, int len);
5342+static int do_process_nominate(struct msghdr *msg, char *buf, int len);
c1c6733f 5343+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
b7b72b66 5344+ unsigned int flags, unsigned int flags2);
c1c6733f
AM
5345+static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
5346+static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
5347+static int send_hello(void);
5348+static int send_master_hello(void);
5349+static int send_newcluster(void);
5350+static int end_transition(void);
5351+static int dispatch_messages(struct socket *mem_socket);
5352+static void check_for_dead_nodes(void);
5353+static void confirm_joiner(void);
5354+static void reset_hello_time(void);
5355+static int add_us(void);
5356+static int send_joinconf(void);
5357+static int init_membership_services(void);
5358+static int elect_master(struct cluster_node **);
5359+static void trans_timer_expired(unsigned long arg);
5360+static void hello_timer_expired(unsigned long arg);
5361+static void join_or_form_cluster(void);
5362+static int do_timer_wakeup(void);
5363+static int start_transition(unsigned char reason, struct cluster_node *node);
c783755a 5364+static uint32_t low32_of_ip(void);
c1c6733f
AM
5365+int send_leave(unsigned char);
5366+int send_reconfigure(int, unsigned int);
5367+
5368+#ifdef DEBUG_MEMB
5369+static char *msgname(int msg);
5370+static int debug_sendmsg(struct socket *sock, void *buf, int size,
5371+ struct sockaddr_cl *caddr, int addr_len,
5372+ unsigned int flags)
5373+{
5374+ P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
5375+ size);
5376+ return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
5377+}
5378+
5379+#define kcl_sendmsg debug_sendmsg
5380+#endif
5381+
5382+/* State of the node */
c783755a 5383+static enum { STARTING, NEWCLUSTER, JOINING, JOINWAIT, JOINACK, TRANSITION,
c1c6733f 5384+ TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
c783755a 5385+} node_state = LEFT_CLUSTER;
c1c6733f
AM
5386+
5387+/* Sub-state when we are MASTER */
5388+static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
5389+ MASTER_COMPLETE } master_state;
5390+
5391+/* Number of responses collected while a master controlling a state transition */
5392+static int responses_collected;
5393+static int responses_expected;
5394+
5395+/* Current cluster generation number */
c783755a 5396+int cluster_generation = 1;
c1c6733f
AM
5397+
5398+/* When another node initiates a transtion then store it's pointer in here so
5399+ * we can check for other nodes trying to spoof us */
5400+static struct cluster_node *master_node = NULL;
5401+
5402+/* Struct the node wanting to join us */
5403+static struct cluster_node *joining_node = NULL;
b7b72b66 5404+static int joining_temp_nodeid;
c1c6733f
AM
5405+
5406+/* Last time a HELLO message was sent */
b7b72b66 5407+unsigned long last_hello;
c1c6733f
AM
5408+
5409+/* When we got our JOINWAIT or NEWCLUSTER */
b7b72b66 5410+unsigned long joinwait_time;
c1c6733f
AM
5411+
5412+/* Number of times a transition has restarted when we were master */
b7b72b66 5413+int transition_restarts;
c1c6733f
AM
5414+
5415+/* Variables used by the master to collect cluster status during a transition */
b7b72b66
AM
5416+static int agreeing_nodes;
5417+static int dissenting_nodes;
c1c6733f
AM
5418+static uint8_t *node_opinion = NULL;
5419+#define OPINION_AGREE 1
5420+#define OPINION_DISAGREE 2
5421+
5422+/* Set node id of a node, also add it to the members array and expand the array
5423+ * if necessary */
5424+static inline void set_nodeid(struct cluster_node *node, int nodeid)
5425+{
5426+ if (!nodeid)
5427+ return;
5428+
5429+ node->node_id = nodeid;
c783755a 5430+ if (nodeid >= sizeof_members_array) {
c1c6733f 5431+ int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
c783755a
AM
5432+ struct cluster_node **new_array;
5433+
5434+ if (new_size < nodeid)
5435+ new_size = nodeid + MEMBER_INCREMENT_SIZE;
5436+
5437+ new_array = vmalloc((new_size) * sizeof (struct cluster_node *));
c1c6733f
AM
5438+ if (new_array) {
5439+ spin_lock(&members_by_nodeid_lock);
5440+ memcpy(new_array, members_by_nodeid,
5441+ sizeof_members_array *
5442+ sizeof (struct cluster_node *));
5443+ memset(&new_array[sizeof_members_array], 0,
c783755a 5444+ (new_size - sizeof_members_array) *
c1c6733f
AM
5445+ sizeof (struct cluster_node *));
5446+ vfree(members_by_nodeid);
c783755a 5447+
c1c6733f
AM
5448+ members_by_nodeid = new_array;
5449+ sizeof_members_array = new_size;
5450+ spin_unlock(&members_by_nodeid_lock);
5451+ }
5452+ else {
5453+ panic("No memory for more nodes");
5454+ }
5455+ }
5456+ notify_kernel_listeners(NEWNODE, (long) nodeid);
5457+
5458+ spin_lock(&members_by_nodeid_lock);
5459+ members_by_nodeid[nodeid] = node;
5460+ spin_unlock(&members_by_nodeid_lock);
5461+}
5462+
5463+static int hello_kthread(void *unused)
5464+{
5465+ struct task_struct *tsk = current;
5466+ sigset_t tmpsig;
5467+
5468+ daemonize("cman_hbeat");
5469+
5470+ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5471+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5472+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5473+
5474+ down(&hello_task_lock);
5475+ hello_task = tsk;
5476+ up(&hello_task_lock);
5477+
5478+ set_user_nice(current, -6);
5479+
5480+ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
c1c6733f
AM
5481+
5482+ /* Scan the nodes list for dead nodes */
5483+ if (node_state == MEMBER)
5484+ check_for_dead_nodes();
5485+
5486+ set_task_state(current, TASK_INTERRUPTIBLE);
5487+ schedule();
5488+ set_task_state(current, TASK_RUNNING);
b7b72b66
AM
5489+
5490+ if (node_state != REJECTED && node_state != LEFT_CLUSTER)
5491+ send_hello();
c1c6733f
AM
5492+ }
5493+ down(&hello_task_lock);
5494+ hello_task = NULL;
5495+ up(&hello_task_lock);
5496+ P_MEMB("heartbeat closing down\n");
5497+ return 0;
5498+}
5499+
5500+/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
5501+ * that keeps track of and controls cluster membership. */
5502+static int membership_kthread(void *unused)
5503+{
5504+ struct task_struct *tsk = current;
c1c6733f
AM
5505+ sigset_t tmpsig;
5506+
5507+ daemonize("cman_memb");
5508+
5509+ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5510+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5511+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5512+
5513+ membership_task = tsk;
5514+ set_user_nice(current, -5);
5515+
5516+ /* Open the socket */
5517+ if (init_membership_services())
5518+ return -1;
5519+
5520+ add_us();
5521+ joining_node = us;
5522+
5523+ init_timer(&hello_timer);
5524+ hello_timer.function = hello_timer_expired;
5525+ hello_timer.data = 0L;
5526+
5527+ /* Do joining stuff */
5528+ join_or_form_cluster();
5529+
5530+ transition_end_time = jiffies;
5531+
5532+ /* Main loop */
5533+ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5534+
5535+ struct task_struct *tsk = current;
5536+
5537+ DECLARE_WAITQUEUE(wait, tsk);
5538+
5539+ tsk->state = TASK_INTERRUPTIBLE;
5540+ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5541+
5542+ if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
5543+ wake_flags == 0) {
5544+ if (node_state == JOINACK ||
5545+ node_state == JOINWAIT)
5546+ schedule_timeout(HZ);
5547+ else
5548+ schedule();
5549+ }
5550+
5551+ tsk->state = TASK_RUNNING;
5552+ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5553+
5554+ /* Are we being shut down? */
5555+ if (node_state == LEFT_CLUSTER || quit_threads ||
5556+ signal_pending(current))
5557+ break;
5558+
5559+ /* Were we woken by a dead node passed down from cnxman ? */
5560+ if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
5561+ struct list_head *nodelist, *tmp;
5562+ struct cl_new_dead_node *deadnode;
5563+
5564+ down(&new_dead_node_lock);
5565+ list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
5566+ deadnode =
5567+ list_entry(nodelist,
5568+ struct cl_new_dead_node, list);
5569+
5570+ if (deadnode->node->state == NODESTATE_MEMBER)
5571+ a_node_just_died(deadnode->node);
5572+ list_del(&deadnode->list);
5573+ kfree(deadnode);
5574+ }
5575+ up(&new_dead_node_lock);
5576+ }
5577+
5578+ /* Process received messages. If dispatch_message() returns an
5579+ * error then we shut down */
5580+ if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5581+ if (dispatch_messages(mem_socket) < 0)
5582+ goto leave_cluster;
5583+
5584+ }
5585+
5586+ /* Were we woken by the transition timer firing ? */
5587+ if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
5588+ switch (do_timer_wakeup()) {
5589+ case -1:
5590+ continue;
5591+ case 0:
5592+ break;
5593+ case +1:
5594+ goto leave_cluster;
5595+ }
5596+ }
5597+
5598+ /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
5599+ * messages again */
c783755a
AM
5600+ if (node_state == JOINACK &&
5601+ time_after(jiffies,
5602+ join_time + cman_config.join_timeout * HZ)) {
c1c6733f
AM
5603+ P_MEMB
5604+ ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
5605+ node_state = JOINWAIT;
5606+ joinwait_time = jiffies;
5607+ }
5608+
c783755a
AM
5609+ /* Have we had an ACK for our JOINREQ message ? */
5610+ if (node_state == JOINING &&
5611+ time_after(jiffies,
5612+ join_time + cman_config.join_timeout * HZ)) {
5613+ P_MEMB("didn't get JOINACK, going back to JOINWAIT\n");
5614+ node_state = JOINWAIT;
5615+ joinwait_time = jiffies;
5616+ }
5617+
c1c6733f 5618+ /* Have we been in joinwait for too long... */
c783755a
AM
5619+ if (node_state == JOINWAIT &&
5620+ time_after(jiffies,
5621+ joinwait_time + cman_config.joinwait_timeout * HZ)) {
c1c6733f
AM
5622+ printk(CMAN_NAME
5623+ ": Been in JOINWAIT for too long - giving up\n");
5624+ goto leave_cluster;
5625+ }
5626+ }
5627+
5628+ leave_cluster:
5629+
5630+ /* Wake up the heartbeat thread so it can exit */
5631+ down(&hello_task_lock);
5632+ if (hello_task)
5633+ wake_up_process(hello_task);
5634+ up(&hello_task_lock);
5635+
5636+ if (timer_pending(&hello_timer))
5637+ del_timer(&hello_timer);
5638+
5639+ if (timer_pending(&transition_timer))
5640+ del_timer(&transition_timer);
5641+
5642+ node_state = LEFT_CLUSTER;
5643+ P_MEMB("closing down\n");
5644+ quit_threads = 1; /* force other thread to exit too */
5645+
c783755a
AM
5646+ send_leave(us->leave_reason);
5647+ sock_release(mem_socket);
c1c6733f
AM
5648+ highest_nodeid = 0;
5649+ complete(&member_thread_comp);
5650+ return 0;
5651+}
5652+
5653+/* Things to do in the main thread when the transition timer has woken us.
5654+ * Usually this happens when a transition is taking too long and we need to
5655+ * take remedial action.
5656+ *
5657+ * returns: -1 continue; 0 carry on processing +1 leave cluster; */
5658+static int do_timer_wakeup()
5659+{
5660+ P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
5661+
5662+ /* Resend JOINCONF if it got lost on the wire */
5663+ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
5664+ mod_timer(&transition_timer,
5665+ jiffies + cman_config.joinconf_timeout * HZ);
5666+ if (++joinconf_count < MAX_RETRIES) {
5667+ P_MEMB("Resending JOINCONF\n");
5668+ send_joinconf();
5669+ }
5670+ else {
5671+ P_MEMB("JOINCONF not acked, cancelling transition\n");
5672+ end_transition();
5673+ }
5674+ return -1;
5675+ }
5676+
5677+ /* A joining node probably died */
5678+ if (cluster_members == 1) {
5679+ end_transition();
5680+ return -1;
5681+ }
5682+
5683+ /* See if the master is still there */
5684+ if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
5685+
5686+ /* If we are in transition and master_node is NULL then we are
5687+ * waiting for ENDTRANS after JOIN-CONF */
5688+ if (!master_node) {
5689+ /* Hmmm. master died after sending JOINCONF, we'll have
5690+ * to die as we are in mid-transition */
5691+ printk(KERN_INFO CMAN_NAME
5692+ ": Master died after JOINCONF, we must leave the cluster\n");
5693+ quit_threads = 1;
5694+ return +1;
5695+ }
5696+
5697+ /* No messages from the master - see if it's stil there */
5698+ if (master_node->state == NODESTATE_MEMBER) {
5699+ send_master_hello();
5700+ mod_timer(&transition_timer,
5701+ jiffies +
5702+ cman_config.transition_timeout * HZ);
5703+ }
5704+
5705+ /* If the master is dead then elect a new one */
5706+ if (master_node->state == NODESTATE_DEAD) {
5707+
5708+ struct cluster_node *node;
5709+
5710+ P_MEMB("Master node is dead...Election!\n");
5711+ if (elect_master(&node)) {
5712+
5713+ /* We are master now, all kneel */
5714+ start_transition(TRANS_DEADMASTER, master_node);
5715+ }
5716+ else {
5717+ /* Leave the job to someone on more pay */
5718+ master_node = node;
5719+ mod_timer(&transition_timer,
5720+ jiffies +
5721+ cman_config.transition_timeout * HZ);
5722+ }
5723+ }
5724+ }
5725+
5726+ /* If we are the master node then restart the transition */
5727+ if (node_state == MASTER) {
5728+ start_transition(TRANS_RESTART, us);
5729+ }
5730+
5731+ return 0;
5732+}
5733+
5734+static void form_cluster(void)
5735+{
5736+ printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
5737+ node_state = MEMBER;
5738+ we_are_a_cluster_member = TRUE;
c1c6733f 5739+ us->state = NODESTATE_MEMBER;
c783755a
AM
5740+ if (wanted_nodeid)
5741+ set_nodeid(us, wanted_nodeid);
5742+ else
5743+ set_nodeid(us, 1);
c1c6733f
AM
5744+ recalculate_quorum(0);
5745+ sm_member_update(cluster_is_quorate);
5746+ send_hello();
5747+ kernel_thread(hello_kthread, NULL, 0);
5748+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
5749+}
5750+
5751+/* This does the initial JOIN part of the membership process. Actually most of
5752+ * is done in the message processing routines but this is the main loop that
5753+ * controls it. The side-effect of this routine is "node_state" which tells the
5754+ * real main loop (in the kernel thread routine) what to do next */
5755+static void join_or_form_cluster()
5756+{
5757+ start_time = jiffies;
5758+
5759+ printk(KERN_INFO CMAN_NAME
5760+ ": Waiting to join or form a Linux-cluster\n");
c783755a
AM
5761+
5762+ restart_joinwait:
c1c6733f
AM
5763+ join_time = 0;
5764+ start_time = jiffies;
5765+ joinwait_time = jiffies;
5766+ last_hello = 0;
c1c6733f 5767+
c783755a 5768+ /* Listen for HELLO or NEWCLUSTER messages */
c1c6733f
AM
5769+ do {
5770+ DECLARE_WAITQUEUE(wait, current);
5771+ set_task_state(current, TASK_INTERRUPTIBLE);
5772+ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5773+
5774+ if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5775+ schedule_timeout((cman_config.joinwait_timeout * HZ) /
5776+ 5);
5777+
5778+ set_task_state(current, TASK_RUNNING);
5779+ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5780+
5781+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5782+ dispatch_messages(mem_socket);
5783+ }
5784+ if (quit_threads)
5785+ node_state = LEFT_CLUSTER;
5786+
5787+ }
5788+ while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
5789+ node_state == STARTING);
5790+
c1c6733f 5791+ if (node_state == STARTING) {
c783755a
AM
5792+ start_time = jiffies;
5793+ joinwait_time = jiffies;
5794+ node_state = NEWCLUSTER;
5795+ }
5796+
5797+ /* If we didn't hear any HELLO messages then start sending NEWCLUSTER messages */
5798+ while (time_before(jiffies, start_time + cman_config.newcluster_timeout * HZ) &&
5799+ node_state == NEWCLUSTER) {
5800+
5801+ DECLARE_WAITQUEUE(wait, current);
5802+
5803+ send_newcluster();
5804+
5805+ set_task_state(current, TASK_INTERRUPTIBLE);
5806+ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5807+
5808+ if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5809+ schedule_timeout((cman_config.joinwait_timeout * HZ) /
5810+ 5);
5811+
5812+ set_task_state(current, TASK_RUNNING);
5813+ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5814+
5815+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5816+ dispatch_messages(mem_socket);
5817+ }
5818+ /* Did we get a lower "NEWCLUSTER" message ? */
5819+ if (node_state == STARTING) {
5820+ P_MEMB("NEWCLUSTER: restarting joinwait\n");
5821+ goto restart_joinwait;
5822+ }
5823+
5824+ if (quit_threads)
5825+ node_state = LEFT_CLUSTER;
5826+
5827+ }
5828+
5829+
5830+ /* If we didn't hear any HELLO messages then form a new cluster */
5831+ if (node_state == NEWCLUSTER) {
c1c6733f
AM
5832+ form_cluster();
5833+ }
5834+ else
5835+ last_hello = jiffies;
5836+
5837+}
5838+
5839+int start_membership_services(pid_t cluster_pid)
5840+{
5841+ kcluster_pid = cluster_pid;
5842+
5843+ init_timer(&transition_timer);
5844+ transition_timer.function = trans_timer_expired;
5845+ transition_timer.data = 0L;
5846+
5847+ /* Start the thread */
5848+ return kernel_thread(membership_kthread, NULL, 0);
5849+}
5850+
5851+static int init_membership_services()
5852+{
5853+ int result;
5854+ struct sockaddr_cl saddr;
5855+ struct socket *sock;
5856+
5857+ init_MUTEX(&hello_task_lock);
5858+ /* Create a socket to communicate with */
5859+ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
5860+ if (result < 0) {
5861+ printk(KERN_ERR CMAN_NAME
5862+ ": Can't create cluster socket for membership services\n");
5863+ return result;
5864+ }
5865+ mem_socket = sock;
5866+
5867+ /* Bind to our port */
5868+ saddr.scl_family = AF_CLUSTER;
5869+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5870+ result =
5871+ sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
5872+ if (result < 0) {
5873+ printk(KERN_ERR CMAN_NAME
5874+ ": Can't bind to cluster membership services port\n");
5875+ sock_release(sock);
5876+ return result;
5877+ }
5878+
5879+ node_state = STARTING;
5880+ return 0;
5881+}
5882+
5883+static int send_joinconf()
5884+{
5885+ struct sockaddr_cl saddr;
5886+ int status;
5887+
5888+ if (joining_temp_nodeid == 0) {
5889+ BUG();
5890+ }
5891+
5892+ master_state = MASTER_CONFIRM;
5893+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5894+ saddr.scl_family = AF_CLUSTER;
5895+ saddr.scl_nodeid = joining_temp_nodeid;
5896+ status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
b7b72b66 5897+ MSG_NOACK, 0);
c1c6733f
AM
5898+
5899+ if (status < 0) {
5900+ printk("Error %d sending JOINCONF, aborting transition\n", status);
5901+ end_transition();
5902+ }
5903+ return status;
5904+}
5905+
5906+static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
5907+{
5908+ char *msgbuf = scratchbuf;
5909+ struct list_head *addrlist;
5910+ int ptr = sizeof (struct cl_mem_join_msg);
5911+ unsigned short num_addr = 0;
5912+ struct cluster_node_addr *nodeaddr;
5913+ struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
5914+
5915+ msg->cmd = CLUSTER_MEM_JOINREQ;
5916+ msg->votes = votes;
5917+ msg->expected_votes = cpu_to_le32(expected_votes);
c783755a
AM
5918+ msg->nodeid = cpu_to_le32(wanted_nodeid);
5919+ msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
5920+ msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
5921+ msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
c1c6733f
AM
5922+ msg->config_version = cpu_to_le32(config_version);
5923+ msg->addr_len = cpu_to_le32(address_length);
5924+ strcpy(msg->clustername, cluster_name);
5925+
5926+ /* Add our addresses */
5927+ list_for_each(addrlist, &us->addr_list) {
5928+ nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
5929+
5930+ memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
5931+ ptr += address_length;
5932+ num_addr++;
5933+ }
5934+ msg->num_addr = cpu_to_le16(num_addr);
5935+
5936+ /* And our name */
5937+ strcpy(msgbuf + ptr, nodename);
5938+ ptr += strlen(nodename) + 1;
5939+
5940+ return kcl_sendmsg(mem_socket, msgbuf, ptr,
5941+ addr, addr_len, MSG_NOACK);
5942+}
5943+
5944+static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
5945+{
5946+ struct cl_mem_startack_msg msg;
5947+
5948+ msg.cmd = CLUSTER_MEM_STARTACK;
5949+ msg.generation = cpu_to_le32(cluster_generation);
5950+ msg.node_id = cpu_to_le32(node_id);
5951+ msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
5952+
b7b72b66 5953+ return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, MSG_REPLYEXP);
c1c6733f
AM
5954+}
5955+
5956+static int send_newcluster()
5957+{
c783755a
AM
5958+ char buf[5];
5959+ uint32_t lowip;
c1c6733f
AM
5960+
5961+ buf[0] = CLUSTER_MEM_NEWCLUSTER;
c783755a
AM
5962+ lowip = cpu_to_le32(low32_of_ip());
5963+ memcpy(&buf[1], &lowip, sizeof(lowip));
c1c6733f 5964+
c783755a
AM
5965+ return kcl_sendmsg(mem_socket, buf, sizeof(uint32_t)+1,
5966+ NULL, 0,
c1c6733f
AM
5967+ MSG_NOACK);
5968+}
5969+
5970+static int send_hello()
5971+{
5972+ struct cl_mem_hello_msg hello_msg;
5973+ int status;
5974+
5975+ hello_msg.cmd = CLUSTER_MEM_HELLO;
5976+ hello_msg.members = cpu_to_le16(cluster_members);
b7b72b66 5977+ hello_msg.flags = cluster_is_quorate ? HELLO_FLAG_QUORATE : 0;
c1c6733f
AM
5978+ hello_msg.generation = cpu_to_le32(cluster_generation);
5979+
b7b72b66
AM
5980+ status = kcl_sendmsg(mem_socket, &hello_msg,
5981+ sizeof(struct cl_mem_hello_msg),
5982+ NULL, 0, MSG_NOACK | MSG_ALLINT);
c1c6733f
AM
5983+
5984+ last_hello = jiffies;
5985+
5986+ return status;
5987+}
5988+
5989+/* This is a special HELLO message that requires an ACK. clients in transition
b7b72b66 5990+ * send these to the master to check it is still alive. If it does not ACK then
c1c6733f
AM
5991+ * cnxman will signal it dead and we can restart the transition */
5992+static int send_master_hello()
5993+{
5994+ struct cl_mem_hello_msg hello_msg;
5995+ int status;
5996+ struct sockaddr_cl saddr;
5997+
5998+ hello_msg.cmd = CLUSTER_MEM_HELLO;
5999+ hello_msg.members = cpu_to_le16(cluster_members);
b7b72b66
AM
6000+ hello_msg.flags = HELLO_FLAG_MASTER |
6001+ (cluster_is_quorate ? HELLO_FLAG_QUORATE : 0);
c1c6733f
AM
6002+ hello_msg.generation = cpu_to_le32(cluster_generation);
6003+
6004+ saddr.scl_family = AF_CLUSTER;
6005+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6006+ saddr.scl_nodeid = master_node->node_id;
b7b72b66
AM
6007+
6008+ status = kcl_sendmsg(mem_socket, &hello_msg,
6009+ sizeof(struct cl_mem_hello_msg),
6010+ &saddr, sizeof (saddr), 0);
c1c6733f
AM
6011+
6012+ last_hello = jiffies;
6013+
6014+ return status;
6015+}
6016+
6017+/* Called when the transition timer has expired, meaning we sent a transition
6018+ * message that was not ACKed */
6019+static void trans_timer_expired(unsigned long arg)
6020+{
6021+ P_MEMB("Transition timer fired %ld\n", jiffies);
6022+
6023+ set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
6024+ wake_up_process(membership_task);
6025+}
6026+
6027+static void hello_timer_expired(unsigned long arg)
6028+{
6029+ P_MEMB("Hello timer fired %ld\n", jiffies);
6030+
6031+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
6032+
6033+ if (node_state >= TRANSITION) {
6034+ wake_up_process(hello_task);
6035+ }
6036+}
6037+
6038+static int wait_for_completion_barrier(void)
6039+{
6040+ int status;
6041+ char barriername[MAX_BARRIER_NAME_LEN];
6042+
6043+ sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
6044+
6045+ /* Make sure we all complete together */
6046+ P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
6047+ if ((status =
6048+ kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
6049+ printk(CMAN_NAME ": Error registering barrier: %d\n", status);
6050+ return -1;
6051+ }
6052+ kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
6053+ cman_config.transition_timeout);
6054+ status = kcl_barrier_wait(barriername);
6055+ kcl_barrier_delete(barriername);
6056+
6057+ P_MEMB("Completion barrier reached : status = %d\n", status);
6058+ return status;
6059+}
6060+
6061+/* Called at the end of a state transition when we are the master */
6062+static int end_transition()
6063+{
6064+ struct cl_mem_endtrans_msg msg;
6065+ int total_votes;
6066+ int status;
6067+
6068+ /* Cancel the timer */
6069+ del_timer(&transition_timer);
6070+
6071+ confirm_joiner();
6072+
6073+ quorum = calculate_quorum(leavereason, 0, &total_votes);
6074+
6075+ msg.cmd = CLUSTER_MEM_ENDTRANS;
6076+ msg.quorum = cpu_to_le32(quorum);
6077+ msg.generation = cpu_to_le32(++cluster_generation);
6078+ msg.total_votes = cpu_to_le32(total_votes);
6079+ if (joining_node && transitionreason == TRANS_NEWNODE) {
6080+ msg.new_node_id = cpu_to_le32(joining_node->node_id);
6081+ }
6082+ else {
6083+ msg.new_node_id = 0;
6084+ }
6085+ status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);
6086+
6087+ /* When that's all settled down, do the transition completion barrier */
6088+ kcl_wait_for_all_acks();
6089+
6090+ if (wait_for_completion_barrier() != 0) {
6091+ P_MEMB("Barrier timed out - restart\n");
6092+ start_transition(TRANS_RESTART, us);
6093+ return 0;
6094+ }
6095+
b7b72b66
AM
6096+ joining_temp_nodeid = 0;
6097+ purge_temp_nodeids();
6098+
c1c6733f
AM
6099+ set_quorate(total_votes);
6100+
6101+ notify_listeners();
6102+ reset_hello_time();
6103+
6104+ /* Tell any waiting barriers that we had a transition */
6105+ check_barrier_returns();
6106+
6107+ leavereason = 0;
6108+ node_state = MEMBER;
6109+ transition_end_time = jiffies;
6110+
6111+ sm_member_update(cluster_is_quorate);
6112+
6113+ return 0;
6114+}
6115+
6116+int send_reconfigure(int param, unsigned int value)
6117+{
6118+ char msgbuf[66];
6119+ struct cl_mem_reconfig_msg *msg =
6120+ (struct cl_mem_reconfig_msg *) &msgbuf;
6121+
6122+ if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
6123+ expected_votes = value;
6124+
6125+ msg->cmd = CLUSTER_MEM_RECONFIG;
6126+ msg->param = param;
6127+ msg->value = cpu_to_le32(value);
6128+
6129+ return kcl_sendmsg(mem_socket, &msgbuf, sizeof (*msg), NULL, 0, 0);
6130+}
6131+
6132+static int send_joinack(char *addr, int addr_len, unsigned char acktype)
6133+{
6134+ struct cl_mem_joinack_msg msg;
6135+
6136+ msg.cmd = CLUSTER_MEM_JOINACK;
6137+ msg.acktype = acktype;
6138+
6139+ return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
6140+ (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
6141+}
6142+
6143+/* Only send a leave message to one node in the cluster so that it can master
6144+ * the state transition, otherwise we get a "thundering herd" of potential
6145+ * masters fighting it out */
6146+int send_leave(unsigned char flags)
6147+{
6148+ unsigned char msg[2];
6149+ struct sockaddr_cl saddr;
6150+ struct cluster_node *node = NULL;
6151+ int status;
6152+
6153+ if (!mem_socket)
c783755a 6154+ return 0;
c1c6733f
AM
6155+
6156+ saddr.scl_family = AF_CLUSTER;
6157+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6158+
6159+ /* If we are in transition then use the current master */
6160+ if (node_state == TRANSITION) {
6161+ node = master_node;
6162+ }
6163+ if (!node) {
6164+ /* If we are the master or not in transition then pick a node
6165+ * almost at random */
6166+ struct list_head *nodelist;
6167+
6168+ down(&cluster_members_lock);
6169+ list_for_each(nodelist, &cluster_members_list) {
6170+ node = list_entry(nodelist, struct cluster_node, list);
6171+
6172+ if (node->state == NODESTATE_MEMBER && !node->us)
6173+ break;
6174+ }
6175+ up(&cluster_members_lock);
6176+ }
6177+
6178+ /* we are the only member of the cluster - there is no-one to tell */
6179+ if (node && !node->us) {
6180+ saddr.scl_nodeid = node->node_id;
6181+
6182+ P_MEMB("Sending LEAVE to %s\n", node->name);
6183+ msg[0] = CLUSTER_MEM_LEAVE;
6184+ msg[1] = flags;
b7b72b66
AM
6185+ status = kcl_sendmsg(mem_socket, msg, 2,
6186+ &saddr, sizeof (saddr),
6187+ MSG_NOACK);
c1c6733f
AM
6188+ if (status < 0)
6189+ return status;
6190+ }
6191+
6192+ /* And exit */
6193+ node_state = LEFT_CLUSTER;
6194+ wake_up_process(membership_task);
6195+ return 0;
6196+}
6197+
6198+int send_kill(int nodeid)
6199+{
6200+ char killmsg;
6201+ struct sockaddr_cl saddr;
6202+
6203+ killmsg = CLUSTER_MEM_KILL;
6204+
6205+ saddr.scl_family = AF_CLUSTER;
6206+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6207+ saddr.scl_nodeid = nodeid;
6208+ return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
6209+ sizeof (struct sockaddr_cl), MSG_NOACK);
6210+}
6211+
6212+/* Process a message */
bb1d8b11 6213+static int do_membership_packet(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
6214+{
6215+ int result = -1;
c1c6733f
AM
6216+ struct sockaddr_cl *saddr = msg->msg_name;
6217+ struct cluster_node *node;
6218+
6219+ node = find_node_by_nodeid(saddr->scl_nodeid);
6220+
6221+ P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
6222+ msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
6223+
6224+ switch (*buf) {
6225+ case CLUSTER_MEM_JOINREQ:
bb1d8b11 6226+ result = do_process_joinreq(msg, buf, len);
c1c6733f
AM
6227+ break;
6228+
6229+ case CLUSTER_MEM_LEAVE:
6230+ if (we_are_a_cluster_member)
bb1d8b11 6231+ result = do_process_leave(msg, buf, len);
c1c6733f
AM
6232+ break;
6233+
6234+ case CLUSTER_MEM_HELLO:
bb1d8b11 6235+ result = do_process_hello(msg, buf, len);
c1c6733f
AM
6236+ break;
6237+
6238+ case CLUSTER_MEM_KILL:
6239+ if (we_are_a_cluster_member)
bb1d8b11 6240+ result = do_process_kill(msg, buf, len);
c1c6733f
AM
6241+ break;
6242+
6243+ case CLUSTER_MEM_JOINCONF:
6244+ if (node_state == JOINACK) {
bb1d8b11 6245+ do_process_joinconf(msg, buf, len);
c1c6733f
AM
6246+ }
6247+ break;
6248+
6249+ case CLUSTER_MEM_CONFACK:
6250+ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
6251+ end_transition();
6252+ }
6253+ break;
6254+
6255+ case CLUSTER_MEM_MASTERVIEW:
6256+ if (node_state == TRANSITION)
bb1d8b11 6257+ do_process_masterview(msg, buf, len);
c1c6733f
AM
6258+ break;
6259+
6260+ case CLUSTER_MEM_JOINACK:
b7b72b66
AM
6261+ if (node_state == JOINING || node_state == JOINWAIT ||
6262+ node_state == JOINACK) {
bb1d8b11 6263+ do_process_joinack(msg, buf, len);
c1c6733f
AM
6264+ }
6265+ break;
6266+ case CLUSTER_MEM_RECONFIG:
6267+ if (we_are_a_cluster_member) {
bb1d8b11 6268+ do_process_reconfig(msg, buf, len);
c1c6733f
AM
6269+ }
6270+ break;
6271+
6272+ case CLUSTER_MEM_STARTTRANS:
bb1d8b11 6273+ result = do_process_starttrans(msg, buf, len);
c1c6733f
AM
6274+ break;
6275+
6276+ case CLUSTER_MEM_ENDTRANS:
bb1d8b11 6277+ result = do_process_endtrans(msg, buf, len);
c1c6733f
AM
6278+ break;
6279+
6280+ case CLUSTER_MEM_VIEWACK:
b7b72b66 6281+ if (node_state == MASTER && master_state == MASTER_COLLECT)
bb1d8b11 6282+ result = do_process_viewack(msg, buf, len);
c1c6733f
AM
6283+ break;
6284+
6285+ case CLUSTER_MEM_STARTACK:
6286+ if (node_state == MASTER)
bb1d8b11 6287+ result = do_process_startack(msg, buf, len);
c1c6733f
AM
6288+ break;
6289+
6290+ case CLUSTER_MEM_NEWCLUSTER:
bb1d8b11 6291+ result = do_process_newcluster(msg, buf, len);
c1c6733f
AM
6292+ break;
6293+
6294+ case CLUSTER_MEM_NOMINATE:
6295+ if (node_state != MASTER)
bb1d8b11 6296+ result = do_process_nominate(msg, buf, len);
c1c6733f
AM
6297+ break;
6298+
6299+ default:
6300+ printk(KERN_ERR CMAN_NAME
bb1d8b11
AM
6301+ ": Unknown membership services message %d received from node %d port %d\n",
6302+ *buf, saddr->scl_nodeid, saddr->scl_port);
c1c6733f
AM
6303+ break;
6304+
6305+ }
6306+ return result;
6307+}
6308+
6309+/* Returns -ve to reject membership of the cluster 0 to accept membership +ve
6310+ * to ignore request (node already joining) */
6311+static int check_duplicate_node(char *name, struct msghdr *msg, int len)
6312+{
6313+ struct cluster_node *node;
6314+ struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
6315+ char addr[address_length];
6316+ int addrlen;
6317+
6318+ if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
6319+ return -3;
6320+
6321+ /* See if we already have a cluster member with that name... */
6322+ node = find_node_by_name(name);
6323+ if (node && node->state != NODESTATE_DEAD) {
6324+
c783755a 6325+ if (node->state == NODESTATE_JOINING)
c1c6733f
AM
6326+ return +1;
6327+
6328+ printk(KERN_WARNING CMAN_NAME
6329+ ": Rejecting cluster membership application from %s - already have a node with that name\n",
6330+ name);
6331+ return -1;
6332+
6333+ }
6334+
6335+ /* Need to check the node's address too */
6336+ if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
6337+ (node = find_node_by_addr(addr, addrlen)) &&
6338+ node->state != NODESTATE_DEAD) {
6339+
c783755a 6340+ if (node->state == NODESTATE_JOINING)
c1c6733f
AM
6341+ return +1;
6342+
6343+ printk(KERN_WARNING CMAN_NAME
6344+ ": Rejecting cluster membership application from %s - already have a node with that address\n",
6345+ name);
6346+ return -1;
6347+ }
6348+ return 0;
6349+}
6350+
6351+/* Start the state transition */
6352+static int start_transition(unsigned char reason, struct cluster_node *node)
6353+{
6354+ char *startbuf = scratchbuf;
6355+ struct cl_mem_starttrans_msg *msg =
6356+ (struct cl_mem_starttrans_msg *) startbuf;
6357+
6358+ P_MEMB("Start transition - reason = %d\n", reason);
6359+
6360+ /* If this is a restart then zero the counters */
6361+ if (reason == TRANS_RESTART) {
6362+ agreeing_nodes = 0;
6363+ dissenting_nodes = 0;
6364+ if (node_opinion) {
6365+ kfree(node_opinion);
6366+ node_opinion = NULL;
6367+ }
6368+ responses_collected = 0;
6369+ }
6370+
6371+ /* If we have timed out too many times then just die */
6372+ if (reason == TRANS_RESTART
6373+ && ++transition_restarts > cman_config.transition_restarts) {
6374+ printk(KERN_WARNING CMAN_NAME
6375+ ": too many transition restarts - will die\n");
c783755a 6376+ us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
c1c6733f
AM
6377+ node_state = LEFT_CLUSTER;
6378+ quit_threads = 1;
6379+ wake_up_process(membership_task);
6380+ wake_up_interruptible(&cnxman_waitq);
6381+ return 0;
6382+ }
6383+ if (reason != TRANS_RESTART)
6384+ transition_restarts = 0;
6385+
6386+ /* Only keep the original state transition reason in the global
6387+ * variable. */
6388+ if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
6389+ reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
6390+ transitionreason = reason;
6391+
6392+ /* Save the info of the requesting node */
6393+ if (reason == TRANS_NEWNODE)
6394+ joining_node = node;
6395+
6396+ node_state = MASTER;
6397+ master_state = MASTER_START;
6398+ responses_collected = 0;
6399+ responses_expected = cluster_members - 1;
6400+
6401+ /* If we are on our own then just do it */
6402+ if (responses_expected == 0) {
6403+ P_MEMB("We are on our own...lonely here\n");
6404+ responses_collected--;
bb1d8b11 6405+ do_process_startack(NULL, NULL, 0);
c1c6733f
AM
6406+ }
6407+ else {
6408+ int ptr = sizeof (struct cl_mem_starttrans_msg);
6409+ struct list_head *addrlist;
6410+ unsigned short num_addrs = 0;
b7b72b66 6411+ int flags = MSG_REPLYEXP;
c1c6733f
AM
6412+
6413+ /* Send the STARTTRANS message */
6414+ msg->cmd = CLUSTER_MEM_STARTTRANS;
6415+ msg->reason = reason;
6416+ msg->votes = node->votes;
6417+ msg->expected_votes = cpu_to_le32(node->expected_votes);
6418+ msg->generation = cpu_to_le32(++cluster_generation);
6419+ msg->nodeid = cpu_to_le32(node->node_id);
6420+
6421+ if (reason == TRANS_NEWNODE) {
6422+ /* Add the addresses */
6423+ list_for_each(addrlist, &node->addr_list) {
6424+ struct cluster_node_addr *nodeaddr =
6425+ list_entry(addrlist,
6426+ struct cluster_node_addr, list);
6427+
6428+ memcpy(startbuf + ptr, nodeaddr->addr,
6429+ address_length);
6430+ ptr += address_length;
6431+ num_addrs++;
6432+ }
6433+
6434+ /* And the name */
6435+ strcpy(startbuf + ptr, node->name);
6436+ ptr += strlen(node->name) + 1;
6437+ }
6438+
6439+ /* If another node died then we must queue the STARTTRANS
6440+ * messages so that membershipd can carry on processing the
6441+ * other replies */
6442+ if (reason == TRANS_ANOTHERREMNODE)
6443+ flags |= MSG_QUEUE;
6444+
6445+ msg->num_addrs = cpu_to_le16(num_addrs);
6446+ kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
6447+ }
6448+ /* Set a timer in case we don't get 'em all back */
6449+ mod_timer(&transition_timer,
6450+ jiffies + cman_config.transition_timeout * HZ);
6451+ return 0;
6452+}
6453+
6454+/* A node has died - decide what to do */
6455+void a_node_just_died(struct cluster_node *node)
6456+{
6457+ /* If we are not in the context of kmembershipd then stick it on the
6458+ * list and wake it */
6459+ if (current != membership_task) {
6460+ struct cl_new_dead_node *newnode =
6461+ kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
6462+ if (!newnode)
6463+ return;
6464+ newnode->node = node;
6465+ down(&new_dead_node_lock);
6466+ list_add_tail(&newnode->list, &new_dead_node_list);
6467+ set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
6468+ up(&new_dead_node_lock);
6469+ wake_up_process(membership_task);
6470+ P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
6471+ return;
6472+ }
6473+
6474+ /* Remove it */
6475+ down(&cluster_members_lock);
6476+ if (node->state == NODESTATE_MEMBER)
6477+ cluster_members--;
6478+ node->state = NODESTATE_DEAD;
6479+ up(&cluster_members_lock);
6480+
6481+ /* Notify listeners */
6482+ notify_kernel_listeners(DIED, (long) node->node_id);
6483+
6484+ /* If we are in normal operation then become master and initiate a
6485+ * state-transition */
6486+ if (node_state == MEMBER) {
6487+ start_transition(TRANS_REMNODE, node);
6488+ return;
6489+ }
6490+
6491+ /* If we are a slave in transition then see if it's the master that has
6492+ * failed. If not then ignore it. If it /is/ the master then elect a
6493+ * new one */
6494+ if (node_state == TRANSITION) {
6495+ if (master_node == node) {
6496+ if (elect_master(&node)) {
6497+ del_timer(&transition_timer);
6498+ node_state = MASTER;
6499+
6500+ start_transition(TRANS_DEADMASTER, master_node);
6501+ }
6502+ else {
6503+ /* Someone else can be in charge - phew! */
6504+ }
6505+ }
6506+ return;
6507+ }
6508+
6509+ /* If we are the master then we need to start the transition all over
6510+ * again */
6511+ if (node_state == MASTER) {
6512+ /* Cancel timer */
6513+ del_timer(&transition_timer);
6514+
6515+ /* Restart the transition */
6516+ start_transition(TRANS_ANOTHERREMNODE, node);
6517+ transition_restarts = 0;
6518+ return;
6519+ }
6520+}
6521+
6522+/*
6523+ * Build up and send a set of messages consisting of the whole cluster view.
6524+ * The first byte is the command (cmd as passed in), the second is a flag byte:
6525+ * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set if
6526+ * this is the only message sent The rest is a set of packed node entries, which
6527+ * are NOT split over packets. */
6528+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
b7b72b66 6529+ unsigned int flags, unsigned int flags2)
c1c6733f
AM
6530+{
6531+ int ptr = 2;
6532+ int len;
6533+ int status = 0;
6534+ int last_node_start = 2;
6535+ unsigned char first_packet_flag = 1;
6536+ struct list_head *nodelist;
6537+ struct list_head *temp;
6538+ struct cluster_node *node;
6539+ char *message = scratchbuf;
6540+
6541+ message[0] = cmd;
6542+
6543+ down(&cluster_members_lock);
6544+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
6545+ node = list_entry(nodelist, struct cluster_node, list);
6546+
c783755a 6547+ if (node->state == NODESTATE_MEMBER || node->state == NODESTATE_DEAD) {
c1c6733f
AM
6548+ unsigned int evotes;
6549+ unsigned int node_id;
6550+ unsigned short num_addrs = 0;
6551+ unsigned short num_addrs_le;
6552+ struct list_head *addrlist;
6553+
6554+ last_node_start = ptr;
6555+
6556+ message[ptr++] = len = strlen(node->name);
6557+ strcpy(&message[ptr], node->name);
6558+ ptr += len;
6559+
c783755a
AM
6560+ message[ptr++] = node->state;
6561+
c1c6733f
AM
6562+ /* Count the number of addresses this node has */
6563+ list_for_each(addrlist, &node->addr_list) {
6564+ num_addrs++;
6565+ }
6566+
6567+ num_addrs_le = cpu_to_le16(num_addrs);
6568+ memcpy(&message[ptr], &num_addrs_le, sizeof (short));
6569+ ptr += sizeof (short);
6570+
6571+ /* Pack em in */
6572+ list_for_each(addrlist, &node->addr_list) {
6573+
6574+ struct cluster_node_addr *nodeaddr =
c783755a
AM
6575+ list_entry(addrlist,
6576+ struct cluster_node_addr, list);
c1c6733f
AM
6577+
6578+ memcpy(&message[ptr], nodeaddr->addr,
6579+ address_length);
6580+ ptr += address_length;
6581+ }
6582+
6583+ message[ptr++] = node->votes;
6584+
6585+ evotes = cpu_to_le32(node->expected_votes);
6586+ memcpy(&message[ptr], &evotes, sizeof (int));
6587+ ptr += sizeof (int);
6588+
6589+ node_id = cpu_to_le32(node->node_id);
6590+ memcpy(&message[ptr], &node_id, sizeof (int));
6591+ ptr += sizeof (int);
6592+
6593+ /* If the block is full then send it */
6594+ if (ptr > MAX_CLUSTER_MESSAGE) {
6595+ message[1] = first_packet_flag;
6596+
6597+ up(&cluster_members_lock);
c783755a
AM
6598+ status = kcl_sendmsg(mem_socket, message,
6599+ last_node_start, saddr,
6600+ saddr ? sizeof (struct sockaddr_cl) : 0,
6601+ flags);
c1c6733f
AM
6602+
6603+ if (status < 0)
6604+ goto send_fail;
6605+
6606+ down(&cluster_members_lock);
6607+
6608+ first_packet_flag = 0;
6609+ /* Copy the overflow back to the start of the
6610+ * buffer for the next send */
6611+ memcpy(&message[2], &message[last_node_start],
6612+ ptr - last_node_start);
6613+ ptr = ptr - last_node_start + 2;
6614+ }
6615+ }
6616+ }
6617+
6618+ up(&cluster_members_lock);
6619+
6620+ message[1] = first_packet_flag | 2; /* The last may also be first */
6621+ status = kcl_sendmsg(mem_socket, message, ptr,
6622+ saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
b7b72b66 6623+ flags | flags2);
c1c6733f
AM
6624+ send_fail:
6625+
6626+ return status;
6627+}
6628+
6629+/* Make the JOINING node into a MEMBER */
6630+static void confirm_joiner()
6631+{
6632+ if (joining_node && joining_node->state == NODESTATE_JOINING) {
6633+ down(&cluster_members_lock);
6634+ joining_node->state = NODESTATE_MEMBER;
6635+ cluster_members++;
6636+ up(&cluster_members_lock);
6637+ }
c1c6733f
AM
6638+}
6639+
6640+/* Reset HELLO timers for all nodes We do this after a state-transition as we
6641+ * have had HELLOS disabled during the transition and if we don't do this the
6642+ * nodes will go on an uncontrolled culling-spree afterwards */
6643+static void reset_hello_time()
6644+{
6645+ struct list_head *nodelist;
6646+ struct cluster_node *node;
6647+
6648+ down(&cluster_members_lock);
6649+ list_for_each(nodelist, &cluster_members_list) {
6650+ node = list_entry(nodelist, struct cluster_node, list);
6651+
6652+ if (node->state == NODESTATE_MEMBER) {
6653+ node->last_hello = jiffies;
6654+ }
6655+
6656+ }
6657+ up(&cluster_members_lock);
6658+}
6659+
6660+/* Calculate the new quorum and return the value. do *not* set it in here as
6661+ * cnxman calls this to check if a new expected_votes value is valid. It
6662+ * (optionally) returns the total number of votes in the cluster */
6663+int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
6664+{
6665+ struct list_head *nodelist;
6666+ struct cluster_node *node;
6667+ unsigned int total_votes = 0;
6668+ unsigned int highest_expected = 0;
6669+ unsigned int newquorum, q1, q2;
6670+
6671+ down(&cluster_members_lock);
6672+ list_for_each(nodelist, &cluster_members_list) {
6673+ node = list_entry(nodelist, struct cluster_node, list);
6674+
6675+ if (node->state == NODESTATE_MEMBER) {
6676+ highest_expected =
6677+ max(highest_expected, node->expected_votes);
6678+ total_votes += node->votes;
6679+ }
6680+ }
6681+ up(&cluster_members_lock);
6682+ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
6683+ total_votes += quorum_device->votes;
6684+
6685+ if (max_expected > 0)
6686+ highest_expected = max_expected;
6687+
6688+ /* This quorum calculation is taken from the OpenVMS Cluster Systems
6689+ * manual, but, then, you guessed that didn't you */
6690+ q1 = (highest_expected + 2) / 2;
6691+ q2 = (total_votes + 2) / 2;
6692+ newquorum = max(q1, q2);
6693+
6694+ /* Normally quorum never decreases but the system administrator can
6695+ * force it down by setting expected votes to a maximum value */
6696+ if (!allow_decrease)
6697+ newquorum = max(quorum, newquorum);
6698+
6699+ /* The special two_node mode allows each of the two nodes to retain
6700+ * quorum if the other fails. Only one of the two should live past
6701+ * fencing (as both nodes try to fence each other in split-brain.) */
6702+ if (two_node)
6703+ newquorum = 1;
6704+
6705+ if (ret_total_votes)
6706+ *ret_total_votes = total_votes;
6707+ return newquorum;
6708+}
6709+
6710+/* Recalculate cluster quorum, set quorate and notify changes */
6711+void recalculate_quorum(int allow_decrease)
6712+{
6713+ int total_votes;
6714+
6715+ quorum = calculate_quorum(allow_decrease, 0, &total_votes);
6716+ set_quorate(total_votes);
6717+ notify_listeners();
6718+}
6719+
6720+/* Add new node address to an existing node */
6721+int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
6722+{
6723+ struct cluster_node_addr *newaddr;
6724+
6725+ newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
6726+ if (!newaddr)
6727+ return -1;
6728+
6729+ memcpy(newaddr->addr, addr, len);
6730+ newaddr->addr_len = len;
6731+ list_add_tail(&newaddr->list, &node->addr_list);
6732+
6733+ return 0;
6734+}
6735+
6736+static struct cluster_node *add_new_node(char *name, unsigned char votes,
6737+ unsigned int expected_votes,
6738+ int node_id, int state)
6739+{
6740+ struct cluster_node *newnode;
6741+
6742+ /* Look for a dead node with this name */
6743+ newnode = find_node_by_name(name);
6744+
6745+ /* Is it already joining */
6746+ if (newnode && newnode->state == NODESTATE_JOINING)
6747+ return NULL;
6748+
6749+ /* Update existing information */
6750+ if (newnode && newnode->state == NODESTATE_DEAD) {
6751+ newnode->last_hello = jiffies;
6752+ newnode->votes = votes;
6753+ newnode->expected_votes = expected_votes;
6754+ newnode->state = state;
6755+ newnode->us = 0;
6756+ newnode->leave_reason = 0;
6757+ newnode->last_seq_recv = 0;
6758+ newnode->last_seq_acked = 0;
6759+ newnode->last_seq_sent = 0;
6760+ newnode->incarnation++;
c783755a 6761+ do_gettimeofday(&newnode->join_time);
c1c6733f
AM
6762+ /* Don't overwrite the node ID */
6763+
6764+ if (state == NODESTATE_MEMBER) {
6765+ down(&cluster_members_lock);
6766+ cluster_members++;
6767+ up(&cluster_members_lock);
6768+ }
6769+
6770+ printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
6771+ return newnode;
6772+ }
6773+
6774+ newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
6775+ if (!newnode)
6776+ goto alloc_err;
6777+
6778+ memset(newnode, 0, sizeof (struct cluster_node));
6779+ newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
6780+ if (!newnode->name)
6781+ goto alloc_err1;
6782+
6783+ strcpy(newnode->name, name);
6784+ newnode->last_hello = jiffies;
6785+ newnode->votes = votes;
6786+ newnode->expected_votes = expected_votes;
6787+ newnode->state = state;
6788+ newnode->node_id = node_id;
6789+ newnode->us = 0;
6790+ newnode->leave_reason = 0;
6791+ newnode->last_seq_recv = 0;
6792+ newnode->last_seq_acked = 0;
6793+ newnode->last_seq_sent = 0;
6794+ newnode->incarnation = 0;
c783755a 6795+ do_gettimeofday(&newnode->join_time);
c1c6733f
AM
6796+ INIT_LIST_HEAD(&newnode->addr_list);
6797+ set_nodeid(newnode, node_id);
6798+
6799+ /* Add the new node to the list */
6800+ down(&cluster_members_lock);
6801+ list_add(&newnode->list, &cluster_members_list);
6802+ if (state == NODESTATE_MEMBER)
6803+ cluster_members++;
6804+ up(&cluster_members_lock);
6805+
6806+ printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
6807+ return newnode;
6808+
6809+ alloc_err1:
6810+ kfree(newnode);
6811+ alloc_err:
6812+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
6813+
6814+ printk(KERN_CRIT CMAN_NAME
6815+ ": Cannot allocate memory for new cluster node %s\n", name);
6816+
6817+ panic("cluster memory allocation failed");
6818+
6819+ return NULL;
6820+}
6821+
6822+/* Remove node from a STARTTRANS message */
6823+static struct cluster_node *remove_node(int nodeid)
6824+{
6825+ struct cluster_node *node = find_node_by_nodeid(nodeid);
6826+
6827+ if (node && node->state == NODESTATE_MEMBER) {
6828+ P_MEMB("starttrans removes node %s\n", node->name);
6829+ down(&cluster_members_lock);
6830+ node->state = NODESTATE_DEAD;
6831+ cluster_members--;
6832+ up(&cluster_members_lock);
6833+
6834+ notify_kernel_listeners(DIED, (long) nodeid);
6835+
6836+ /* If this node is us then go quietly */
6837+ if (node->us) {
6838+ printk(KERN_INFO CMAN_NAME
6839+ ": killed by STARTTRANS or NOMINATE\n");
c783755a 6840+ node_state = LEFT_CLUSTER;
c1c6733f
AM
6841+ quit_threads = 1;
6842+ wake_up_process(membership_task);
6843+ wake_up_interruptible(&cnxman_waitq);
6844+ }
6845+ }
6846+ return node;
6847+}
6848+
6849+/* Add a node from a STARTTRANS or NOMINATE message */
bb1d8b11 6850+static void add_node_from_starttrans(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
6851+{
6852+ /* Add the new node but don't fill in the ID until the master has
6853+ * confirmed it */
6854+ struct cl_mem_starttrans_msg *startmsg =
bb1d8b11 6855+ (struct cl_mem_starttrans_msg *)buf;
c1c6733f 6856+ int ptr = sizeof (struct cl_mem_starttrans_msg);
c1c6733f 6857+ int i;
bb1d8b11
AM
6858+ char *name = buf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
6859+ char *nodeaddr = buf + sizeof(struct cl_mem_starttrans_msg);
c1c6733f
AM
6860+
6861+ joining_node = add_new_node(name, startmsg->votes,
6862+ le32_to_cpu(startmsg->expected_votes),
6863+ 0, NODESTATE_JOINING);
6864+
6865+ /* add_new_node returns NULL if the node already exists */
6866+ if (!joining_node)
6867+ joining_node = find_node_by_name(name);
6868+
6869+ /* Add the node's addresses */
6870+ if (list_empty(&joining_node->addr_list)) {
6871+ for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
bb1d8b11 6872+ add_node_address(joining_node, buf + ptr, address_length);
c1c6733f
AM
6873+ ptr += address_length;
6874+ }
6875+ }
b7b72b66
AM
6876+
6877+ /* Make sure we have a temp nodeid for the new node in case we
6878+ become master */
6879+ joining_temp_nodeid = new_temp_nodeid(nodeaddr,
6880+ address_length);
c1c6733f
AM
6881+}
6882+
6883+/* We have been nominated as master for a transition */
bb1d8b11 6884+static int do_process_nominate(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
6885+{
6886+ struct cl_mem_starttrans_msg *startmsg =
bb1d8b11 6887+ (struct cl_mem_starttrans_msg *)buf;
c1c6733f 6888+ struct cluster_node *node = NULL;
c1c6733f
AM
6889+
6890+ P_MEMB("nominate reason is %d\n", startmsg->reason);
6891+
6892+ if (startmsg->reason == TRANS_REMNODE) {
6893+ node = remove_node(le32_to_cpu(startmsg->nodeid));
6894+ }
6895+
6896+ if (startmsg->reason == TRANS_NEWNODE) {
bb1d8b11 6897+ add_node_from_starttrans(msg, buf, len);
c1c6733f 6898+ node = joining_node;
c1c6733f
AM
6899+ }
6900+
6901+ /* This should be a TRANS_CHECK but start_transition needs some node
6902+ * info */
6903+ if (node == NULL)
6904+ node = us;
6905+ start_transition(startmsg->reason, node);
6906+ return 0;
6907+}
6908+
6909+/* Got a STARTACK response from a node */
bb1d8b11 6910+static int do_process_startack(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
6911+{
6912+ if (node_state != MASTER && master_state != MASTER_START) {
6913+ P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
6914+ return 0;
6915+ }
6916+
bb1d8b11
AM
6917+ /* buf is NULL if we are called directly from start_transition */
6918+ if (buf) {
6919+ struct cl_mem_startack_msg *ackmsg =
6920+ (struct cl_mem_startack_msg *)buf;
c1c6733f
AM
6921+
6922+ /* Ignore any messages wil old generation numbers in them */
6923+ if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
6924+ P_MEMB("Got old generation START-ACK msg - ignoring\n");
6925+ return 0;
6926+ }
6927+ }
6928+
6929+ /* If the node_id is non-zero then use it. */
6930+ if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
bb1d8b11
AM
6931+ struct cl_mem_startack_msg *ackmsg =
6932+ (struct cl_mem_startack_msg *)buf;
c1c6733f
AM
6933+
6934+ if (ackmsg->node_id) {
6935+ set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
6936+ }
6937+ highest_nodeid =
6938+ max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
6939+ P_MEMB("Node id = %d, highest node id = %d\n",
6940+ le32_to_cpu(ackmsg->node_id),
6941+ le32_to_cpu(ackmsg->highest_node_id));
6942+ }
6943+
6944+ /* If we have all the responses in then move to the next stage */
6945+ if (++responses_collected == responses_expected) {
6946+
6947+ /* If the new node has no node_id (ie nobody in the cluster has
6948+ * heard of it before) then assign it a new one */
6949+ if (transitionreason == TRANS_NEWNODE && joining_node) {
6950+ highest_nodeid =
6951+ max(highest_nodeid, get_highest_nodeid());
6952+ if (joining_node->node_id == 0) {
6953+ set_nodeid(joining_node, ++highest_nodeid);
6954+ }
6955+ P_MEMB("nodeIDs: new node: %d, highest: %d\n",
6956+ joining_node->node_id, highest_nodeid);
6957+ }
6958+
6959+ /* Behave a little differently if we are on our own */
6960+ if (cluster_members == 1) {
6961+ if (transitionreason == TRANS_NEWNODE) {
6962+ /* If the cluster is just us then confirm at
6963+ * once */
6964+ joinconf_count = 0;
6965+ mod_timer(&transition_timer,
6966+ jiffies +
6967+ cman_config.joinconf_timeout * HZ);
6968+ send_joinconf();
6969+ return 0;
6970+ }
6971+ else { /* Node leaving the cluster */
6972+ recalculate_quorum(leavereason);
6973+ leavereason = 0;
6974+ node_state = MEMBER;
6975+ }
6976+ }
6977+ else {
6978+ master_state = MASTER_COLLECT;
6979+ responses_collected = 0;
6980+ responses_expected = cluster_members - 1;
6981+ P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
6982+ responses_expected);
6983+
b7b72b66 6984+ send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0, MSG_REPLYEXP);
c1c6733f
AM
6985+
6986+ /* Set a timer in case we don't get 'em all back */
6987+ mod_timer(&transition_timer,
6988+ jiffies +
6989+ cman_config.transition_timeout * HZ);
6990+ }
6991+ }
6992+ return 0;
6993+}
6994+
6995+/* Got a VIEWACK response from a node */
bb1d8b11 6996+static int do_process_viewack(struct msghdr *msg, char *reply, int len)
c1c6733f 6997+{
c1c6733f
AM
6998+ struct sockaddr_cl *saddr = msg->msg_name;
6999+
c1c6733f
AM
7000+ if (node_opinion == NULL) {
7001+ node_opinion =
7002+ kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
7003+ if (!node_opinion) {
7004+ panic(": malloc agree/dissent failed\n");
7005+ }
7006+ memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
7007+ }
7008+
7009+ /* Keep a list of agreeing and dissenting nodes */
7010+ if (reply[1] == 1) {
7011+ /* ACK - remote node agrees with me */
7012+ P_MEMB("Node agrees\n");
7013+ node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
7014+ agreeing_nodes++;
7015+ }
7016+ else {
7017+ /* Remote node disagrees */
7018+ P_MEMB("Node disagrees\n");
7019+ node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
7020+ dissenting_nodes++;
7021+ }
7022+
7023+ P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
7024+ responses_expected);
7025+
7026+ /* Are all the results in yet ? */
7027+ if (++responses_collected == responses_expected) {
7028+ del_timer(&transition_timer);
7029+
7030+ P_MEMB("The results are in: %d agree, %d dissent\n",
7031+ agreeing_nodes, dissenting_nodes);
7032+
7033+ if (agreeing_nodes > dissenting_nodes) {
7034+ /* Kill dissenting nodes */
7035+ int i;
7036+
7037+ for (i = 1; i <= responses_collected; i++) {
7038+ if (node_opinion[i] == OPINION_DISAGREE)
7039+ send_kill(i);
7040+ }
7041+ }
7042+ else {
7043+ /* We must leave the cluster as we are in a minority,
7044+ * the rest of them can fight it out amongst
7045+ * themselves. */
c783755a 7046+ us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
c1c6733f
AM
7047+ agreeing_nodes = 0;
7048+ dissenting_nodes = 0;
7049+ kfree(node_opinion);
7050+ node_opinion = NULL;
7051+ node_state = LEFT_CLUSTER;
7052+ quit_threads = 1;
7053+ wake_up_process(membership_task);
7054+ wake_up_interruptible(&cnxman_waitq);
7055+ return -1;
7056+ }
7057+
7058+ /* Reset counters */
7059+ agreeing_nodes = 0;
7060+ dissenting_nodes = 0;
7061+ kfree(node_opinion);
7062+ node_opinion = NULL;
7063+
7064+ /* Confirm new node */
7065+ if (transitionreason == TRANS_NEWNODE) {
7066+ mod_timer(&transition_timer,
7067+ jiffies + cman_config.joinconf_timeout * HZ);
7068+ joinconf_count = 0;
7069+ send_joinconf();
7070+ return 0;
7071+ }
7072+
7073+ master_state = MASTER_COMPLETE;
7074+
7075+ end_transition();
7076+ }
7077+
7078+ return 0;
7079+}
7080+
7081+/* Got an ENDTRANS message */
bb1d8b11 7082+static int do_process_endtrans(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7083+{
7084+ struct cl_mem_endtrans_msg *endmsg =
bb1d8b11 7085+ (struct cl_mem_endtrans_msg *)buf;
c1c6733f
AM
7086+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
7087+
7088+ /* Someone else's state transition */
7089+ if (node_state != TRANSITION && node_state != JOINACK)
7090+ return 0;
7091+
7092+ /* Check we got it from the MASTER node */
7093+ if (master_node && master_node->node_id != saddr->scl_nodeid) {
7094+ printk(KERN_INFO
7095+ "Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
7096+ master_node->node_id, saddr->scl_nodeid);
7097+ return 0;
7098+ }
7099+
7100+ del_timer(&transition_timer);
7101+
7102+ /* Set node ID on new node */
7103+ if (endmsg->new_node_id) {
7104+ set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
7105+ P_MEMB("new node %s has ID %d\n", joining_node->name,
7106+ joining_node->node_id);
7107+ }
7108+
7109+ node_state = TRANSITION_COMPLETE;
7110+
7111+ /* Need to set this here or the barrier code will reject us if we've
7112+ * just joined */
7113+ we_are_a_cluster_member = TRUE;
7114+
7115+ confirm_joiner();
7116+ cluster_generation = le32_to_cpu(endmsg->generation);
7117+
7118+ if (wait_for_completion_barrier() != 0) {
7119+ P_MEMB("Barrier timed out - restart\n");
7120+ node_state = TRANSITION;
7121+ mod_timer(&transition_timer,
7122+ jiffies + cman_config.transition_timeout * HZ);
7123+ return 0;
7124+ }
7125+
7126+ quorum = le32_to_cpu(endmsg->quorum);
7127+ set_quorate(le32_to_cpu(endmsg->total_votes));
c783755a 7128+ highest_nodeid = get_highest_nodeid();
c1c6733f
AM
7129+
7130+ /* Tell any waiting barriers that we had a transition */
7131+ check_barrier_returns();
7132+
b7b72b66
AM
7133+ purge_temp_nodeids();
7134+
c1c6733f
AM
7135+ /* Clear the master node */
7136+ master_node = NULL;
7137+
7138+ node_state = MEMBER;
7139+
7140+ /* Notify other listeners that transition has completed */
7141+ notify_listeners();
7142+ reset_hello_time();
7143+ transition_end_time = jiffies;
7144+
7145+ sm_member_update(cluster_is_quorate);
7146+ return 0;
7147+}
7148+
7149+/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
7150+static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
7151+ int nodeid)
7152+{
7153+ struct sockaddr_cl maddr;
7154+
7155+ maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
7156+ maddr.scl_family = AF_CLUSTER;
7157+ maddr.scl_nodeid = nodeid;
7158+
7159+ startmsg->cmd = CLUSTER_MEM_NOMINATE;
7160+ return kcl_sendmsg(mem_socket, startmsg, msglen,
7161+ &maddr, sizeof (maddr), 0);
7162+}
7163+
7164+/* Got a STARTTRANS message */
bb1d8b11 7165+static int do_process_starttrans(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7166+{
7167+ struct cl_mem_starttrans_msg *startmsg =
bb1d8b11 7168+ (struct cl_mem_starttrans_msg *)buf;
c1c6733f
AM
7169+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
7170+ struct cluster_node *node;
7171+ unsigned int newgen = le32_to_cpu(startmsg->generation);
7172+
7173+ /* Got a WHAT from WHOM? */
7174+ node = find_node_by_nodeid(saddr->scl_nodeid);
7175+ if (!node || node->state != NODESTATE_MEMBER)
7176+ return 0;
7177+
7178+ /* Someone else's state transition */
7179+ if (node_state != MEMBER &&
7180+ node_state != TRANSITION && node_state != MASTER)
7181+ return 0;
7182+
7183+ /* Ignore old generation STARTTRANS messages */
7184+ if ((newgen < cluster_generation) ||
7185+ (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
7186+ P_MEMB("Ignoring STARTTRANS with old generation number\n");
7187+ return 0;
7188+ }
7189+
7190+ P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
7191+ newgen, cluster_generation, startmsg->reason);
7192+
7193+ /* Up the generation number */
7194+ cluster_generation = newgen;
7195+
7196+ /* If we are also a master then decide between us */
7197+ if (node_state == MASTER) {
7198+
7199+ /* See if we really want the responsibility of being master */
7200+ if (elect_master(&node)) {
7201+
7202+ /* I reluctantly accept this position of responsibility
7203+ */
7204+ P_MEMB("I elected myself master\n");
7205+
7206+ /* start_transition will re-establish this */
7207+ del_timer(&transition_timer);
7208+
7209+ start_transition(TRANS_NEWMASTER, node);
7210+ return 0;
7211+ }
7212+ else {
7213+ /* Back down */
7214+ P_MEMB("Backing down from MASTER status\n");
7215+ master_node = node;
7216+ node_state = MEMBER;
7217+
7218+ /* If we were bringing a new node into the cluster then
7219+ * we will have to abandon that now and tell the new
7220+ * node to try again later */
7221+ if (transitionreason == TRANS_NEWNODE && joining_node) {
7222+ struct cluster_node_addr *first_addr =
7223+ (struct cluster_node_addr *) joining_node->
7224+ addr_list.next;
7225+
7226+ P_MEMB("Postponing membership of node %s\n",
7227+ joining_node->name);
7228+ send_joinack(first_addr->addr, address_length,
7229+ JOINACK_TYPE_WAIT);
7230+
7231+ /* Not dead, just sleeping */
7232+ joining_node->state = NODESTATE_DEAD;
7233+ joining_node = NULL;
7234+ }
7235+
7236+ /* If the new master is not us OR the node we just got
7237+ * the STARTTRANS from then make sure it knows it has
7238+ * to be master */
7239+ if (saddr->scl_nodeid != node->node_id) {
7240+ send_nominate(startmsg, len, node->node_id);
7241+ return 0;
7242+ }
7243+
7244+ /* Fall through into MEMBER code below if we are
7245+ * obeying the STARTTRANS we just received */
7246+ }
7247+ }
7248+
7249+ /* Do non-MASTER STARTTRANS bits */
7250+ if (node_state == MEMBER) {
7251+ int ptr = sizeof (struct cl_mem_starttrans_msg);
7252+ int node_id = 0;
7253+
7254+ P_MEMB("Normal transition start\n");
7255+
7256+ /* If the master is adding a new node and we know it's node ID
7257+ * then ACK with it. */
7258+ if (startmsg->reason == TRANS_NEWNODE) {
7259+ struct cluster_node *node =
7260+ find_node_by_addr((char *) startmsg + ptr,
7261+ address_length);
7262+ if (node)
7263+ node_id = node->node_id;
7264+ }
7265+
7266+ /* Save the master info */
7267+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7268+ node_state = TRANSITION;
7269+
7270+ if (startmsg->reason == TRANS_NEWNODE) {
bb1d8b11 7271+ add_node_from_starttrans(msg, buf, len);
c1c6733f
AM
7272+ }
7273+
7274+ if (startmsg->reason == TRANS_REMNODE ||
7275+ startmsg->reason == TRANS_ANOTHERREMNODE) {
7276+ remove_node(le32_to_cpu(startmsg->nodeid));
7277+ }
7278+
7279+ send_startack(saddr, msg->msg_namelen,
7280+ node_id);
7281+
7282+ /* Establish timer in case the master dies */
7283+ mod_timer(&transition_timer,
7284+ jiffies + cman_config.transition_timeout * HZ);
7285+
7286+ return 0;
7287+ }
7288+
7289+ /* We are in transition but this may be a restart */
7290+ if (node_state == TRANSITION) {
7291+
7292+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7293+ send_startack(saddr, msg->msg_namelen, 0);
7294+
7295+ /* Is it a new joining node ? This happens if a master is
7296+ * usurped */
7297+ if (startmsg->reason == TRANS_NEWNODE) {
7298+ struct cluster_node *oldjoin = joining_node;
7299+
bb1d8b11 7300+ add_node_from_starttrans(msg, buf, len);
c1c6733f
AM
7301+
7302+ /* If this is a different node joining than the one we
7303+ * were previously joining (probably cos the master is
7304+ * a nominated one) then mark our "old" joiner as DEAD.
7305+ * The original master will already have told the node
7306+ * to go back into JOINWAIT state */
7307+ if (oldjoin && oldjoin != joining_node
7308+ && oldjoin->state == NODESTATE_JOINING)
7309+ oldjoin->state = NODESTATE_DEAD;
7310+ }
7311+
7312+ /* Is it a new master node? */
7313+ if (startmsg->reason == TRANS_NEWMASTER ||
7314+ startmsg->reason == TRANS_DEADMASTER) {
7315+ P_MEMB("starttrans %s, node=%d\n",
7316+ startmsg->reason ==
7317+ TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
7318+ le32_to_cpu(startmsg->nodeid));
7319+
7320+ /* If the old master has died then remove it */
c783755a
AM
7321+ if (startmsg->reason == TRANS_DEADMASTER) {
7322+ remove_node(le32_to_cpu(startmsg->nodeid));
c1c6733f
AM
7323+ }
7324+
7325+ /* Store new master */
7326+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7327+ }
7328+
7329+ /* Another node has died (or been killed) */
7330+ if (startmsg->reason == TRANS_ANOTHERREMNODE) {
7331+ /* Remove new dead node */
c783755a 7332+ remove_node(le32_to_cpu(startmsg->nodeid));
c1c6733f
AM
7333+ }
7334+ /* Restart the timer */
7335+ del_timer(&transition_timer);
7336+ mod_timer(&transition_timer,
7337+ jiffies + cman_config.transition_timeout * HZ);
7338+ }
7339+
7340+ return 0;
7341+}
7342+
7343+/* Change a cluster parameter */
bb1d8b11 7344+static int do_process_reconfig(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7345+{
7346+ struct cl_mem_reconfig_msg *confmsg;
7347+ struct sockaddr_cl *saddr = msg->msg_name;
7348+ struct cluster_node *node;
7349+ unsigned int val;
7350+
7351+ if (len < sizeof(struct cl_mem_reconfig_msg))
7352+ return -1;
7353+
bb1d8b11 7354+ confmsg = (struct cl_mem_reconfig_msg *)buf;
c1c6733f
AM
7355+ val = le32_to_cpu(confmsg->value);
7356+
7357+ switch (confmsg->param) {
7358+
7359+ case RECONFIG_PARAM_EXPECTED_VOTES:
7360+ /* Set any nodes with expected_votes higher than the new value
7361+ * down */
7362+ if (val > 0) {
7363+ struct cluster_node *node;
7364+
7365+ down(&cluster_members_lock);
7366+ list_for_each_entry(node, &cluster_members_list, list) {
7367+ if (node->state == NODESTATE_MEMBER &&
7368+ node->expected_votes > val) {
7369+ node->expected_votes = val;
7370+ }
7371+ }
7372+ up(&cluster_members_lock);
7373+ if (expected_votes > val)
7374+ expected_votes = val;
7375+ }
7376+ recalculate_quorum(1); /* Allow decrease */
7377+ sm_member_update(cluster_is_quorate);
7378+ break;
7379+
7380+ case RECONFIG_PARAM_NODE_VOTES:
7381+ node = find_node_by_nodeid(saddr->scl_nodeid);
7382+ node->votes = val;
7383+ recalculate_quorum(1); /* Allow decrease */
7384+ sm_member_update(cluster_is_quorate);
7385+ break;
7386+
7387+ case RECONFIG_PARAM_CONFIG_VERSION:
7388+ config_version = val;
7389+ break;
7390+
7391+ default:
7392+ printk(KERN_INFO CMAN_NAME
7393+ ": got unknown parameter in reconfigure message. %d\n",
7394+ confmsg->param);
7395+ break;
7396+ }
7397+ return 0;
7398+}
7399+
7400+/* Response from master node */
bb1d8b11 7401+static int do_process_joinack(struct msghdr *msg, char *buf, int len)
c1c6733f 7402+{
bb1d8b11
AM
7403+ struct cl_mem_joinack_msg *ackmsg =
7404+ (struct cl_mem_joinack_msg *)buf;
c1c6733f
AM
7405+
7406+ join_time = jiffies;
7407+ if (ackmsg->acktype == JOINACK_TYPE_OK) {
7408+ node_state = JOINACK;
7409+ }
7410+
7411+ if (ackmsg->acktype == JOINACK_TYPE_NAK) {
7412+ printk(KERN_WARNING CMAN_NAME
7413+ ": Cluster membership rejected\n");
7414+ P_MEMB("Got JOINACK NACK\n");
7415+ node_state = REJECTED;
7416+ }
7417+
7418+ if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
7419+ P_MEMB("Got JOINACK WAIT\n");
7420+ node_state = JOINWAIT;
7421+ joinwait_time = jiffies;
7422+ }
7423+
c783755a
AM
7424+ return 0;
7425+}
7426+
7427+/* Check a JOINREQ message for validity,
7428+ return -1 if we can't let the node join our cluster */
7429+static int validate_joinmsg(struct cl_mem_join_msg *joinmsg, int len)
7430+{
7431+ struct cluster_node *node;
7432+
7433+ /* Check version number */
c1c6733f
AM
7434+ if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
7435+ char *ptr = (char *) joinmsg;
7436+ char *name;
7437+
b7b72b66
AM
7438+ ptr += sizeof (*joinmsg);
7439+ name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7440+
c1c6733f
AM
7441+ /* Sanity-check the num_addrs field otherwise we could oops */
7442+ if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
7443+ printk(KERN_WARNING CMAN_NAME
7444+ ": num_addr in JOIN-REQ message is rubbish: %d\n",
7445+ le16_to_cpu(joinmsg->num_addr));
c783755a 7446+ return -1;
c1c6733f
AM
7447+ }
7448+
7449+ /* Check the cluster name matches */
7450+ if (strcmp(cluster_name, joinmsg->clustername)) {
7451+ printk(KERN_WARNING CMAN_NAME
7452+ ": attempt to join with cluster name '%s' refused\n",
7453+ joinmsg->clustername);
c783755a 7454+ return -1;
c1c6733f
AM
7455+ }
7456+
c1c6733f 7457+ /* Check we are not exceeding the maximum number of nodes */
b7b72b66 7458+ if (cluster_members >= cman_config.max_nodes) {
c1c6733f
AM
7459+ printk(KERN_WARNING CMAN_NAME
7460+ ": Join request from %s rejected, exceeds maximum number of nodes\n",
7461+ name);
c783755a 7462+ return -1;
c1c6733f
AM
7463+ }
7464+
b7b72b66 7465+ /* Check that we don't exceed the two_node limit, if applicable */
c1c6733f
AM
7466+ if (two_node && cluster_members == 2) {
7467+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7468+ "rejected, exceeds two node limit\n", name);
c783755a 7469+ return -1;
c1c6733f
AM
7470+ }
7471+
b7b72b66 7472+ if (le32_to_cpu(joinmsg->config_version) != config_version) {
c1c6733f
AM
7473+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7474+ "rejected, config version local %u remote %u\n",
7475+ name, config_version,
b7b72b66 7476+ le32_to_cpu(joinmsg->config_version));
c783755a
AM
7477+ return -1;
7478+ }
7479+
7480+ /* Validate requested static node ID */
7481+ if (joinmsg->nodeid &&
7482+ (node = find_node_by_nodeid(le32_to_cpu(joinmsg->nodeid))) &&
7483+ (node->state != NODESTATE_DEAD ||
7484+ (strcmp(node->name, name)))) {
7485+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7486+ "rejected, node ID %d already in use by %s\n",
7487+ name, node->node_id, node->name);
7488+ return -1;
7489+ }
7490+ if (joinmsg->nodeid &&
7491+ (node = find_node_by_name(name)) &&
7492+ (node->state != NODESTATE_DEAD ||
7493+ node->node_id != le32_to_cpu(joinmsg->nodeid))) {
7494+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7495+ "rejected, wanted node %d but previously had %d\n",
7496+ name, le32_to_cpu(joinmsg->nodeid), node->node_id);
7497+ return -1;
c1c6733f
AM
7498+ }
7499+
c783755a 7500+ /* If these don't match then I don't know how the message
c1c6733f
AM
7501+ arrived! However, I can't take the chance */
7502+ if (le32_to_cpu(joinmsg->addr_len) != address_length) {
7503+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7504+ "rejected, address length local: %u remote %u\n",
7505+ name, address_length,
7506+ le32_to_cpu(joinmsg->addr_len));
c783755a 7507+ return -1;
c1c6733f
AM
7508+ }
7509+ }
7510+ else {
7511+ /* Version number mismatch, don't use any part of the message
7512+ * other than the version numbers as things may have moved */
c1c6733f 7513+ printk(KERN_INFO CMAN_NAME
c783755a 7514+ ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d)\n",
c1c6733f
AM
7515+ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
7516+ CNXMAN_PATCH_VERSION,
7517+ le32_to_cpu(joinmsg->major_version),
7518+ le32_to_cpu(joinmsg->minor_version),
c783755a
AM
7519+ le32_to_cpu(joinmsg->patch_version));
7520+ return -1;
7521+ }
7522+ return 0;
7523+}
7524+
7525+
7526+/* Request to join the cluster. This makes us the master for this state
7527+ * transition */
bb1d8b11 7528+static int do_process_joinreq(struct msghdr *msg, char *buf, int len)
c783755a
AM
7529+{
7530+ static unsigned long last_joinreq = 0;
7531+ static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
bb1d8b11 7532+ struct cl_mem_join_msg *joinmsg = (struct cl_mem_join_msg *)buf;
c783755a
AM
7533+ struct cluster_node *node;
7534+ char *ptr = (char *) joinmsg;
7535+ char *name;
7536+ int i;
7537+ struct sockaddr_cl *addr = msg->msg_name;
7538+
7539+ ptr += sizeof (*joinmsg);
7540+ name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
c1c6733f 7541+
c783755a
AM
7542+ /* If we are in a state transition then tell the new node to wait a bit
7543+ * longer */
7544+ if (node_state != MEMBER) {
7545+ if (node_state == MASTER || node_state == TRANSITION) {
7546+ send_joinack(msg->msg_name, msg->msg_namelen,
7547+ JOINACK_TYPE_WAIT);
7548+ }
7549+ return 0;
7550+ }
7551+
7552+ /* Reject application if message is invalid for any reason */
7553+ if (validate_joinmsg(joinmsg, len)) {
7554+ send_joinack(msg->msg_name, msg->msg_namelen,
7555+ JOINACK_TYPE_NAK);
7556+ return 0;
7557+ }
7558+
7559+ /* Do we already know about this node? */
7560+ if (check_duplicate_node(name, msg, len) < 0) {
c1c6733f 7561+ send_joinack(msg->msg_name, msg->msg_namelen,
c783755a
AM
7562+ JOINACK_TYPE_NAK);
7563+ return 0;
7564+ }
7565+
7566+ /* Duplicate checking: Because joining messages do not have
7567+ * sequence numbers we may get as many JOINREQ messages as we
7568+ * have interfaces. This bit of code here just checks for
7569+ * JOINREQ messages that come in from the same node in a small
7570+ * period of time and removes the duplicates */
7571+ if (time_before(jiffies, last_joinreq + 10 * HZ)
7572+ && strcmp(name, last_name) == 0) {
c1c6733f
AM
7573+ return 0;
7574+ }
7575+
c783755a
AM
7576+ /* OK, you can be in my gang */
7577+ last_joinreq = jiffies;
7578+ strcpy(last_name, name);
7579+
7580+ node = add_new_node(name, joinmsg->votes,
7581+ le32_to_cpu(joinmsg->expected_votes),
7582+ le32_to_cpu(joinmsg->nodeid),
7583+ NODESTATE_JOINING);
7584+
7585+ /* Add the node's addresses */
7586+ if (list_empty(&node->addr_list)) {
7587+ for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
7588+ i++) {
7589+ add_node_address(node, ptr, address_length);
7590+ ptr += address_length;
7591+ }
7592+ }
7593+ send_joinack(msg->msg_name, msg->msg_namelen,
7594+ JOINACK_TYPE_OK);
7595+ joining_node = node;
7596+ joining_temp_nodeid = addr->scl_nodeid;
7597+
7598+ /* Start the state transition */
7599+ start_transition(TRANS_NEWNODE, node);
7600+
c1c6733f
AM
7601+ return 0;
7602+}
7603+
7604+/* A simple function to invent a small number based
7605+ on the node name */
7606+static int node_hash(void)
7607+{
7608+ int i;
7609+ int value = 0;
7610+
7611+ for (i=0; i<strlen(nodename); i++) {
7612+ value += nodename[i];
7613+ }
c783755a
AM
7614+ return (value & 0xF) + 1;
7615+}
7616+
7617+
7618+/* Return the low 32 bits of our IP address */
7619+static uint32_t low32_of_ip()
7620+{
7621+ struct cluster_node_addr *addr;
7622+ uint32_t lowip;
7623+
7624+ addr = list_entry(us->addr_list.next, struct cluster_node_addr, list);
7625+ memcpy(&lowip, addr->addr+address_length-sizeof(uint32_t), sizeof(uint32_t));
7626+ if (!lowip)
7627+ memcpy(&lowip, addr->addr - sizeof(uint32_t)*2, sizeof(uint32_t));
7628+
7629+ return lowip;
c1c6733f
AM
7630+}
7631+
7632+/* A new node has stated its intent to form a new cluster. we may have
7633+ * something to say about that... */
bb1d8b11 7634+static int do_process_newcluster(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7635+{
7636+ /* If we are also in STARTING state then back down for a random period
7637+ * of time */
7638+ if (node_state == STARTING) {
7639+ P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
7640+ start_time = jiffies + node_hash() * HZ;
7641+ }
c783755a
AM
7642+
7643+ if (node_state == NEWCLUSTER) {
7644+ uint32_t otherip;
c783755a 7645+
bb1d8b11 7646+ memcpy(&otherip, buf+1, sizeof(otherip));
c783755a
AM
7647+ otherip = le32_to_cpu(otherip);
7648+ P_MEMB("got NEWCLUSTER, remote ip = %x, us = %x\n", otherip, low32_of_ip());
7649+ if (otherip < low32_of_ip())
7650+ node_state = STARTING;
7651+ }
7652+
b7b72b66
AM
7653+ if (node_state == MEMBER)
7654+ send_hello();
c1c6733f
AM
7655+
7656+ return 0;
7657+}
7658+
7659+/* Called for each node by the node-message unpacker. Returns -1 if there is a
7660+ * mismatch and the caller will stop processing */
7661+static int check_node(struct cluster_node *newnode, char *addrs,
7662+ unsigned short num_addr)
7663+{
7664+ struct cluster_node *node = find_node_by_name(newnode->name);
7665+
7666+ P_MEMB("check_node: %s", newnode->name);
7667+
7668+ if (!node) {
7669+ C_MEMB(" - not found\n");
7670+ return -1;
7671+ }
7672+
7673+ if (node->votes != newnode->votes ||
7674+ node->node_id != newnode->node_id ||
c783755a
AM
7675+ node->state != newnode->state) {
7676+ C_MEMB(" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
7677+ node->votes, newnode->votes, node->node_id,
7678+ newnode->node_id, node->state);
c1c6733f
AM
7679+ return -1;
7680+ }
7681+ C_MEMB(" - OK\n");
7682+ return 0;
7683+}
7684+
7685+/* Called for each new node found in a JOINCONF message. Create a new node
7686+ * entry */
7687+static int add_node(struct cluster_node *node, char *addrs,
7688+ unsigned short num_addr)
7689+{
7690+ P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
7691+ node->expected_votes, node->node_id);
7692+
7693+ if (!find_node_by_name(node->name)) {
7694+ struct cluster_node *newnode;
7695+ int i;
7696+
7697+ if ((newnode =
7698+ add_new_node(node->name, node->votes, node->expected_votes,
c783755a 7699+ node->node_id, node->state)) == NULL) {
c1c6733f
AM
7700+ P_MEMB("Error adding node\n");
7701+ return -1;
7702+ }
7703+ if (list_empty(&newnode->addr_list)) {
7704+ for (i = 0; i < num_addr; i++) {
7705+ add_node_address(newnode,
7706+ addrs + i * address_length, address_length);
7707+ }
7708+ }
7709+ return 0;
7710+ }
7711+ else {
7712+ P_MEMB("Already got node with name %s\n", node->name);
7713+ return -1;
7714+ }
7715+}
7716+
7717+/* Call a specified routine for each node unpacked from the message. Return
7718+ * either the number of nodes found or -1 for an error */
7719+static int unpack_nodes(unsigned char *buf, int len,
7720+ int (*routine) (struct cluster_node *, char *,
7721+ unsigned short))
7722+{
7723+ int ptr = 0;
7724+ int num_nodes = 0;
7725+ char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
7726+ struct cluster_node node;
7727+
7728+ node.name = nodename;
7729+
7730+ while (ptr < len) {
7731+ int namelen = buf[ptr++];
7732+ unsigned int evotes;
7733+ unsigned int node_id;
7734+ unsigned short num_addr;
7735+ unsigned char *addrs;
7736+
7737+ memcpy(nodename, &buf[ptr], namelen);
7738+ nodename[namelen] = '\0';
7739+ ptr += namelen;
7740+
c783755a
AM
7741+ node.state = buf[ptr++];
7742+
c1c6733f
AM
7743+ memcpy(&num_addr, &buf[ptr], sizeof (short));
7744+ num_addr = le16_to_cpu(num_addr);
7745+ ptr += sizeof (short);
7746+
7747+ /* Just make a note of the addrs "array" */
7748+ addrs = &buf[ptr];
7749+ ptr += num_addr * address_length;
7750+
7751+ node.votes = buf[ptr++];
7752+
7753+ memcpy(&evotes, &buf[ptr], sizeof (int));
7754+ node.expected_votes = le32_to_cpu(evotes);
7755+ ptr += sizeof (int);
7756+
7757+ memcpy(&node_id, &buf[ptr], sizeof (int));
7758+ node.node_id = le32_to_cpu(node_id);
7759+ ptr += sizeof (int);
7760+
7761+ /* Call the callback routine */
7762+ if (routine(&node, addrs, num_addr) < 0)
7763+ return -1;
c783755a
AM
7764+
7765+ /* Return the number of MEMBER nodes */
7766+ if (node.state == NODESTATE_MEMBER)
7767+ num_nodes++;
c1c6733f
AM
7768+ }
7769+ return num_nodes;
7770+}
7771+
7772+/* Got join confirmation from a master node. This message contains a list of
7773+ * cluster nodes which we unpack and build into our cluster nodes list. When we
7774+ * have the last message we can go into TRANSITION state */
bb1d8b11 7775+static int do_process_joinconf(struct msghdr *msg, char *buf, int len)
c1c6733f 7776+{
bb1d8b11 7777+ if (unpack_nodes(buf + 2, len - 2, add_node) < 0) {
c1c6733f
AM
7778+ printk(CMAN_NAME
7779+ ": Error procssing joinconf message - giving up on cluster join\n");
c783755a
AM
7780+ us->leave_reason = CLUSTER_LEAVEFLAG_PANIC;
7781+ node_state = LEFT_CLUSTER;
c1c6733f
AM
7782+ return -1;
7783+ }
7784+
7785+ /* Last message in the list? */
bb1d8b11 7786+ if (buf[1] & 2) {
c1c6733f
AM
7787+ char ackmsg;
7788+ struct sockaddr_cl *addr = msg->msg_name;
7789+
7790+ us->state = NODESTATE_MEMBER;
7791+ node_state = TRANSITION;
7792+ we_are_a_cluster_member = TRUE;
7793+
7794+ ackmsg = CLUSTER_MEM_CONFACK;
7795+ kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
7796+ sizeof (struct sockaddr_cl),
7797+ MSG_NOACK);
7798+ kernel_thread(hello_kthread, NULL, 0);
7799+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
7800+ }
7801+ return 0;
7802+}
7803+
7804+/* Got the master's view of the cluster - compare it with ours and tell it the
7805+ * result */
bb1d8b11 7806+static int do_process_masterview(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7807+{
7808+ char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
c1c6733f
AM
7809+ static int num_nodes;
7810+
7811+ /* Someone else's state transition */
7812+ if (node_state != MEMBER &&
7813+ node_state != TRANSITION && node_state != MASTER)
7814+ return 0;
7815+
7816+ /* First message, zero the counter */
bb1d8b11 7817+ if (buf[1] & 1)
c1c6733f
AM
7818+ num_nodes = 0;
7819+
bb1d8b11 7820+ num_nodes += unpack_nodes(buf + 2, len - 2, check_node);
c1c6733f
AM
7821+
7822+ /* Last message, check the count and reply */
bb1d8b11 7823+ if (buf[1] & 2) {
c1c6733f
AM
7824+ if (num_nodes == cluster_members) {
7825+ /* Send ACK */
7826+ reply[1] = 1;
7827+ }
7828+ else {
7829+ P_MEMB
7830+ ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
7831+ num_nodes, cluster_members);
7832+ /* Send NAK */
7833+ reply[1] = 0;
7834+ }
7835+ kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
7836+ msg->msg_namelen, 0);
7837+ }
7838+ return 0;
7839+}
7840+
bb1d8b11 7841+static int do_process_leave(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7842+{
7843+ struct cluster_node *node;
7844+ struct sockaddr_cl *saddr = msg->msg_name;
bb1d8b11 7845+ unsigned char *leavemsg = (unsigned char *)buf;
c1c6733f
AM
7846+
7847+ if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
7848+ unsigned char reason = leavemsg[1];
7849+
7850+ if (node->state != NODESTATE_DEAD) {
7851+ printk(KERN_INFO CMAN_NAME
c783755a
AM
7852+ ": Node %s is leaving the cluster, %s\n",
7853+ node->name, leave_string(reason));
c1c6733f
AM
7854+
7855+ node->leave_reason = reason;
7856+ }
7857+ leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
7858+
7859+ a_node_just_died(node);
c1c6733f
AM
7860+ }
7861+ return 0;
7862+}
7863+
bb1d8b11 7864+static int do_process_hello(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7865+{
7866+ struct cluster_node *node;
7867+ struct cl_mem_hello_msg *hellomsg =
bb1d8b11 7868+ (struct cl_mem_hello_msg *)buf;
c1c6733f
AM
7869+ struct sockaddr_cl *saddr = msg->msg_name;
7870+
7871+ /* We are starting up. Send a join message to the node whose HELLO we
7872+ * just received */
c783755a
AM
7873+ if (node_state == STARTING || node_state == JOINWAIT ||
7874+ node_state == JOINING || node_state == NEWCLUSTER) {
c1c6733f
AM
7875+ struct sockaddr_cl *addr = msg->msg_name;
7876+
7877+ printk(KERN_INFO CMAN_NAME ": sending membership request\n");
7878+
7879+ send_joinreq(addr, msg->msg_namelen);
7880+ join_time = jiffies;
7881+ node_state = JOINING;
7882+ return 0;
7883+ }
7884+
7885+ /* Only process HELLOs if we are not in transition */
7886+ if (node_state == MEMBER) {
c1c6733f
AM
7887+
7888+ node = find_node_by_nodeid(saddr->scl_nodeid);
7889+ if (node && node->state != NODESTATE_DEAD) {
7890+
7891+ /* Check the cluster generation in the HELLO message.
7892+ * NOTE: this may be different if the message crossed
7893+ * on the wire with an END-TRANS so we allow a period
7894+ * of grace in which this is allowable */
7895+ if (cluster_generation !=
7896+ le32_to_cpu(hellomsg->generation)
7897+ && node_state == MEMBER
7898+ && time_after(jiffies,
7899+ cman_config.hello_timer * HZ +
7900+ transition_end_time)) {
c1c6733f
AM
7901+
7902+ printk(KERN_INFO CMAN_NAME
7903+ ": bad generation number %d in HELLO message, expected %d\n",
7904+ le32_to_cpu(hellomsg->generation),
7905+ cluster_generation);
7906+
7907+ notify_kernel_listeners(DIED,
7908+ (long) node->node_id);
7909+
b7b72b66 7910+ send_kill(node->node_id);
c1c6733f
AM
7911+ return 0;
7912+ }
7913+
7914+ if (cluster_members != le16_to_cpu(hellomsg->members)
7915+ && node_state == MEMBER) {
7916+ printk(KERN_INFO CMAN_NAME
b7b72b66
AM
7917+ ": nmembers in HELLO message does not match our view (got %d, exp %d)\n",
7918+ le16_to_cpu(hellomsg->members), cluster_members);
c1c6733f
AM
7919+ start_transition(TRANS_CHECK, node);
7920+ return 0;
7921+ }
7922+ /* The message is OK - save the time */
7923+ node->last_hello = jiffies;
c1c6733f
AM
7924+ }
7925+ else {
c1c6733f
AM
7926+ /* This node is a danger to our valid cluster */
7927+ if (cluster_is_quorate) {
b7b72b66 7928+ send_kill(saddr->scl_nodeid);
c1c6733f 7929+ }
c1c6733f
AM
7930+ }
7931+ }
7932+
7933+ return 0;
7934+
7935+}
7936+
bb1d8b11 7937+static int do_process_kill(struct msghdr *msg, char *buf, int len)
c1c6733f
AM
7938+{
7939+ struct sockaddr_cl *saddr = msg->msg_name;
7940+ struct cluster_node *node;
7941+
7942+ node = find_node_by_nodeid(saddr->scl_nodeid);
7943+ if (node && node->state == NODESTATE_MEMBER) {
7944+
7945+ printk(KERN_INFO CMAN_NAME
7946+ ": Being told to leave the cluster by node %d\n",
7947+ saddr->scl_nodeid);
7948+
7949+ node_state = LEFT_CLUSTER;
7950+ quit_threads = 1;
7951+ wake_up_process(membership_task);
7952+ wake_up_interruptible(&cnxman_waitq);
7953+ }
7954+ else {
7955+ P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
7956+ }
7957+ return 0;
7958+}
7959+
7960+/* Some cluster membership utility functions */
7961+struct cluster_node *find_node_by_name(char *name)
7962+{
7963+ struct list_head *nodelist;
7964+ struct cluster_node *node;
7965+
7966+ down(&cluster_members_lock);
7967+ list_for_each(nodelist, &cluster_members_list) {
7968+ node = list_entry(nodelist, struct cluster_node, list);
7969+
7970+ if (strcmp(node->name, name) == 0) {
7971+ up(&cluster_members_lock);
7972+ return node;
7973+ }
7974+ }
7975+ up(&cluster_members_lock);
7976+ return NULL;
7977+}
7978+
7979+/* Try to avoid using this as it's slow and holds the members lock */
7980+struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
7981+{
7982+ struct list_head *nodelist;
7983+ struct list_head *addrlist;
7984+ struct cluster_node *node;
7985+ struct cluster_node_addr *nodeaddr;
7986+
7987+ down(&cluster_members_lock);
7988+
7989+ list_for_each(nodelist, &cluster_members_list) {
7990+ node = list_entry(nodelist, struct cluster_node, list);
7991+
7992+ list_for_each(addrlist, &node->addr_list) {
7993+ nodeaddr =
7994+ list_entry(addrlist, struct cluster_node_addr,
7995+ list);
7996+
b7b72b66 7997+ if (memcmp(nodeaddr->addr+2, addr+2, address_length-2) == 0) {
c1c6733f
AM
7998+ up(&cluster_members_lock);
7999+ return node;
8000+ }
8001+ }
8002+ }
8003+
8004+ up(&cluster_members_lock);
8005+ return NULL;
8006+}
8007+
8008+/* This is the quick way to find a node */
8009+struct cluster_node *find_node_by_nodeid(unsigned int id)
8010+{
8011+ struct cluster_node *node;
8012+
8013+ if (id > sizeof_members_array)
8014+ return NULL;
8015+
8016+ spin_lock(&members_by_nodeid_lock);
8017+ node = members_by_nodeid[id];
8018+ spin_unlock(&members_by_nodeid_lock);
8019+ return node;
8020+}
8021+
8022+static int dispatch_messages(struct socket *mem_socket)
8023+{
8024+ int err = 0;
8025+
8026+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
8027+ struct msghdr msg;
bb1d8b11 8028+ struct kvec vec;
c1c6733f
AM
8029+ struct sockaddr_cl sin;
8030+ int len;
c1c6733f
AM
8031+
8032+ memset(&sin, 0, sizeof (sin));
8033+
8034+ msg.msg_control = NULL;
8035+ msg.msg_controllen = 0;
c1c6733f
AM
8036+ msg.msg_name = &sin;
8037+ msg.msg_namelen = sizeof (sin);
8038+ msg.msg_flags = 0;
8039+
bb1d8b11
AM
8040+ vec.iov_len = MAX_CLUSTER_MESSAGE;
8041+ vec.iov_base = iobuf;
c1c6733f 8042+
bb1d8b11
AM
8043+ len = kernel_recvmsg(mem_socket, &msg, &vec, 1,
8044+ MAX_CLUSTER_MESSAGE,
8045+ MSG_DONTWAIT);
c1c6733f 8046+ if (len > 0) {
c1c6733f 8047+ msg.msg_name = &sin;
bb1d8b11 8048+ do_membership_packet(&msg, iobuf, len);
c1c6733f
AM
8049+ }
8050+ else {
8051+ if (len == -EAGAIN)
8052+ err = 0;
8053+ else
8054+ err = -1;
8055+ break;
8056+ }
8057+ }
8058+ return err;
8059+}
8060+
8061+/* Scan the nodes list for dead nodes */
8062+static void check_for_dead_nodes()
8063+{
8064+ struct list_head *nodelist;
8065+ struct cluster_node *node;
8066+
8067+ down(&cluster_members_lock);
8068+ list_for_each(nodelist, &cluster_members_list) {
8069+ node = list_entry(nodelist, struct cluster_node, list);
8070+
8071+ if (node->state != NODESTATE_DEAD &&
8072+ time_after(jiffies,
8073+ node->last_hello +
8074+ cman_config.deadnode_timeout * HZ) && !node->us) {
8075+
8076+ up(&cluster_members_lock);
8077+
8078+ printk(KERN_WARNING CMAN_NAME
8079+ ": no HELLO from %s, removing from the cluster\n",
8080+ node->name);
8081+
8082+ P_MEMB("last hello was %ld, current time is %ld\n",
8083+ node->last_hello, jiffies);
8084+
8085+ node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
8086+ leavereason = 0;
8087+
8088+ /* This is unlikely to work but it's worth a try! */
8089+ send_kill(node->node_id);
8090+
8091+ /* Start state transition */
8092+ a_node_just_died(node);
8093+ return;
8094+ }
8095+ }
8096+ up(&cluster_members_lock);
8097+
8098+ /* Also check for a dead quorum device */
8099+ if (quorum_device) {
8100+ if (quorum_device->state == NODESTATE_MEMBER &&
8101+ time_after(jiffies,
8102+ quorum_device->last_hello +
8103+ cman_config.deadnode_timeout * HZ)) {
8104+ quorum_device->state = NODESTATE_DEAD;
8105+ printk(KERN_WARNING CMAN_NAME
8106+ ": Quorum device %s timed out\n",
8107+ quorum_device->name);
8108+ recalculate_quorum(0);
8109+ }
8110+ }
8111+
8112+ return;
8113+}
8114+
8115+/* add "us" as a node in the cluster */
8116+static int add_us()
8117+{
8118+ struct cluster_node *newnode =
8119+ kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
8120+
8121+ if (!newnode) {
8122+ /* Oh shit, we have to commit hara kiri here for the greater
8123+ * good of the cluster */
8124+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
8125+
8126+ printk(KERN_CRIT CMAN_NAME
8127+ ": Cannot allocate memory for our node structure\n");
8128+ panic("Must die");
8129+
8130+ return -1;
8131+ }
8132+
8133+ memset(newnode, 0, sizeof (struct cluster_node));
8134+ newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
8135+ if (!newnode->name) {
8136+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
8137+
8138+ printk(KERN_CRIT CMAN_NAME
8139+ ": Cannot allocate memory for node name\n");
8140+ kfree(newnode);
8141+
8142+ panic("Must die");
8143+
8144+ return -1;
8145+ }
8146+
8147+ strcpy(newnode->name, nodename);
8148+ newnode->last_hello = jiffies;
8149+ newnode->votes = votes;
8150+ newnode->expected_votes = expected_votes;
8151+ newnode->state = NODESTATE_JOINING;
8152+ newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
8153+ newnode->us = 1;
8154+ newnode->leave_reason = 0;
8155+ INIT_LIST_HEAD(&newnode->addr_list);
8156+ get_local_addresses(newnode); /* Get from cnxman socket info */
8157+
8158+ /* Add the new node to the list */
8159+ down(&cluster_members_lock);
8160+ list_add(&newnode->list, &cluster_members_list);
8161+ cluster_members++;
8162+ up(&cluster_members_lock);
8163+ us = newnode;
8164+
8165+ return 0;
8166+}
8167+
8168+/* Return the highest known node_id */
8169+unsigned int get_highest_nodeid()
8170+{
8171+ struct list_head *nodelist;
8172+ struct cluster_node *node = NULL;
8173+ unsigned int highest = 0;
8174+
8175+ down(&cluster_members_lock);
8176+ list_for_each(nodelist, &cluster_members_list) {
8177+ node = list_entry(nodelist, struct cluster_node, list);
8178+
8179+ if (node->node_id > highest)
8180+ highest = node->node_id;
8181+ }
8182+ up(&cluster_members_lock);
8183+
8184+ return highest;
8185+}
8186+
8187+/* Elect a new master if there is a clash. Returns 1 if we are the new master,
8188+ * the master's struct will also be returned. This, rather primitively, uses
8189+ * the lowest node ID */
8190+static int elect_master(struct cluster_node **master_node)
8191+{
8192+ int i;
8193+
8194+ for (i = 1; i < sizeof_members_array; i++) {
8195+ if (members_by_nodeid[i]
8196+ && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
8197+ *master_node = members_by_nodeid[i];
8198+ P_MEMB("Elected master is %s\n", (*master_node)->name);
8199+ return (*master_node)->us;
8200+ }
8201+ }
8202+ BUG();
8203+ return 0;
8204+}
8205+
8206+/* Called by node_cleanup in cnxman when we have left the cluster */
8207+void free_nodeid_array()
8208+{
8209+ vfree(members_by_nodeid);
8210+ members_by_nodeid = NULL;
8211+ sizeof_members_array = 0;
8212+}
8213+
8214+int allocate_nodeid_array()
8215+{
8216+ /* Allocate space for the nodeid lookup array */
8217+ if (!members_by_nodeid) {
8218+ spin_lock_init(&members_by_nodeid_lock);
8219+ members_by_nodeid =
8220+ vmalloc(cman_config.max_nodes *
8221+ sizeof (struct cluster_member *));
8222+ }
8223+
8224+ if (!members_by_nodeid) {
8225+ printk(KERN_WARNING
8226+ "Unable to allocate members array for %d members\n",
8227+ cman_config.max_nodes);
8228+ return -ENOMEM;
8229+ }
8230+ memset(members_by_nodeid, 0,
8231+ cman_config.max_nodes * sizeof (struct cluster_member *));
8232+ sizeof_members_array = cman_config.max_nodes;
8233+
8234+ return 0;
8235+}
8236+
8237+/* Set the votes & expected_votes variables */
8238+void set_votes(int v, int e)
8239+{
8240+ votes = v;
8241+ expected_votes = e;
8242+}
8243+
8244+int get_quorum()
8245+{
8246+ return quorum;
8247+}
8248+
8249+/* Called by cnxman to see if activity should be blocked because we are in a
8250+ * state transition */
8251+int in_transition()
8252+{
8253+ return node_state == TRANSITION ||
8254+ node_state == TRANSITION_COMPLETE || node_state == MASTER;
8255+}
8256+
8257+/* Return the current membership state as a string for the main line to put
8258+ * into /proc . I really should be using snprintf rather than sprintf but it's
8259+ * not exported... */
8260+char *membership_state(char *buf, int buflen)
8261+{
8262+ switch (node_state) {
8263+ case STARTING:
8264+ strncpy(buf, "Starting", buflen);
8265+ break;
c783755a
AM
8266+ case NEWCLUSTER:
8267+ strncpy(buf, "New-Cluster?", buflen);
8268+ break;
c1c6733f
AM
8269+ case JOINING:
8270+ strncpy(buf, "Joining", buflen);
8271+ break;
8272+ case JOINWAIT:
8273+ strncpy(buf, "Join-Wait", buflen);
8274+ break;
8275+ case JOINACK:
8276+ strncpy(buf, "Join-Ack", buflen);
8277+ break;
8278+ case TRANSITION:
8279+ sprintf(buf, "State-Transition: Master is %s",
8280+ master_node ? master_node->name : "Unknown");
8281+ break;
8282+ case MEMBER:
8283+ strncpy(buf, "Cluster-Member", buflen);
8284+ break;
8285+ case REJECTED:
8286+ strncpy(buf, "Rejected", buflen);
8287+ break;
8288+ case LEFT_CLUSTER:
c783755a 8289+ strncpy(buf, "Not-in-Cluster", buflen);
c1c6733f
AM
8290+ break;
8291+ case TRANSITION_COMPLETE:
8292+ strncpy(buf, "Transition-Complete", buflen);
8293+ break;
8294+ case MASTER:
8295+ strncpy(buf, "Transition-Master", buflen);
8296+ break;
8297+ default:
8298+ sprintf(buf, "Unknown: code=%d", node_state);
8299+ break;
8300+ }
8301+
8302+ return buf;
8303+}
8304+
c783755a
AM
8305+char *leave_string(int reason)
8306+{
bb1d8b11 8307+ static char msg[32];
c783755a
AM
8308+ switch (reason)
8309+ {
8310+ case CLUSTER_LEAVEFLAG_DOWN:
8311+ return "Shutdown";
8312+ case CLUSTER_LEAVEFLAG_KILLED:
8313+ return "Killed by another node";
8314+ case CLUSTER_LEAVEFLAG_PANIC:
8315+ return "Panic";
8316+ case CLUSTER_LEAVEFLAG_REMOVED:
8317+ return "Removed";
8318+ case CLUSTER_LEAVEFLAG_REJECTED:
8319+ return "Membership rejected";
8320+ default:
bb1d8b11
AM
8321+ sprintf(msg, "Reason is %d\n", reason);
8322+ return msg;
c783755a
AM
8323+ }
8324+}
8325+
c1c6733f
AM
8326+#ifdef DEBUG_MEMB
8327+static char *msgname(int msg)
8328+{
8329+ switch (msg) {
8330+ case CLUSTER_MEM_JOINCONF:
8331+ return "JOINCONF";
8332+ case CLUSTER_MEM_JOINREQ:
8333+ return "JOINREQ";
8334+ case CLUSTER_MEM_LEAVE:
8335+ return "LEAVE";
8336+ case CLUSTER_MEM_HELLO:
8337+ return "HELLO";
8338+ case CLUSTER_MEM_KILL:
8339+ return "KILL";
8340+ case CLUSTER_MEM_JOINACK:
8341+ return "JOINACK";
8342+ case CLUSTER_MEM_ENDTRANS:
8343+ return "ENDTRANS";
8344+ case CLUSTER_MEM_RECONFIG:
8345+ return "RECONFIG";
8346+ case CLUSTER_MEM_MASTERVIEW:
8347+ return "MASTERVIEW";
8348+ case CLUSTER_MEM_STARTTRANS:
8349+ return "STARTTRANS";
8350+ case CLUSTER_MEM_JOINREJ:
8351+ return "JOINREJ";
8352+ case CLUSTER_MEM_VIEWACK:
8353+ return "VIEWACK";
8354+ case CLUSTER_MEM_STARTACK:
8355+ return "STARTACK";
8356+ case CLUSTER_MEM_NEWCLUSTER:
8357+ return "NEWCLUSTER";
8358+ case CLUSTER_MEM_CONFACK:
8359+ return "CONFACK";
8360+ case CLUSTER_MEM_NOMINATE:
8361+ return "NOMINATE";
8362+
8363+ default:
8364+ return "??UNKNOWN??";
8365+ }
8366+}
8367+
8368+#endif
8369+
8370+/*
8371+ * Overrides for Emacs so that we follow Linus's tabbing style.
8372+ * Emacs will notice this stuff at the end of the file and automatically
8373+ * adjust the settings for this buffer only. This must remain at the end
8374+ * of the file.
8375+ * ---------------------------------------------------------------------------
8376+ * Local variables:
8377+ * c-file-style: "linux"
8378+ * End:
8379+ */
8380diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
bb1d8b11
AM
8381--- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
8382+++ linux-patched/cluster/cman/proc.c 2004-11-03 11:37:37.000000000 +0800
c783755a 8383@@ -0,0 +1,372 @@
c1c6733f
AM
8384+/******************************************************************************
8385+*******************************************************************************
8386+**
8387+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8388+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8389+**
8390+** This copyrighted material is made available to anyone wishing to use,
8391+** modify, copy, or redistribute it subject to the terms and conditions
8392+** of the GNU General Public License v.2.
8393+**
8394+*******************************************************************************
8395+******************************************************************************/
8396+
8397+#include <linux/init.h>
8398+#include <linux/socket.h>
8399+#include <linux/kernel.h>
8400+#include <linux/sched.h>
8401+#include <linux/file.h>
8402+#include <linux/proc_fs.h>
8403+#include <linux/seq_file.h>
8404+#include <linux/list.h>
8405+#include <linux/in.h>
8406+#include <net/sock.h>
8407+#include <cluster/cnxman.h>
8408+#include <cluster/service.h>
8409+
8410+#include "cnxman-private.h"
8411+#include "config.h"
8412+
8413+extern int cluster_members;
8414+extern struct list_head cluster_members_list;
8415+extern struct semaphore cluster_members_lock;
8416+extern struct cluster_node *quorum_device;
8417+extern int we_are_a_cluster_member;
8418+extern int cluster_is_quorate;
bb1d8b11 8419+extern uint16_t cluster_id;
c1c6733f
AM
8420+extern atomic_t use_count;
8421+extern unsigned int address_length;
8422+extern unsigned int config_version;
8423+extern char cluster_name[];
8424+extern struct cluster_node *us;
8425+static struct seq_operations cluster_info_op;
8426+
b7b72b66 8427+int sm_proc_open(struct inode *inode, struct file *file);
c1c6733f
AM
8428+int sm_debug_info(char *b, char **start, off_t offset, int length);
8429+
8430+/* /proc interface to the configuration struct */
8431+static struct config_proc_info {
8432+ char *name;
8433+ int *value;
8434+} config_proc[] = {
8435+ {
8436+ .name = "joinwait_timeout",
8437+ .value = &cman_config.joinwait_timeout,
8438+ },
8439+ {
8440+ .name = "joinconf_timeout",
8441+ .value = &cman_config.joinconf_timeout,
8442+ },
8443+ {
8444+ .name = "join_timeout",
8445+ .value = &cman_config.join_timeout,
8446+ },
8447+ {
8448+ .name = "hello_timer",
8449+ .value = &cman_config.hello_timer,
8450+ },
8451+ {
8452+ .name = "deadnode_timeout",
8453+ .value = &cman_config.deadnode_timeout,
8454+ },
8455+ {
8456+ .name = "transition_timeout",
8457+ .value = &cman_config.transition_timeout,
8458+ },
8459+ {
8460+ .name = "transition_restarts",
8461+ .value = &cman_config.transition_restarts,
8462+ },
8463+ {
8464+ .name = "max_nodes",
8465+ .value = &cman_config.max_nodes,
8466+ },
8467+ {
8468+ .name = "sm_debug_size",
8469+ .value = &cman_config.sm_debug_size,
8470+ },
c783755a
AM
8471+ {
8472+ .name = "newcluster_timeout",
8473+ .value = &cman_config.newcluster_timeout,
8474+ },
c1c6733f
AM
8475+};
8476+
8477+
8478+static int proc_cluster_status(char *b, char **start, off_t offset, int length)
8479+{
8480+ struct list_head *nodelist;
8481+ struct cluster_node *node;
8482+ struct cluster_node_addr *node_addr;
8483+ unsigned int total_votes = 0;
8484+ unsigned int max_expected = 0;
8485+ int c = 0;
8486+ char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
8487+
c783755a
AM
8488+ c += sprintf(b+c,
8489+ "Version: %d.%d.%d\n",
8490+ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
8491+ CNXMAN_PATCH_VERSION);
8492+
8493+ c += sprintf(b+c,
8494+ "Config version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
8495+ config_version,
8496+ cluster_name, cluster_id,
8497+ membership_state(node_buf, sizeof (node_buf)));
8498+
8499+ if (!we_are_a_cluster_member)
c1c6733f 8500+ return c;
c1c6733f
AM
8501+
8502+ /* Total the votes */
8503+ down(&cluster_members_lock);
8504+ list_for_each(nodelist, &cluster_members_list) {
8505+ node = list_entry(nodelist, struct cluster_node, list);
8506+ if (node->state == NODESTATE_MEMBER) {
8507+ total_votes += node->votes;
8508+ max_expected =
8509+ max(max_expected, node->expected_votes);
8510+ }
8511+ }
8512+ up(&cluster_members_lock);
8513+
8514+ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
8515+ total_votes += quorum_device->votes;
8516+
8517+ c += sprintf(b+c,
c1c6733f
AM
8518+ "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
8519+ cluster_members, max_expected, total_votes,
8520+ get_quorum(),
8521+ cluster_is_quorate ? " " : "Activity blocked");
8522+ c += sprintf(b+c, "Active subsystems: %d\n",
8523+ atomic_read(&use_count));
8524+
8525+
8526+ c += sprintf(b+c, "Node addresses: ");
8527+ list_for_each_entry(node_addr, &us->addr_list, list) {
8528+ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
8529+ if (saddr->sin6_family == AF_INET6) {
8530+ c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
8531+ be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
8532+ be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
8533+ be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
8534+ be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
8535+ be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
8536+ be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
8537+ be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
8538+ be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
8539+ }
8540+ else {
8541+ struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
8542+ uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
8543+ c+= sprintf(b+c, "%u.%u.%u.%u ",
8544+ addr[0], addr[1], addr[2], addr[3]);
8545+ }
8546+ }
8547+ c += sprintf(b+c, "\n\n");
8548+ return c;
8549+}
8550+
8551+
8552+/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
8553+ * we are */
8554+struct cluster_seq_info {
8555+ int nodeid;
8556+ int highest_nodeid;
8557+};
8558+
8559+static int cluster_open(struct inode *inode, struct file *file)
8560+{
8561+ return seq_open(file, &cluster_info_op);
8562+}
8563+
8564+static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
8565+{
8566+ struct cluster_seq_info *csi =
8567+ kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
8568+
8569+ if (!csi)
8570+ return NULL;
8571+
8572+ /* Keep highest_nodeid here so we don't need to keep traversing the
8573+ * list to find it */
8574+ csi->nodeid = *pos;
8575+ csi->highest_nodeid = get_highest_nodeid();
8576+
8577+ /* Print the header */
8578+ if (*pos == 0) {
c783755a 8579+ seq_printf(m, "Node Votes Exp Sts Name\n");
c1c6733f
AM
8580+ }
8581+ return csi;
8582+}
8583+
8584+static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
8585+{
8586+ struct cluster_seq_info *csi = p;
8587+
8588+ *pos = ++csi->nodeid;
8589+ if (csi->nodeid > csi->highest_nodeid)
8590+ return NULL;
8591+
8592+ return csi;
8593+}
8594+
8595+static int cluster_seq_show(struct seq_file *m, void *p)
8596+{
8597+ char state = '?';
8598+ struct cluster_node *node;
8599+ struct cluster_seq_info *csi = p;
8600+
8601+ /*
8602+ * If we have "0" here then display the quorum device if
8603+ * there is one.
8604+ */
8605+ if (csi->nodeid == 0)
8606+ node = quorum_device;
8607+ else
8608+ node = find_node_by_nodeid(csi->nodeid);
8609+
8610+ if (!node)
8611+ return 0;
8612+
8613+ /* Make state printable */
8614+ switch (node->state) {
8615+ case NODESTATE_MEMBER:
8616+ state = 'M';
8617+ break;
8618+ case NODESTATE_JOINING:
8619+ state = 'J';
8620+ break;
c1c6733f
AM
8621+ case NODESTATE_DEAD:
8622+ state = 'X';
8623+ break;
8624+ }
c783755a 8625+ seq_printf(m, "%4d %3d %3d %c %s\n",
c1c6733f
AM
8626+ node->node_id,
8627+ node->votes,
8628+ node->expected_votes,
8629+ state,
8630+ node->name);
8631+
8632+ return 0;
8633+}
8634+
8635+static void cluster_seq_stop(struct seq_file *m, void *p)
8636+{
8637+ kfree(p);
8638+}
8639+
8640+static struct seq_operations cluster_info_op = {
8641+ .start = cluster_seq_start,
8642+ .next = cluster_seq_next,
8643+ .stop = cluster_seq_stop,
8644+ .show = cluster_seq_show
8645+};
8646+
8647+static struct file_operations cluster_fops = {
8648+ .open = cluster_open,
8649+ .read = seq_read,
8650+ .llseek = seq_lseek,
8651+ .release = seq_release,
c783755a 8652+ .owner = THIS_MODULE,
c1c6733f
AM
8653+};
8654+
b7b72b66
AM
8655+static struct file_operations service_fops = {
8656+ .open = sm_proc_open,
8657+ .read = seq_read,
8658+ .llseek = seq_lseek,
8659+ .release = seq_release,
c783755a 8660+ .owner = THIS_MODULE,
b7b72b66
AM
8661+};
8662+
c1c6733f
AM
8663+static int cman_config_read_proc(char *page, char **start, off_t off, int count,
8664+ int *eof, void *data)
8665+{
8666+ struct config_proc_info *cinfo = data;
8667+
8668+ return snprintf(page, count, "%d\n", *cinfo->value);
8669+}
8670+
8671+static int cman_config_write_proc(struct file *file, const char *buffer,
8672+ unsigned long count, void *data)
8673+{
8674+ struct config_proc_info *cinfo = data;
8675+ int value;
8676+ char *end;
8677+
8678+ value = simple_strtoul(buffer, &end, 10);
8679+ if (*end) {
8680+ *cinfo->value = value;
8681+ }
8682+ return count;
8683+}
8684+
8685+/* Base of the config directory for cman */
8686+static struct proc_dir_entry *proc_cman_config;
8687+void create_proc_entries(void)
8688+{
8689+ struct proc_dir_entry *procentry;
8690+ struct proc_dir_entry *proc_cluster;
8691+ int i;
8692+
8693+ proc_cluster = proc_mkdir("cluster", 0);
8694+ if (!proc_cluster)
8695+ return;
8696+ proc_cluster->owner = THIS_MODULE;
8697+
8698+ /* Config dir filled in by us and others */
8699+ if (!proc_mkdir("cluster/config", 0))
8700+ return;
8701+
8702+ /* Don't much care if this fails, it's hardly vital */
8703+ procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
8704+ if (procentry)
8705+ procentry->proc_fops = &cluster_fops;
8706+
8707+ procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
8708+ if (procentry)
c783755a 8709+ procentry->get_info = proc_cluster_status;
c1c6733f 8710+
c783755a
AM
8711+ procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
8712+ if (procentry)
8713+ procentry->proc_fops = &service_fops;
c1c6733f 8714+
c783755a
AM
8715+ /* Config entries */
8716+ proc_cman_config = proc_mkdir("cluster/config/cman", 0);
8717+ if (!proc_cman_config)
8718+ return;
c1c6733f 8719+
c783755a
AM
8720+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
8721+ procentry = create_proc_entry(config_proc[i].name, 0660,
8722+ proc_cman_config);
8723+ if (procentry) {
8724+ procentry->data = &config_proc[i];
8725+ procentry->write_proc = cman_config_write_proc;
8726+ procentry->read_proc = cman_config_read_proc;
8727+ }
8728+ }
c1c6733f 8729+
c783755a
AM
8730+ procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
8731+ if (procentry)
8732+ procentry->get_info = sm_debug_info;
8733+}
c1c6733f 8734+
c783755a
AM
8735+void cleanup_proc_entries(void)
8736+{
8737+ int i, config_count;
c1c6733f 8738+
c783755a 8739+ remove_proc_entry("cluster/sm_debug", NULL);
c1c6733f 8740+
c783755a 8741+ config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
c1c6733f 8742+
c783755a
AM
8743+ if (proc_cman_config) {
8744+ for (i=0; i<config_count; i++)
8745+ remove_proc_entry(config_proc[i].name, proc_cman_config);
8746+ }
8747+ remove_proc_entry("cluster/config/cman", NULL);
8748+ remove_proc_entry("cluster/config", NULL);
c1c6733f 8749+
c783755a
AM
8750+ remove_proc_entry("cluster/nodes", NULL);
8751+ remove_proc_entry("cluster/status", NULL);
8752+ remove_proc_entry("cluster/services", NULL);
8753+ remove_proc_entry("cluster/config", NULL);
8754+ remove_proc_entry("cluster", NULL);
8755+}
bb1d8b11
AM
8756diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
8757--- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
8758+++ linux-patched/cluster/cman/sm.h 2004-11-03 11:37:37.000000000 +0800
8759@@ -0,0 +1,109 @@
8760+/******************************************************************************
8761+*******************************************************************************
8762+**
8763+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8764+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8765+**
8766+** This copyrighted material is made available to anyone wishing to use,
8767+** modify, copy, or redistribute it subject to the terms and conditions
8768+** of the GNU General Public License v.2.
8769+**
8770+*******************************************************************************
8771+******************************************************************************/
8772+
8773+#ifndef __SM_DOT_H__
8774+#define __SM_DOT_H__
8775+
8776+/*
8777+ * This is the main header file to be included in each Service Manager source
8778+ * file.
8779+ */
8780+
8781+#include <linux/list.h>
8782+#include <linux/socket.h>
8783+#include <linux/kernel.h>
8784+#include <linux/sched.h>
8785+#include <linux/file.h>
8786+#include <linux/kthread.h>
8787+#include <net/sock.h>
8788+
8789+#include <cluster/cnxman.h>
8790+#include <cluster/service.h>
8791+
8792+#define SG_LEVELS (4)
8793+
8794+#include "sm_internal.h"
8795+#include "sm_barrier.h"
8796+#include "sm_control.h"
8797+#include "sm_daemon.h"
8798+#include "sm_joinleave.h"
8799+#include "sm_membership.h"
8800+#include "sm_message.h"
8801+#include "sm_misc.h"
8802+#include "sm_recover.h"
8803+#include "sm_services.h"
8804+
8805+extern struct list_head sm_sg[SG_LEVELS];
8806+extern struct semaphore sm_sglock;
8807+
8808+#ifndef TRUE
8809+#define TRUE (1)
8810+#endif
8811+
8812+#ifndef FALSE
8813+#define FALSE (0)
8814+#endif
8815+
8816+#define SM_ASSERT(x, do) \
8817+{ \
8818+ if (!(x)) \
8819+ { \
8820+ printk("\nSM: Assertion failed on line %d of file %s\n" \
8821+ "SM: assertion: \"%s\"\n" \
8822+ "SM: time = %lu\n", \
8823+ __LINE__, __FILE__, #x, jiffies); \
8824+ {do} \
8825+ printk("\n"); \
8826+ panic("SM: Record message above and reboot.\n"); \
8827+ } \
8828+}
8829+
8830+#define SM_RETRY(do_this, until_this) \
8831+for (;;) \
8832+{ \
8833+ do { do_this; } while (0); \
8834+ if (until_this) \
8835+ break; \
8836+ printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
8837+ schedule();\
8838+}
8839+
8840+
8841+#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
8842+
8843+#define log_error(sg, fmt, args...) \
8844+ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8845+
8846+
8847+#define SM_DEBUG_LOG
8848+
8849+#ifdef SM_DEBUG_CONSOLE
8850+#define log_debug(sg, fmt, args...) \
8851+ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8852+#endif
8853+
8854+#ifdef SM_DEBUG_LOG
8855+#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args);
8856+#endif
8857+
8858+#ifdef SM_DEBUG_ALL
8859+#define log_debug(sg, fmt, args...) \
8860+do \
8861+{ \
8862+ printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
8863+ sm_debug_log(sg, fmt, ##args); \
8864+} \
8865+while (0)
8866+#endif
8867+
8868+#endif /* __SM_DOT_H__ */
c1c6733f 8869diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
bb1d8b11
AM
8870--- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
8871+++ linux-patched/cluster/cman/sm_barrier.c 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
8872@@ -0,0 +1,232 @@
8873+/******************************************************************************
8874+*******************************************************************************
8875+**
8876+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8877+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8878+**
8879+** This copyrighted material is made available to anyone wishing to use,
8880+** modify, copy, or redistribute it subject to the terms and conditions
8881+** of the GNU General Public License v.2.
8882+**
8883+*******************************************************************************
8884+******************************************************************************/
8885+
8886+#include "sm.h"
8887+
8888+static struct list_head barriers;
8889+static spinlock_t barriers_lock;
8890+
8891+struct bc_entry {
8892+ struct list_head list;
8893+ uint32_t gid;
8894+ int status;
8895+ char type;
8896+};
8897+typedef struct bc_entry bc_entry_t;
8898+
8899+void init_barriers(void)
8900+{
8901+ INIT_LIST_HEAD(&barriers);
8902+ spin_lock_init(&barriers_lock);
8903+}
8904+
8905+static int atoi(char *c)
8906+{
8907+ int x = 0;
8908+
8909+ while ('0' <= *c && *c <= '9') {
8910+ x = x * 10 + (*c - '0');
8911+ c++;
8912+ }
8913+ return x;
8914+}
8915+
8916+static void add_barrier_callback(char *name, int status, int type)
8917+{
8918+ char *p;
8919+ uint32_t gid;
8920+ bc_entry_t *be;
8921+
8922+ /* an ESRCH callback just means there was a cnxman transition */
8923+ if (status == -ESRCH)
8924+ return;
8925+
8926+ /* extract global id of SG from barrier name */
8927+ p = strstr(name, "sm.");
8928+
8929+ SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
8930+
8931+ p += strlen("sm.");
8932+ gid = atoi(p);
8933+
8934+ SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
8935+
8936+ be->gid = gid;
8937+ be->status = status;
8938+ be->type = type;
8939+
8940+ spin_lock(&barriers_lock);
8941+ list_add_tail(&be->list, &barriers);
8942+ spin_unlock(&barriers_lock);
8943+
8944+ wake_serviced(DO_BARRIERS);
8945+}
8946+
8947+static void callback_recovery_barrier(char *name, int status)
8948+{
8949+ add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
8950+}
8951+
8952+static void callback_startdone_barrier_new(char *name, int status)
8953+{
8954+ add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
8955+}
8956+
8957+static void callback_startdone_barrier(char *name, int status)
8958+{
8959+ add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
8960+}
8961+
8962+int sm_barrier(char *name, int count, int type)
8963+{
8964+ int error;
8965+ unsigned long fn = 0;
8966+
8967+ switch (type) {
8968+ case SM_BARRIER_STARTDONE:
8969+ fn = (unsigned long) callback_startdone_barrier;
8970+ break;
8971+ case SM_BARRIER_STARTDONE_NEW:
8972+ fn = (unsigned long) callback_startdone_barrier_new;
8973+ break;
8974+ case SM_BARRIER_RECOVERY:
8975+ fn = (unsigned long) callback_recovery_barrier;
8976+ break;
8977+ }
8978+
8979+ error = kcl_barrier_register(name, 0, count);
8980+ if (error) {
8981+ log_print("barrier register error %d", error);
8982+ goto fail;
8983+ }
8984+
8985+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
8986+ if (error) {
8987+ log_print("barrier setattr autodel error %d", error);
8988+ goto fail_bar;
8989+ }
8990+
8991+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
8992+ if (error) {
8993+ log_print("barrier setattr cb error %d", error);
8994+ goto fail_bar;
8995+ }
8996+
8997+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
8998+ if (error) {
8999+ log_print("barrier setattr enabled error %d", error);
9000+ goto fail_bar;
9001+ }
9002+
9003+ return 0;
9004+
9005+ fail_bar:
9006+ kcl_barrier_delete(name);
9007+ fail:
9008+ return error;
9009+}
9010+
9011+void process_startdone_barrier_new(sm_group_t *sg, int status)
9012+{
9013+ sm_sevent_t *sev = sg->sevent;
9014+
9015+ if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
9016+ log_debug(sev->se_sg, "ignore barrier cb status %d", status);
9017+ return;
9018+ }
9019+
9020+ sev->se_barrier_status = status;
9021+ sev->se_state = SEST_BARRIER_DONE;
9022+ set_bit(SEFL_CHECK, &sev->se_flags);
9023+ wake_serviced(DO_JOINLEAVE);
9024+}
9025+
9026+void process_startdone_barrier(sm_group_t *sg, int status)
9027+{
9028+ sm_uevent_t *uev = &sg->uevent;
9029+
9030+ if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
9031+ log_debug(sg, "ignore barrier cb status %d", status);
9032+ return;
9033+ }
9034+
9035+ uev->ue_barrier_status = status;
9036+ uev->ue_state = UEST_BARRIER_DONE;
9037+ set_bit(UEFL_CHECK, &uev->ue_flags);
9038+ wake_serviced(DO_MEMBERSHIP);
9039+}
9040+
9041+void process_recovery_barrier(sm_group_t *sg, int status)
9042+{
9043+ if (status) {
9044+ log_error(sg, "process_recovery_barrier status=%d", status);
9045+ return;
9046+ }
9047+
9048+ if (sg->state != SGST_RECOVER ||
9049+ sg->recover_state != RECOVER_BARRIERWAIT) {
9050+ log_error(sg, "process_recovery_barrier state %d recover %d",
9051+ sg->state, sg->recover_state);
9052+ return;
9053+ }
9054+
9055+ if (!sg->recover_stop)
9056+ sg->recover_state = RECOVER_STOP;
9057+ else
9058+ sg->recover_state = RECOVER_BARRIERDONE;
9059+
9060+ wake_serviced(DO_RECOVERIES);
9061+}
9062+
9063+void process_barriers(void)
9064+{
9065+ sm_group_t *sg;
9066+ bc_entry_t *be;
9067+
9068+ while (1) {
9069+ be = NULL;
9070+
9071+ spin_lock(&barriers_lock);
9072+ if (!list_empty(&barriers)) {
9073+ be = list_entry(barriers.next, bc_entry_t, list);
9074+ list_del(&be->list);
9075+ }
9076+ spin_unlock(&barriers_lock);
9077+
9078+ if (!be)
9079+ break;
9080+
9081+ sg = sm_global_id_to_sg(be->gid);
9082+ if (!sg) {
9083+ log_print("process_barriers: no sg %08x", be->gid);
9084+ break;
9085+ }
9086+
9087+ switch (be->type) {
9088+ case SM_BARRIER_STARTDONE_NEW:
9089+ process_startdone_barrier_new(sg, be->status);
9090+ break;
9091+
9092+ case SM_BARRIER_STARTDONE:
9093+ process_startdone_barrier(sg, be->status);
9094+ break;
9095+
9096+ case SM_BARRIER_RECOVERY:
9097+ process_recovery_barrier(sg, be->status);
9098+ break;
9099+ }
9100+
9101+ kfree(be);
9102+ schedule();
9103+ }
9104+}
9105diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
bb1d8b11
AM
9106--- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
9107+++ linux-patched/cluster/cman/sm_barrier.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
9108@@ -0,0 +1,29 @@
9109+/******************************************************************************
9110+*******************************************************************************
9111+**
9112+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9113+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9114+**
9115+** This copyrighted material is made available to anyone wishing to use,
9116+** modify, copy, or redistribute it subject to the terms and conditions
9117+** of the GNU General Public License v.2.
9118+**
9119+*******************************************************************************
9120+******************************************************************************/
9121+
9122+#ifndef __SM_BARRIER_DOT_H__
9123+#define __SM_BARRIER_DOT_H__
9124+
9125+#define SM_BARRIER_STARTDONE (0)
9126+#define SM_BARRIER_STARTDONE_NEW (1)
9127+#define SM_BARRIER_RECOVERY (2)
9128+#define SM_BARRIER_RESET (3)
9129+
9130+void init_barriers(void);
9131+void process_barriers(void);
9132+int sm_barrier(char *name, int count, int type);
9133+void process_startdone_barrier(sm_group_t *sg, int status);
9134+void process_startdone_barrier_new(sm_group_t *sg, int status);
9135+void process_recovery_barrier(sm_group_t *sg, int status);
9136+
9137+#endif
9138diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
bb1d8b11
AM
9139--- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
9140+++ linux-patched/cluster/cman/sm_control.c 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
9141@@ -0,0 +1,156 @@
9142+/******************************************************************************
9143+*******************************************************************************
9144+**
9145+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9146+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9147+**
9148+** This copyrighted material is made available to anyone wishing to use,
9149+** modify, copy, or redistribute it subject to the terms and conditions
9150+** of the GNU General Public License v.2.
9151+**
9152+*******************************************************************************
9153+******************************************************************************/
9154+
9155+#include "sm.h"
9156+#include "config.h"
9157+
9158+struct socket * sm_socket;
9159+uint32_t * sm_new_nodeids;
9160+uint32_t sm_our_nodeid;
9161+int sm_quorum, sm_quorum_next;
9162+struct list_head sm_members;
9163+int sm_member_count;
9164+
9165+
9166+/*
9167+ * Context: cnxman
9168+ * Called by cnxman when it has a new member list.
9169+ */
9170+
9171+void sm_member_update(int quorate)
9172+{
9173+ sm_quorum_next = quorate;
9174+ wake_serviced(DO_START_RECOVERY);
9175+}
9176+
9177+/*
9178+ * Context: cnxman
9179+ * Called when module is loaded.
9180+ */
9181+
9182+void sm_init(void)
9183+{
9184+ sm_socket = NULL;
9185+ sm_new_nodeids = NULL;
9186+ sm_quorum = 0;
9187+ sm_quorum_next = 0;
9188+ sm_our_nodeid = 0;
9189+ INIT_LIST_HEAD(&sm_members);
9190+ sm_member_count = 0;
9191+
9192+ init_services();
9193+ init_messages();
9194+ init_barriers();
9195+ init_serviced();
9196+ init_recovery();
9197+ init_joinleave();
9198+ init_sm_misc();
9199+}
9200+
9201+/*
9202+ * Context: cnxman
9203+ * Called at beginning of cluster join procedure.
9204+ */
9205+
9206+void sm_start(void)
9207+{
9208+ struct sockaddr_cl saddr;
9209+ struct socket *sock;
9210+ int result;
9211+
9212+ /* Create a communication channel among service managers */
9213+
9214+ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
9215+ if (result < 0) {
9216+ log_print("can't create socket %d", result);
9217+ goto fail;
9218+ }
9219+
9220+ sm_socket = sock;
9221+
9222+ saddr.scl_family = AF_CLUSTER;
9223+ saddr.scl_port = CLUSTER_PORT_SERVICES;
9224+
9225+ result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
9226+ sizeof(saddr));
9227+ if (result < 0) {
9228+ log_print("can't bind socket %d", result);
9229+ goto fail_release;
9230+ }
9231+
9232+ result = kcl_register_read_callback(sm_socket, sm_cluster_message);
9233+ if (result < 0) {
9234+ log_print("can't register read callback %d", result);
9235+ goto fail_release;
9236+ }
9237+
9238+ sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
9239+ sizeof(uint32_t),
9240+ GFP_KERNEL);
9241+ start_serviced();
9242+
9243+ /* cnxman should call sm_member_update() once we've joined - then we
9244+ * can get our first list of members and our own nodeid */
9245+
9246+ return;
9247+
9248+ fail_release:
9249+ sock_release(sm_socket);
9250+ sm_socket = NULL;
9251+
9252+ fail:
9253+ return;
9254+}
9255+
9256+/*
9257+ * Context: cnxman
9258+ * Called before cnxman leaves the cluster. If this returns an error to cman,
9259+ * cman should not leave the cluster but return EBUSY.
9260+ * If force is set we go away anyway. cman knows best in this case
9261+ */
9262+
9263+int sm_stop(int force)
9264+{
9265+ struct list_head *head;
9266+ sm_group_t *sg;
9267+ sm_node_t *node;
9268+ int i, busy = FALSE, error = -EBUSY;
9269+
9270+ for (i = 0; i < SG_LEVELS; i++) {
9271+ if (!list_empty(&sm_sg[i])) {
9272+ sg = list_entry(sm_sg[i].next, sm_group_t, list);
9273+ log_error(sg, "sm_stop: SG still joined");
9274+ busy = TRUE;
9275+ }
9276+ }
9277+
9278+ if (!busy || force) {
9279+ stop_serviced();
9280+
9281+ if (sm_socket)
9282+ sock_release(sm_socket);
9283+
9284+ head = &sm_members;
9285+ while (!list_empty(head)) {
9286+ node = list_entry(head->next, sm_node_t, list);
9287+ list_del(&node->list);
9288+ sm_member_count--;
9289+ kfree(node);
9290+ }
9291+
9292+ kfree(sm_new_nodeids);
9293+ sm_init();
9294+ error = 0;
9295+ }
9296+ return error;
9297+}
9298diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
bb1d8b11
AM
9299--- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
9300+++ linux-patched/cluster/cman/sm_control.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
9301@@ -0,0 +1,22 @@
9302+/******************************************************************************
9303+*******************************************************************************
9304+**
9305+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9306+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9307+**
9308+** This copyrighted material is made available to anyone wishing to use,
9309+** modify, copy, or redistribute it subject to the terms and conditions
9310+** of the GNU General Public License v.2.
9311+**
9312+*******************************************************************************
9313+******************************************************************************/
9314+
9315+#ifndef __SM_CONTROL_DOT_H__
9316+#define __SM_CONTROL_DOT_H__
9317+
9318+void sm_init(void);
9319+void sm_start(void);
9320+int sm_stop(int force);
9321+void sm_member_update(int quorate);
9322+
9323+#endif
9324diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
bb1d8b11
AM
9325--- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
9326+++ linux-patched/cluster/cman/sm_daemon.c 2004-11-03 11:37:37.000000000 +0800
c783755a 9327@@ -0,0 +1,100 @@
c1c6733f
AM
9328+/******************************************************************************
9329+*******************************************************************************
9330+**
9331+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9332+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9333+**
9334+** This copyrighted material is made available to anyone wishing to use,
9335+** modify, copy, or redistribute it subject to the terms and conditions
9336+** of the GNU General Public License v.2.
9337+**
9338+*******************************************************************************
9339+******************************************************************************/
9340+
9341+#include "sm.h"
9342+
9343+static unsigned long daemon_flags;
9344+static struct task_struct * daemon_task;
c1c6733f
AM
9345+extern int sm_quorum;
9346+
9347+void init_serviced(void)
9348+{
9349+ daemon_flags = 0;
9350+ daemon_task = NULL;
c1c6733f
AM
9351+}
9352+
9353+void wake_serviced(int do_flag)
9354+{
9355+ set_bit(do_flag, &daemon_flags);
c783755a 9356+ wake_up_process(daemon_task);
c1c6733f
AM
9357+}
9358+
9359+static inline int got_work(void)
9360+{
9361+ int rv = 0;
9362+
9363+ rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
9364+ test_bit(DO_MESSAGES, &daemon_flags) ||
9365+ test_bit(DO_BARRIERS, &daemon_flags) ||
9366+ test_bit(DO_CALLBACKS, &daemon_flags));
9367+
9368+ if (sm_quorum && !rv)
9369+ rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
9370+ test_bit(DO_RECOVERIES, &daemon_flags) ||
9371+ test_bit(DO_MEMBERSHIP, &daemon_flags));
9372+ return rv;
9373+}
9374+
9375+static int serviced(void *arg)
9376+{
c783755a 9377+ while (!kthread_should_stop()) {
c1c6733f
AM
9378+ if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
9379+ process_nodechange();
9380+
9381+ if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
9382+ process_messages();
9383+
9384+ if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
9385+ process_barriers();
9386+
9387+ if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
9388+ process_callbacks();
9389+
9390+ if (sm_quorum) {
9391+ if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
9392+ process_recoveries();
9393+
9394+ if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
9395+ process_joinleave();
9396+
9397+ if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
9398+ process_membership();
9399+ }
9400+
bb1d8b11
AM
9401+ set_current_state(TASK_INTERRUPTIBLE);
9402+ if (!got_work())
9403+ schedule();
9404+ set_current_state(TASK_RUNNING);
9405+ }
c783755a 9406+
bb1d8b11 9407+ return 0;
c783755a
AM
9408+}
9409+
bb1d8b11
AM
9410+int start_serviced(void)
9411+{
9412+ struct task_struct *p;
c783755a 9413+
d3b4771f 9414+ p = kthread_run(serviced, NULL, 0, "cman_serviced");
bb1d8b11
AM
9415+ if (IS_ERR(p)) {
9416+ printk("can't start cman_serviced daemon");
9417+ return (IS_ERR(p));
9418+ }
c783755a 9419+
bb1d8b11
AM
9420+ daemon_task = p;
9421+ return 0;
9422+}
c783755a 9423+
bb1d8b11
AM
9424+void stop_serviced(void)
9425+{
9426+ kthread_stop(daemon_task);
9427+}
9428diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
9429--- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
9430+++ linux-patched/cluster/cman/sm_daemon.h 2004-11-03 11:37:37.000000000 +0800
9431@@ -0,0 +1,32 @@
9432+/******************************************************************************
9433+*******************************************************************************
9434+**
9435+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9436+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9437+**
9438+** This copyrighted material is made available to anyone wishing to use,
9439+** modify, copy, or redistribute it subject to the terms and conditions
9440+** of the GNU General Public License v.2.
9441+**
9442+*******************************************************************************
9443+******************************************************************************/
c783755a 9444+
bb1d8b11
AM
9445+#ifndef __SM_DAEMON_DOT_H__
9446+#define __SM_DAEMON_DOT_H__
c783755a 9447+
bb1d8b11
AM
9448+#define DO_RUN (0)
9449+#define DO_START_RECOVERY (1)
9450+#define DO_MESSAGES (2)
9451+#define DO_BARRIERS (3)
9452+#define DO_CALLBACKS (4)
9453+#define DO_JOINLEAVE (5)
9454+#define DO_RECOVERIES (6)
9455+#define DO_MEMBERSHIP (7)
9456+#define DO_RESET (8)
c783755a 9457+
bb1d8b11
AM
9458+void init_serviced(void);
9459+void wake_serviced(int do_flag);
9460+void stop_serviced(void);
9461+int start_serviced(void);
c783755a 9462+
c783755a 9463+#endif
c1c6733f 9464diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
bb1d8b11
AM
9465--- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
9466+++ linux-patched/cluster/cman/sm_internal.h 2004-11-03 11:37:37.000000000 +0800
b7b72b66 9467@@ -0,0 +1,231 @@
c1c6733f
AM
9468+/******************************************************************************
9469+*******************************************************************************
9470+**
9471+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9472+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9473+**
9474+** This copyrighted material is made available to anyone wishing to use,
9475+** modify, copy, or redistribute it subject to the terms and conditions
9476+** of the GNU General Public License v.2.
9477+**
9478+*******************************************************************************
9479+******************************************************************************/
9480+
9481+#ifndef __SM_INTERNAL_DOT_H__
9482+#define __SM_INTERNAL_DOT_H__
9483+
9484+/*
9485+ * Any header files needed by this file should be included before it in sm.h.
9486+ * This file should only be included by sm.h.
9487+ */
9488+
9489+struct sm_group;
9490+struct sm_sevent;
9491+struct sm_uevent;
9492+struct sm_node;
9493+struct sm_msg;
9494+
9495+typedef struct sm_group sm_group_t;
9496+typedef struct sm_sevent sm_sevent_t;
9497+typedef struct sm_uevent sm_uevent_t;
9498+typedef struct sm_node sm_node_t;
9499+typedef struct sm_msg sm_msg_t;
9500+
9501+
9502+/*
9503+ * Number of seconds to wait before trying again to join or leave an SG
9504+ */
9505+#define RETRY_DELAY (2)
9506+
9507+
9508+/*
9509+ * Service Event - what a node uses to join or leave an sg
9510+ */
9511+
9512+/* SE Flags */
9513+#define SEFL_CHECK (0)
9514+#define SEFL_ALLOW_JOIN (1)
9515+#define SEFL_ALLOW_JSTOP (2)
9516+#define SEFL_ALLOW_LEAVE (3)
9517+#define SEFL_ALLOW_LSTOP (4)
9518+#define SEFL_ALLOW_STARTDONE (5)
9519+#define SEFL_ALLOW_BARRIER (6)
9520+#define SEFL_DELAY (7)
b7b72b66
AM
9521+#define SEFL_DELAY_RECOVERY (8)
9522+#define SEFL_LEAVE (9)
9523+#define SEFL_CANCEL (10)
c1c6733f
AM
9524+
9525+/* SE States */
9526+#define SEST_JOIN_BEGIN (1)
9527+#define SEST_JOIN_ACKWAIT (2)
9528+#define SEST_JOIN_ACKED (3)
9529+#define SEST_JSTOP_ACKWAIT (4)
9530+#define SEST_JSTOP_ACKED (5)
9531+#define SEST_JSTART_SERVICEWAIT (6)
9532+#define SEST_JSTART_SERVICEDONE (7)
9533+#define SEST_BARRIER_WAIT (8)
9534+#define SEST_BARRIER_DONE (9)
9535+#define SEST_LEAVE_BEGIN (10)
9536+#define SEST_LEAVE_ACKWAIT (11)
9537+#define SEST_LEAVE_ACKED (12)
9538+#define SEST_LSTOP_ACKWAIT (13)
9539+#define SEST_LSTOP_ACKED (14)
9540+#define SEST_LSTART_WAITREMOTE (15)
9541+#define SEST_LSTART_REMOTEDONE (16)
9542+
9543+struct sm_sevent {
9544+ struct list_head se_list;
9545+ unsigned int se_id;
9546+ sm_group_t * se_sg;
9547+ unsigned long se_flags;
9548+ unsigned int se_state;
9549+
9550+ int se_node_count;
9551+ int se_memb_count;
9552+ int se_reply_count;
9553+
9554+ uint32_t * se_node_ids;
9555+ char * se_node_status;
9556+ int se_len_ids; /* length of node_ids */
9557+ int se_len_status; /* length of node_status */
9558+
9559+ int se_barrier_status;
9560+ struct timer_list se_restart_timer;
9561+};
9562+
9563+/*
9564+ * Update Event - what an sg member uses to respond to an sevent
9565+ */
9566+
9567+/* UE Flags */
9568+#define UEFL_ALLOW_STARTDONE (0)
9569+#define UEFL_ALLOW_BARRIER (1)
9570+#define UEFL_CANCEL (2)
9571+#define UEFL_LEAVE (3)
9572+#define UEFL_CHECK (4)
9573+
9574+/* UE States */
9575+#define UEST_JSTOP (1)
9576+#define UEST_JSTART_WAITCMD (2)
9577+#define UEST_JSTART (3)
9578+#define UEST_JSTART_SERVICEWAIT (4)
9579+#define UEST_JSTART_SERVICEDONE (5)
9580+#define UEST_BARRIER_WAIT (6)
9581+#define UEST_BARRIER_DONE (7)
9582+#define UEST_LSTOP (8)
9583+#define UEST_LSTART_WAITCMD (9)
9584+#define UEST_LSTART (10)
9585+#define UEST_LSTART_SERVICEWAIT (11)
9586+#define UEST_LSTART_SERVICEDONE (12)
9587+
9588+struct sm_uevent {
9589+ unsigned int ue_state;
9590+ unsigned long ue_flags;
9591+ uint32_t ue_id;
9592+ uint32_t ue_nodeid;
9593+ int ue_num_nodes;
9594+ int ue_barrier_status;
9595+ uint16_t ue_remote_seid;
9596+};
9597+
9598+/*
9599+ * Service Group
9600+ */
9601+
9602+#define RECOVER_NONE (0)
9603+#define RECOVER_STOP (1)
9604+#define RECOVER_START (2)
9605+#define RECOVER_STARTDONE (3)
9606+#define RECOVER_BARRIERWAIT (4)
9607+#define RECOVER_BARRIERDONE (5)
9608+
9609+/* SG Flags */
9610+#define SGFL_SEVENT (1)
9611+#define SGFL_UEVENT (2)
9612+#define SGFL_NEED_RECOVERY (3)
9613+
9614+/* SG States */
9615+#define SGST_NONE (0)
9616+#define SGST_JOIN (1)
9617+#define SGST_RUN (2)
9618+#define SGST_RECOVER (3)
9619+#define SGST_UEVENT (4)
9620+
9621+struct sm_group {
9622+ struct list_head list; /* list of sg's */
9623+ uint16_t level;
9624+ uint32_t local_id;
9625+ uint32_t global_id;
9626+ unsigned long flags;
9627+ int state;
9628+ int refcount; /* references from reg/unreg */
9629+ void * service_data; /* data from the service */
9630+ struct kcl_service_ops *ops; /* ops from the service */
9631+ struct completion event_comp;
9632+
9633+ struct list_head memb; /* Membership List for RC */
9634+ int memb_count; /* number of nodes in memb */
9635+ struct list_head joining; /* nodes joining the sg */
9636+ sm_sevent_t * sevent;
9637+ sm_uevent_t uevent;
9638+
9639+ int recover_state;
9640+ int recover_stop;
9641+ struct list_head recover_list; /* recovery event list */
9642+ void * recover_data;
9643+ char recover_barrier[MAX_BARRIER_NAME_LEN];
9644+
9645+ int namelen;
9646+ char name[1]; /* must be last field */
9647+};
9648+
9649+/*
9650+ * Service Message
9651+ */
9652+
9653+/* SMSG Type */
9654+#define SMSG_JOIN_REQ (1)
9655+#define SMSG_JOIN_REP (2)
9656+#define SMSG_JSTOP_REQ (3)
9657+#define SMSG_JSTOP_REP (4)
9658+#define SMSG_JSTART_CMD (5)
9659+#define SMSG_LEAVE_REQ (6)
9660+#define SMSG_LEAVE_REP (7)
9661+#define SMSG_LSTOP_REQ (8)
9662+#define SMSG_LSTOP_REP (9)
9663+#define SMSG_LSTART_CMD (10)
9664+#define SMSG_LSTART_DONE (11)
9665+#define SMSG_RECOVER (12)
9666+
9667+/* SMSG Status */
9668+#define STATUS_POS (1)
9669+#define STATUS_NEG (2)
9670+#define STATUS_WAIT (3)
9671+
9672+struct sm_msg {
9673+ uint8_t ms_type;
9674+ uint8_t ms_status;
9675+ uint16_t ms_sevent_id;
9676+ uint32_t ms_global_sgid;
9677+ uint32_t ms_global_lastid;
9678+ uint16_t ms_sglevel;
9679+ uint16_t ms_length;
9680+ /* buf of ms_length bytes follows */
9681+};
9682+
9683+/*
9684+ * Node structure
9685+ */
9686+
9687+#define SNFL_NEED_RECOVERY (0)
9688+#define SNFL_CLUSTER_MEMBER (1)
9689+#define SNFL_LEAVING (2)
9690+
9691+struct sm_node {
9692+ struct list_head list;
9693+ uint32_t id; /* node id from cnxman */
9694+ unsigned long flags;
9695+ int incarnation; /* node incarnation number */
9696+};
9697+
9698+#endif /* __SM_INTERNAL_DOT_H__ */
9699diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
bb1d8b11
AM
9700--- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
9701+++ linux-patched/cluster/cman/sm_joinleave.c 2004-11-03 11:37:37.000000000 +0800
b7b72b66 9702@@ -0,0 +1,1291 @@
c1c6733f
AM
9703+/******************************************************************************
9704+*******************************************************************************
9705+**
9706+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9707+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9708+**
9709+** This copyrighted material is made available to anyone wishing to use,
9710+** modify, copy, or redistribute it subject to the terms and conditions
9711+** of the GNU General Public License v.2.
9712+**
9713+*******************************************************************************
9714+******************************************************************************/
9715+
9716+#include "sm.h"
9717+
9718+/*
9719+ * Routines used by nodes that are joining or leaving a SG. These "sevent"
9720+ * routines initiate membership changes to a SG. Existing SG members respond
9721+ * using the "uevent" membership update routines.
9722+ */
9723+
9724+extern uint32_t sm_our_nodeid;
9725+extern struct list_head sm_members;
9726+static struct list_head new_event;
9727+static spinlock_t new_event_lock;
9728+static struct list_head joinleave_events;
9729+
9730+void init_joinleave(void)
9731+{
9732+ INIT_LIST_HEAD(&new_event);
9733+ spin_lock_init(&new_event_lock);
9734+ INIT_LIST_HEAD(&joinleave_events);
9735+}
9736+
9737+void new_joinleave(sm_sevent_t *sev)
9738+{
9739+ spin_lock(&new_event_lock);
9740+ list_add_tail(&sev->se_list, &new_event);
9741+ spin_unlock(&new_event_lock);
9742+ wake_serviced(DO_JOINLEAVE);
9743+}
9744+
9745+sm_sevent_t *find_sevent(unsigned int id)
9746+{
9747+ sm_sevent_t *sev;
9748+
9749+ list_for_each_entry(sev, &joinleave_events, se_list) {
9750+ if (sev->se_id == id)
9751+ return sev;
9752+ }
9753+ return NULL;
9754+}
9755+
9756+static void release_sevent(sm_sevent_t *sev)
9757+{
9758+ if (sev->se_len_ids) {
9759+ kfree(sev->se_node_ids);
9760+ sev->se_node_ids = NULL;
9761+ }
9762+
9763+ if (sev->se_len_status) {
9764+ kfree(sev->se_node_status);
9765+ sev->se_node_status = NULL;
9766+ }
9767+
9768+ sev->se_node_count = 0;
9769+ sev->se_memb_count = 0;
9770+ sev->se_reply_count = 0;
9771+}
9772+
9773+static int init_sevent(sm_sevent_t *sev)
9774+{
9775+ sm_node_t *node;
9776+ int len1, len2, count, cluster_members = 0;
9777+
9778+ /* clear state from any previous attempt */
9779+ release_sevent(sev);
9780+
9781+ list_for_each_entry(node, &sm_members, list) {
9782+ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9783+ cluster_members++;
9784+ }
9785+
9786+ sev->se_node_count = cluster_members;
9787+ sev->se_memb_count = sev->se_sg->memb_count;
9788+
9789+ /*
9790+ * When joining, we need a node array the size of the entire cluster
9791+ * member list because we get responses from all nodes. When leaving,
9792+ * we only get responses from SG members, so the node array need only
9793+ * be that large.
9794+ */
9795+
9796+ if (sev->se_state < SEST_LEAVE_BEGIN)
9797+ count = sev->se_node_count;
9798+ else
9799+ count = sev->se_memb_count;
9800+
9801+ len1 = count * sizeof(uint32_t);
9802+ sev->se_len_ids = len1;
9803+
9804+ sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
9805+ if (!sev->se_node_ids)
9806+ goto fail;
9807+
9808+ len2 = count * sizeof (char);
9809+ sev->se_len_status = len2;
9810+
9811+ sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
9812+ if (!sev->se_node_status)
9813+ goto fail_free;
9814+
9815+ memset(sev->se_node_status, 0, len2);
9816+ memset(sev->se_node_ids, 0, len1);
9817+
9818+ return 0;
9819+
9820+ fail_free:
9821+ kfree(sev->se_node_ids);
9822+ sev->se_node_ids = NULL;
9823+ sev->se_len_ids = 0;
9824+
9825+ fail:
9826+ return -ENOMEM;
9827+}
9828+
9829+/* Context: timer */
9830+
9831+static void sev_restart(unsigned long data)
9832+{
9833+ sm_sevent_t *sev = (sm_sevent_t *) data;
9834+
9835+ clear_bit(SEFL_DELAY, &sev->se_flags);
9836+ set_bit(SEFL_CHECK, &sev->se_flags);
9837+ wake_serviced(DO_JOINLEAVE);
9838+}
9839+
9840+static void schedule_sev_restart(sm_sevent_t *sev)
9841+{
9842+ init_timer(&sev->se_restart_timer);
9843+ sev->se_restart_timer.function = sev_restart;
9844+ sev->se_restart_timer.data = (long) sev;
9845+ mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
9846+}
9847+
9848+void free_sg_memb(sm_group_t *sg)
9849+{
9850+ sm_node_t *node;
9851+
9852+ while (!list_empty(&sg->memb)) {
9853+ node = list_entry(sg->memb.next, sm_node_t, list);
9854+ list_del(&node->list);
9855+ kfree(node);
9856+ }
9857+ sg->memb_count = 0;
9858+}
9859+
9860+/*
9861+ * 1. First step in joining a SG - send a message to all nodes in the cluster
9862+ * asking to join the named SG. If any nodes are members they will reply with
9863+ * a POS, or a WAIT (wait means try again, only one node can join at a time).
9864+ * If no one knows about this SG, they all send NEG replies which means we form
9865+ * the SG with just ourself as a member.
9866+ */
9867+
9868+static int send_join_notice(sm_sevent_t *sev)
9869+{
9870+ sm_group_t *sg = sev->se_sg;
9871+ sm_node_t *node;
9872+ char *msg;
9873+ int i = 0, error, namelen, len = 0;
9874+
9875+ /*
9876+ * Create node array from member list in which to collect responses.
9877+ */
9878+
9879+ error = init_sevent(sev);
9880+ if (error)
9881+ goto out;
9882+
9883+ list_for_each_entry(node, &sm_members, list) {
9884+ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9885+ sev->se_node_ids[i++] = node->id;
9886+ }
9887+
9888+ /*
9889+ * Create and send a join request message.
9890+ *
9891+ * Other nodes then run process_join_request and reply to us; we
9892+ * collect the responses in process_reply and check them in
9893+ * check_join_notice.
9894+ */
9895+
9896+ namelen = sg->namelen;
9897+ msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
9898+ memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
9899+
9900+ error = send_broadcast_message_sev(msg, len, sev);
9901+
9902+ out:
9903+ return error;
9904+}
9905+
9906+/*
9907+ * 2. Second step in joining a SG - after we collect all replies to our join
9908+ * request, we look at them. If anyone told us to wait, we'll wait a while, go
9909+ * back and start at step 1 again.
9910+ */
9911+
9912+static int check_join_notice(sm_sevent_t *sev)
9913+{
9914+ int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
9915+
9916+ for (i = 0; i < sev->se_node_count; i++) {
9917+ switch (sev->se_node_status[i]) {
9918+ case STATUS_POS:
9919+ /* this node is in the SG and will be in new proposed
9920+ * memb list */
9921+ pos++;
9922+ break;
9923+
9924+ case STATUS_WAIT:
9925+ /* this node is in the SG but something else is
9926+ * happening with it at the moment. */
9927+ wait++;
9928+ break;
9929+
9930+ case STATUS_NEG:
9931+ /* this node has no record of the SG we're interested
9932+ * in */
9933+ neg++;
9934+
9935+ if (sev->se_node_ids[i] == sm_our_nodeid)
9936+ sev->se_node_status[i] = STATUS_POS;
9937+ break;
9938+
9939+ default:
9940+ /* we didn't get a valid response from this node,
9941+ * restart the entire sev. */
9942+ restart++;
9943+ break;
9944+ }
9945+ }
9946+
9947+ if (pos && !wait && !restart) {
9948+ /* all current members of this sg pos'ed our entry */
9949+ } else if (!pos && !wait && !restart && neg) {
9950+ /* we're the first in the cluster to join this sg */
9951+ sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
9952+ } else
9953+ error = -1;
9954+
9955+ return error;
9956+}
9957+
9958+/*
9959+ * 3. Third step in joining the SG - tell the nodes that are already members
9960+ * to "stop" the service. We stop them so that everyone can restart with the
9961+ * new member (us!) added.
9962+ */
9963+
9964+static int send_join_stop(sm_sevent_t *sev)
9965+{
9966+ sm_group_t *sg = sev->se_sg;
9967+ sm_node_t *node;
9968+ char *msg;
9969+ uint32_t be_count;
9970+ int i, len = 0, error = 0;
9971+
9972+ /*
9973+ * Form the SG memb list with us in it.
9974+ */
9975+
9976+ for (i = 0; i < sev->se_node_count; i++) {
9977+ if (sev->se_node_status[i] != STATUS_POS)
9978+ continue;
9979+
9980+ node = sm_new_node(sev->se_node_ids[i]);
9981+ if (!node)
9982+ goto fail;
9983+
9984+ list_add_tail(&node->list, &sg->memb);
9985+ sg->memb_count++;
9986+ }
9987+
9988+ /*
9989+ * Re-init the node vector in which to collect responses again.
9990+ */
9991+
9992+ sev->se_memb_count = sg->memb_count;
9993+
9994+ memset(sev->se_node_status, 0, sev->se_len_status);
9995+ memset(sev->se_node_ids, 0, sev->se_len_ids);
9996+ i = 0;
9997+
9998+ list_for_each_entry(node, &sg->memb, list)
9999+ sev->se_node_ids[i++] = node->id;
10000+
10001+ /*
10002+ * Create and send a stop message.
10003+ *
10004+ * Other nodes then run process_stop_request and process_join_stop and
10005+ * reply to us. They stop the sg we're trying to join if they agree.
10006+ * We collect responses in process_reply and check them in
10007+ * check_join_stop.
10008+ */
10009+
10010+ msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
10011+ be_count = cpu_to_be32(sg->memb_count);
10012+ memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
10013+
10014+ error = send_members_message_sev(sg, msg, len, sev);
10015+ if (error < 0)
10016+ goto fail;
10017+
10018+ return 0;
10019+
10020+ fail:
10021+ free_sg_memb(sg);
10022+ return error;
10023+}
10024+
10025+/*
10026+ * 4. Fourth step in joining the SG - after we collect replies to our stop
10027+ * request, we look at them. Everyone sending POS agrees with us joining and
10028+ * has stopped their SG. If some nodes sent NEG, something is wrong and we
10029+ * don't have a good way to address that yet since some nodes may have sent
10030+ * POS.
10031+ *
10032+ * FIXME: even nodes replying with NEG should stop their SG so we can send an
10033+ * abort and have everyone at the same place to start from again.
10034+ */
10035+
10036+static int check_join_stop(sm_sevent_t *sev)
10037+{
10038+ sm_group_t *sg = sev->se_sg;
10039+ int i, pos = 0, neg = 0;
10040+
10041+ for (i = 0; i < sev->se_memb_count; i++) {
10042+ switch (sev->se_node_status[i]) {
10043+ case STATUS_POS:
10044+ pos++;
10045+ break;
10046+
10047+ case STATUS_NEG:
10048+ log_error(sg, "check_join_stop: neg from nodeid %u "
10049+ "(%d, %d, %u)", sev->se_node_ids[i],
10050+ pos, neg, sev->se_memb_count);
10051+ neg++;
10052+ break;
10053+
10054+ default:
10055+ log_error(sg, "check_join_stop: unknown status=%u "
10056+ "nodeid=%u", sev->se_node_status[i],
10057+ sev->se_node_ids[i]);
10058+ neg++;
10059+ break;
10060+ }
10061+ }
10062+
10063+ if (pos == sg->memb_count)
10064+ return 0;
10065+
10066+ free_sg_memb(sg);
10067+ return -1;
10068+}
10069+
10070+/*
10071+ * 5. Fifth step in joining the SG - everyone has stopped their service and we
10072+ * all now start the service with us, the new member, added to the SG member
10073+ * list. We send start to our own service here and send a message to the other
10074+ * members that they should also start their service.
10075+ */
10076+
10077+static int send_join_start(sm_sevent_t *sev)
10078+{
10079+ sm_group_t *sg = sev->se_sg;
10080+ sm_node_t *node;
10081+ uint32_t *memb;
10082+ char *msg;
10083+ int error, count = 0, len = 0;
10084+
10085+ /*
10086+ * Create a start message and send it.
10087+ */
10088+
10089+ msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
10090+
10091+ error = send_members_message(sg, msg, len);
10092+ if (error < 0)
10093+ goto fail;
10094+
10095+ /*
10096+ * Start the service ourself. The chunk of memory with the member ids
10097+ * must be freed by the service when it is done with it.
10098+ */
10099+
10100+ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
10101+ memb);
10102+
10103+ list_for_each_entry(node, &sg->memb, list)
10104+ memb[count++] = node->id;
10105+
10106+ set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10107+
10108+ sg->ops->start(sg->service_data, memb, count, sev->se_id,
10109+ SERVICE_NODE_JOIN);
10110+ return 0;
10111+
10112+ fail:
10113+ free_sg_memb(sg);
10114+ return error;
10115+}
10116+
10117+/*
10118+ * 6. Sixth step in joining the SG - once the service has completed its start,
10119+ * it does a kcl_start_done() to signal us that it's done. That gets us here
10120+ * and we do a barrier with all other members which join the barrier when their
10121+ * service is done starting.
10122+ */
10123+
10124+static int startdone_barrier_new(sm_sevent_t *sev)
10125+{
10126+ sm_group_t *sg = sev->se_sg;
10127+ char bname[MAX_BARRIER_NAME_LEN];
10128+ int error;
10129+
10130+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
10131+ sev->se_barrier_status = -1;
10132+
10133+ set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10134+
10135+ /* If we're the only member, skip the barrier */
10136+ if (sg->memb_count == 1) {
10137+ process_startdone_barrier_new(sg, 0);
10138+ return 0;
10139+ }
10140+
10141+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10142+ sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
10143+
10144+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
10145+ if (error)
10146+ goto fail;
10147+
10148+ return 0;
10149+
10150+ fail:
10151+ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10152+ sg->ops->stop(sg->service_data);
10153+ free_sg_memb(sg);
10154+ return error;
10155+}
10156+
10157+/*
10158+ * 7. Seventh step in joining the SG - check that the barrier we joined with
10159+ * all other members returned with a successful status.
10160+ */
10161+
10162+static int check_startdone_barrier_new(sm_sevent_t *sev)
10163+{
10164+ sm_group_t *sg = sev->se_sg;
10165+ int error = sev->se_barrier_status;
10166+
10167+ if (error) {
10168+ sg->ops->stop(sg->service_data);
10169+ free_sg_memb(sg);
10170+ }
10171+ return error;
10172+}
10173+
10174+/*
10175+ * 8. Eighth step in joining the SG - send the service a "finish" indicating
10176+ * that all members have successfully started the service.
10177+ */
10178+
10179+static void do_finish_new(sm_sevent_t *sev)
10180+{
10181+ sm_group_t *sg = sev->se_sg;
10182+
10183+ sg->state = SGST_RUN;
10184+ sg->sevent = NULL;
10185+ clear_bit(SGFL_SEVENT, &sg->flags);
10186+
10187+ sg->ops->finish(sg->service_data, sev->se_id);
10188+}
10189+
10190+/*
10191+ * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
10192+ * and tell the process which initiated the join that it's done.
10193+ */
10194+
10195+static void sevent_done(sm_sevent_t *sev)
10196+{
10197+ sm_group_t *sg = sev->se_sg;
10198+
10199+ list_del(&sev->se_list);
10200+ release_sevent(sev);
10201+ kfree(sev);
10202+ complete(&sg->event_comp);
10203+}
10204+
10205+/*
10206+ * Move through the steps of a join. Summary:
10207+ *
10208+ * 1. Send a join notice to all cluster members.
10209+ * 2. Collect and check replies to the join notice.
10210+ * 3. Send a stop message to all SG members.
10211+ * 4. Collect and check replies to the stop message.
10212+ * 5. Send a start message to all SG members and start service ourself.
10213+ * 6. Use barrier to wait for all nodes to complete the start.
10214+ * 7. Check that all SG members joined the barrier.
10215+ * 8. Send finish to the service indicating that all nodes started it.
10216+ * 9. Clean up sevent and signal completion to the process that started the join
10217+ */
10218+
10219+static void process_join_sevent(sm_sevent_t *sev)
10220+{
10221+ int error = 0;
10222+
10223+ /*
10224+ * We may cancel the current join attempt if another node is also
10225+ * attempting to join or leave. (Only a single node can join or leave
10226+ * at once.) If cancelled, our join attempt will be restarted later.
10227+ */
10228+
10229+ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
b7b72b66 10230+ error = 1;
c1c6733f
AM
10231+ goto cancel;
10232+ }
10233+
10234+ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10235+
10236+ switch (sev->se_state) {
10237+
10238+ /*
10239+ * An sevent is created in kcl_join_service with a state of
10240+ * JOIN_BEGIN.
10241+ */
10242+
10243+ case SEST_JOIN_BEGIN:
10244+ sev->se_state = SEST_JOIN_ACKWAIT;
10245+ error = send_join_notice(sev);
10246+ break;
10247+
10248+ /*
10249+ * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
10250+ * process_reply (when all the replies have been received)
10251+ */
10252+
10253+ case SEST_JOIN_ACKED:
10254+ error = check_join_notice(sev);
10255+ if (error)
10256+ break;
10257+
10258+ sev->se_state = SEST_JSTOP_ACKWAIT;
10259+ error = send_join_stop(sev);
10260+ break;
10261+
10262+ /*
10263+ * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
10264+ * process_reply (when all the replies have been received)
10265+ */
10266+
10267+ case SEST_JSTOP_ACKED:
10268+ error = check_join_stop(sev);
10269+ if (error)
10270+ break;
10271+
10272+ sev->se_state = SEST_JSTART_SERVICEWAIT;
10273+ error = send_join_start(sev);
10274+ break;
10275+
10276+ /*
10277+ * se_state is changed from JSTART_SERVICEWAIT to
10278+ * JSTART_SERVICEDONE in kcl_start_done
10279+ */
10280+
10281+ case SEST_JSTART_SERVICEDONE:
10282+ sev->se_state = SEST_BARRIER_WAIT;
10283+ error = startdone_barrier_new(sev);
10284+ break;
10285+
10286+ /*
10287+ * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
10288+ * process_startdone_barrier_new
10289+ */
10290+
10291+ case SEST_BARRIER_DONE:
10292+ error = check_startdone_barrier_new(sev);
10293+ if (error)
10294+ break;
10295+
10296+ do_finish_new(sev);
10297+ sevent_done(sev);
10298+ break;
10299+
10300+ default:
10301+ log_error(sev->se_sg, "no join processing for state %u",
10302+ sev->se_state);
10303+ }
10304+
10305+ cancel:
10306+ if (error) {
10307+ /* restart the sevent from the beginning */
b7b72b66
AM
10308+ log_debug(sev->se_sg, "process_join error %d %lx", error,
10309+ sev->se_flags);
c1c6733f
AM
10310+ sev->se_state = SEST_JOIN_BEGIN;
10311+ sev->se_sg->global_id = 0;
10312+ set_bit(SEFL_DELAY, &sev->se_flags);
10313+ schedule_sev_restart(sev);
10314+ }
10315+}
10316+
10317+/*
10318+ * 1. First step in leaving an SG - send a message to other SG members asking
10319+ * to leave the SG. Nodes that don't have another active sevent or uevent for
10320+ * this SG will return POS.
10321+ */
10322+
10323+static int send_leave_notice(sm_sevent_t *sev)
10324+{
10325+ sm_group_t *sg = sev->se_sg;
10326+ sm_node_t *node;
10327+ char *msg;
10328+ int i = 0, error = -1, len = 0;
10329+
10330+ /*
10331+ * Create a node array from member list in which to collect responses.
10332+ */
10333+
10334+ error = init_sevent(sev);
10335+ if (error)
10336+ goto out;
10337+
10338+ list_for_each_entry(node, &sg->memb, list)
10339+ sev->se_node_ids[i++] = node->id;
10340+
10341+ /*
10342+ * Create and send a leave request message.
10343+ */
10344+
10345+ msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
10346+
10347+ error = send_members_message_sev(sg, msg, len, sev);
10348+
10349+ out:
10350+ return error;
10351+}
10352+
10353+/*
10354+ * 2. Second step in leaving an SG - after we collect all replies to our leave
10355+ * request, we look at them. If anyone replied with WAIT, we abort our attempt
10356+ * at leaving and try again in a bit.
10357+ */
10358+
10359+static int check_leave_notice(sm_sevent_t *sev)
10360+{
10361+ int pos = 0, wait = 0, neg = 0, restart = 0, i;
10362+
10363+ for (i = 0; i < sev->se_memb_count; i++) {
10364+ switch (sev->se_node_status[i]) {
10365+ case STATUS_POS:
10366+ pos++;
10367+ break;
10368+
10369+ case STATUS_WAIT:
10370+ wait++;
10371+ break;
10372+
10373+ case STATUS_NEG:
10374+ neg++;
10375+ break;
10376+
10377+ default:
10378+ /* we didn't get a valid response from this node,
10379+ * restart the entire sev. */
10380+ restart++;
10381+ break;
10382+ }
10383+ }
10384+
10385+ /* all members approve */
10386+ if (pos && !wait && !restart)
10387+ return 0;
10388+
10389+ return -1;
10390+}
10391+
10392+/*
10393+ * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
10394+ * They must be stopped in order to restart without us as a member.
10395+ */
10396+
10397+static int send_leave_stop(sm_sevent_t *sev)
10398+{
10399+ sm_group_t *sg = sev->se_sg;
10400+ char *msg;
10401+ int error, len = 0;
10402+
10403+ /*
10404+ * Re-init the status vector in which to collect responses.
10405+ */
10406+
10407+ memset(sev->se_node_status, 0, sev->se_len_status);
10408+
10409+ /*
10410+ * Create and send a stop message.
10411+ */
10412+
10413+ msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
10414+
10415+ error = send_members_message_sev(sg, msg, len, sev);
10416+ if (error < 0)
10417+ goto out;
10418+
10419+ /*
10420+ * we and all others stop the SG now
10421+ */
10422+
10423+ sg->ops->stop(sg->service_data);
10424+
10425+ out:
10426+ return error;
10427+}
10428+
10429+/*
10430+ * 4. Fourth step in leaving the SG - check the replies to our stop request.
10431+ * Same problem with getting different replies as check_join_stop.
10432+ */
10433+
10434+static int check_leave_stop(sm_sevent_t *sev)
10435+{
10436+ sm_group_t *sg = sev->se_sg;
10437+ int i, pos = 0, neg = 0;
10438+
10439+ for (i = 0; i < sev->se_memb_count; i++) {
10440+ switch (sev->se_node_status[i]) {
10441+ case STATUS_POS:
10442+ pos++;
10443+ break;
10444+
10445+ case STATUS_NEG:
10446+ log_error(sg, "check_leave_stop: fail from nodeid %u "
10447+ "(%d, %d, %u)", sev->se_node_ids[i],
10448+ pos, neg, sev->se_memb_count);
10449+ neg++;
10450+ break;
10451+
10452+ default:
10453+ log_error(sg, "check_leave_stop: status %u nodeid %u",
10454+ sev->se_node_status[i], sev->se_node_ids[i]);
10455+ neg++;
10456+ break;
10457+ }
10458+ }
10459+
10460+ if (pos == sg->memb_count)
10461+ return 0;
10462+
10463+ return -1;
10464+}
10465+
10466+/*
10467+ * 5. Fifth step in leaving the SG - tell the other SG members to restart the
10468+ * service without us. We, of course, don't start our own stopped service. If
10469+ * we're the last SG member and leaving, we jump right to the next step.
10470+ */
10471+
10472+static int send_leave_start(sm_sevent_t *sev)
10473+{
10474+ sm_group_t *sg = sev->se_sg;
10475+ char *msg;
10476+ int error = 0, len = 0;
10477+
10478+ if (sg->memb_count == 1) {
10479+ sev->se_state = SEST_LSTART_REMOTEDONE;
10480+ set_bit(SEFL_CHECK, &sev->se_flags);
10481+ wake_serviced(DO_JOINLEAVE);
10482+ } else {
10483+ msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
10484+ error = send_members_message(sg, msg, len);
10485+ }
10486+ return error;
10487+}
10488+
10489+/*
10490+ * Move through the steps of a leave. Summary:
10491+ *
10492+ * 1. Send a leave notice to all SG members.
10493+ * 2. Collect and check replies to the leave notice.
10494+ * 3. Send a stop message to all SG members and stop our own SG.
10495+ * 4. Collect and check replies to the stop message.
10496+ * 5. Send a start message to SG members.
10497+ * 6. Clean up sevent and signal completion to the process that
10498+ * started the leave.
10499+ */
10500+
10501+static void process_leave_sevent(sm_sevent_t *sev)
10502+{
10503+ int error = 0;
10504+
10505+ /*
10506+ * We may cancel the current leave attempt if another node is also
10507+ * attempting to join or leave. (Only a single node can join or leave
10508+ * at once.) Our leave attempt will be restarted after being
10509+ * cancelled.
10510+ */
10511+
10512+ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10513+ error = 1;
10514+ goto cancel;
10515+ }
10516+
10517+ if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
10518+ error = 2;
10519+ goto cancel;
10520+ }
10521+
10522+ if (!list_empty(&sev->se_sg->joining)) {
10523+ error = 3;
10524+ goto cancel;
10525+ }
10526+
10527+ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10528+
10529+ switch (sev->se_state) {
10530+
10531+ /*
10532+ * An sevent is created in kcl_leave_service with a state of
10533+ * LEAVE_BEGIN.
10534+ */
10535+
10536+ case SEST_LEAVE_BEGIN:
10537+ sev->se_state = SEST_LEAVE_ACKWAIT;
10538+ error = send_leave_notice(sev);
10539+ break;
10540+
10541+ /*
10542+ * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
10543+ * process_reply (when all the replies have been received)
10544+ */
10545+
10546+ case SEST_LEAVE_ACKED:
10547+ error = check_leave_notice(sev);
10548+ if (error)
10549+ break;
10550+
10551+ sev->se_state = SEST_LSTOP_ACKWAIT;
10552+ error = send_leave_stop(sev);
10553+ break;
10554+
10555+ /*
10556+ * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
10557+ * process_reply
10558+ */
10559+
10560+ case SEST_LSTOP_ACKED:
10561+ error = check_leave_stop(sev);
10562+ if (error)
10563+ break;
10564+
10565+ sev->se_state = SEST_LSTART_WAITREMOTE;
10566+ error = send_leave_start(sev);
10567+ break;
10568+
10569+ /*
10570+ * se_state is changed from LSTART_WAITREMOTE to
10571+ * LSTART_REMOTEDONE in process_leave_done
10572+ */
10573+
10574+ case SEST_LSTART_REMOTEDONE:
10575+ sevent_done(sev);
10576+ break;
10577+
10578+ default:
b7b72b66 10579+ log_error(sev->se_sg, "process_leave_sevent state=%u",
c1c6733f
AM
10580+ sev->se_state);
10581+ }
10582+
b7b72b66 10583+ cancel:
c1c6733f 10584+ if (error) {
b7b72b66
AM
10585+ log_debug(sev->se_sg, "process_leave error %d %lx", error,
10586+ sev->se_flags);
c1c6733f
AM
10587+ /* restart the sevent from the beginning */
10588+ sev->se_state = SEST_LEAVE_BEGIN;
10589+ set_bit(SEFL_DELAY, &sev->se_flags);
10590+ schedule_sev_restart(sev);
10591+ }
10592+}
10593+
10594+/*
10595+ * Sevent backout code. Take appropriate steps when a recovery occurs while
10596+ * we're in the midst of an sevent. The recovery may or may not affect the
10597+ * sevent. If it does, it usually means cancelling the sevent and restarting
10598+ * it from the beginning once the recovery processing is done.
10599+ */
10600+
10601+/*
10602+ * If any of the nodes that replied with OK is dead, we give up on the current
10603+ * join attempt and restart. Otherwise, this sevent can continue.
10604+ */
10605+
10606+static int backout_join_acked(sm_sevent_t *sev)
10607+{
10608+ sm_node_t *node;
10609+ int i;
10610+
10611+ for (i = 0; i < sev->se_node_count; i++) {
10612+ if (sev->se_node_status[i] != STATUS_POS)
10613+ continue;
10614+
10615+ list_for_each_entry(node, &sm_members, list) {
10616+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
10617+ (node->id == sev->se_node_ids[i]))
10618+ return TRUE;
10619+ }
10620+ }
10621+ return FALSE;
10622+}
10623+
10624+/*
10625+ * In this state our sg member list exists and mark_affected_sgs() will have
10626+ * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
10627+ * restart the join process if this is the case, otherwise this sevent can
10628+ * continue.
10629+ */
10630+
10631+static int backout_jstop_ackwait(sm_sevent_t *sev)
10632+{
10633+ sm_group_t *sg = sev->se_sg;
10634+
10635+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10636+ return FALSE;
10637+
10638+ clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
10639+ free_sg_memb(sg);
10640+ return TRUE;
10641+}
10642+
10643+/*
10644+ * Same as previous.
10645+ */
10646+
10647+static int backout_jstop_acked(sm_sevent_t *sev)
10648+{
10649+ return backout_jstop_ackwait(sev);
10650+}
10651+
10652+/*
10653+ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10654+ * starting our service. The recovery process will restart the service on all
10655+ * the prior sg members (not including those that died or us). We will
10656+ * reattempt our join which should be accepted once the nodes are done with
10657+ * recovery.
10658+ */
10659+
10660+static int backout_jstart_servicewait(sm_sevent_t *sev)
10661+{
10662+ sm_group_t *sg = sev->se_sg;
10663+
10664+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10665+ return FALSE;
10666+
10667+ clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10668+ sg->ops->stop(sg->service_data);
10669+ free_sg_memb(sg);
10670+ return TRUE;
10671+}
10672+
10673+/*
10674+ * Same as previous.
10675+ */
10676+
10677+static int backout_jstart_servicedone(sm_sevent_t *sev)
10678+{
10679+ return backout_jstart_servicewait(sev);
10680+}
10681+
10682+/*
10683+ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10684+ * waiting on the "all done" barrier. Stop our service that we just started
10685+ * and cancel the barrier. The recovery process will restart the service on
10686+ * all the prior sg members (not including those that died or us). We will
10687+ * reattempt our join which should be accepted once the nodes are done with
10688+ * recovery.
10689+ */
10690+
10691+static int backout_barrier_wait(sm_sevent_t *sev)
10692+{
10693+ sm_group_t *sg = sev->se_sg;
10694+ char bname[MAX_BARRIER_NAME_LEN];
10695+
10696+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10697+ return FALSE;
10698+
10699+ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10700+
10701+ sg->ops->stop(sg->service_data);
10702+
10703+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
10704+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10705+ sg->global_id, sm_our_nodeid, sev->se_id,
10706+ sg->memb_count);
10707+ kcl_barrier_cancel(bname);
10708+
10709+ free_sg_memb(sg);
10710+ return TRUE;
10711+}
10712+
10713+/*
10714+ * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
10715+ * recovery began after the barrier callback. If the result in the callback is
10716+ * "success" then we are joined, this sevent is finished and we'll process the
10717+ * sg within the forthcoming recovery with the other members.
10718+ *
10719+ * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
10720+ * all nodes will receive the corresponding barrier callback *before any*
10721+ * receive an sm_member_update() due to one of those nodes failing just after
10722+ * joining the barrier. If some nodes receive the sm_member_update() before
10723+ * the barrier callback and others receive the barrier callback before the
10724+ * sm_member_update() then they will disagree as to whether the node joining/
10725+ * leaving is in/out of the sg.
10726+ */
10727+
10728+static int backout_barrier_done(sm_sevent_t *sev)
10729+{
10730+ sm_group_t *sg = sev->se_sg;
10731+
10732+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10733+ return FALSE;
10734+
10735+ if (!sev->se_barrier_status) {
10736+ do_finish_new(sev);
10737+ sevent_done(sev);
10738+ return FALSE;
10739+ } else {
10740+ sg->ops->stop(sg->service_data);
10741+ free_sg_memb(sg);
10742+ return TRUE;
10743+ }
10744+}
10745+
10746+/*
10747+ * We've done nothing yet, just restart when recovery is done (if sg is flagged
10748+ * with recovery.)
10749+ */
10750+
10751+static int backout_leave_begin(sm_sevent_t *sev)
10752+{
10753+ sm_group_t *sg = sev->se_sg;
10754+
10755+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10756+ return FALSE;
10757+
10758+ return TRUE;
10759+}
10760+
10761+/*
10762+ * Ignore any replies to our leave notice and restart when recovery is done (if
10763+ * sg is flagged with recovery.)
10764+ */
10765+
10766+static int backout_leave_ackwait(sm_sevent_t *sev)
10767+{
10768+ sm_group_t *sg = sev->se_sg;
10769+
10770+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10771+ return FALSE;
10772+
10773+ clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
10774+
10775+ return TRUE;
10776+}
10777+
10778+/*
10779+ * Same as previous.
10780+ */
10781+
10782+static int backout_leave_acked(sm_sevent_t *sev)
10783+{
10784+ return backout_leave_ackwait(sev);
10785+}
10786+
10787+/*
10788+ * Ignore any stop replies. All the members will be stopped anyway to do the
10789+ * recovery. Let that happen and restart our leave when done.
10790+ */
10791+
10792+static int backout_lstop_ackwait(sm_sevent_t *sev)
10793+{
10794+ sm_group_t *sg = sev->se_sg;
10795+
10796+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10797+ return FALSE;
10798+
10799+ clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
10800+
10801+ return TRUE;
10802+}
10803+
10804+/*
10805+ * Same as previous.
10806+ */
10807+
10808+static int backout_lstop_acked(sm_sevent_t *sev)
10809+{
10810+ return backout_lstop_ackwait(sev);
10811+}
10812+
10813+/*
10814+ * All members will be stopped due to recovery and restarted by recovery
10815+ * processing. That includes us, we have to retry the leave once the recovery
10816+ * is done.
10817+ */
10818+
10819+static int backout_lstart_waitremote(sm_sevent_t *sev)
10820+{
10821+ sm_group_t *sg = sev->se_sg;
10822+
10823+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10824+ return FALSE;
10825+
10826+ return TRUE;
10827+}
10828+
10829+/*
10830+ * Reset an sevent to its beginning so it can be restarted. This is necessary
10831+ * when recovery affects an SG while we're trying to join or leave (ie. a node
10832+ * in the SG fails).
10833+ */
10834+
10835+void backout_sevents(void)
10836+{
10837+ sm_sevent_t *sev, *safe;
10838+ int delay;
10839+
10840+ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10841+
10842+ delay = FALSE;
10843+
10844+ log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
10845+
10846+ switch (sev->se_state) {
10847+
10848+ /* backout after kcl_join_service and before
10849+ * send_join_notice */
10850+ case SEST_JOIN_BEGIN:
10851+ break;
10852+
10853+ /* backout after send_join_notice and before final
10854+ * process_reply */
10855+ case SEST_JOIN_ACKWAIT:
10856+ clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
10857+ sev->se_state = SEST_JOIN_BEGIN;
b7b72b66
AM
10858+ set_bit(SEFL_CHECK, &sev->se_flags);
10859+ wake_serviced(DO_JOINLEAVE);
c1c6733f
AM
10860+ break;
10861+
10862+ /* backout after final process_reply and before
10863+ * check_join_notice */
10864+ case SEST_JOIN_ACKED:
10865+ delay = backout_join_acked(sev);
10866+ break;
10867+
10868+ /* backout after send_join_stop and before final
10869+ * process_reply */
10870+ case SEST_JSTOP_ACKWAIT:
10871+ delay = backout_jstop_ackwait(sev);
10872+ break;
10873+
10874+ /* backout after final process_reply and before
10875+ * check_join_stop */
10876+ case SEST_JSTOP_ACKED:
10877+ delay = backout_jstop_acked(sev);
10878+ break;
10879+
10880+ /* backout after send_join_start and before
10881+ * kcl_start_done */
10882+ case SEST_JSTART_SERVICEWAIT:
10883+ delay = backout_jstart_servicewait(sev);
10884+ break;
10885+
10886+ /* backout after kcl_start_done and before
10887+ * startdone_barrier_new */
10888+ case SEST_JSTART_SERVICEDONE:
10889+ delay = backout_jstart_servicedone(sev);
10890+ break;
10891+
10892+ /* backout after startdone_barrier_new and before
10893+ * callback_startdone_barrier_new */
10894+ case SEST_BARRIER_WAIT:
10895+ delay = backout_barrier_wait(sev);
10896+ break;
10897+
10898+ /* backout after callback_startdone_barrier_new and
10899+ * before check_startdone_barrier_new */
10900+ case SEST_BARRIER_DONE:
10901+ delay = backout_barrier_done(sev);
10902+ break;
10903+
10904+ /* backout after kcl_leave_service and before
10905+ * send_leave_notice */
10906+ case SEST_LEAVE_BEGIN:
10907+ delay = backout_leave_begin(sev);
10908+ break;
10909+
10910+ /* backout after send_leave_notice and before final
10911+ * process_reply */
10912+ case SEST_LEAVE_ACKWAIT:
10913+ delay = backout_leave_ackwait(sev);
10914+ break;
10915+
10916+ /* backout after final process_reply and before
10917+ * check_leave_notice */
10918+ case SEST_LEAVE_ACKED:
10919+ delay = backout_leave_acked(sev);
10920+ break;
10921+
10922+ /* backout after send_leave_stop and before final
10923+ * process_reply */
10924+ case SEST_LSTOP_ACKWAIT:
10925+ delay = backout_lstop_ackwait(sev);
10926+ break;
10927+
10928+ /* backout after final process_reply and before
10929+ * check_leave_stop */
10930+ case SEST_LSTOP_ACKED:
10931+ delay = backout_lstop_acked(sev);
10932+ break;
10933+
10934+ /* backout after send_leave_start and before
10935+ * process_lstart_done */
10936+ case SEST_LSTART_WAITREMOTE:
10937+ delay = backout_lstart_waitremote(sev);
10938+ break;
10939+
10940+ /* backout after process_lstart_done and before
10941+ * process_leave_sevent */
10942+ case SEST_LSTART_REMOTEDONE:
10943+ sevent_done(sev);
10944+ delay = FALSE;
10945+ break;
10946+
10947+ default:
10948+ log_error(sev->se_sg, "backout_sevents: bad state %d",
10949+ sev->se_state);
10950+ }
10951+
10952+ if (delay) {
c1c6733f
AM
10953+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
10954+ sev->se_state = SEST_LEAVE_BEGIN;
b7b72b66
AM
10955+ set_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
10956+ set_bit(SEFL_CHECK, &sev->se_flags);
10957+ wake_serviced(DO_JOINLEAVE);
c1c6733f
AM
10958+ } else {
10959+ sev->se_state = SEST_JOIN_BEGIN;
b7b72b66
AM
10960+ set_bit(SEFL_CHECK, &sev->se_flags);
10961+ wake_serviced(DO_JOINLEAVE);
c1c6733f
AM
10962+ }
10963+ }
10964+ }
10965+}
10966+
10967+void process_joinleave(void)
10968+{
10969+ sm_sevent_t *sev = NULL, *safe;
10970+
10971+ spin_lock(&new_event_lock);
10972+ if (!list_empty(&new_event)) {
10973+ sev = list_entry(new_event.next, sm_sevent_t, se_list);
10974+ list_del(&sev->se_list);
10975+ list_add_tail(&sev->se_list, &joinleave_events);
10976+ set_bit(SEFL_CHECK, &sev->se_flags);
10977+ }
10978+ spin_unlock(&new_event_lock);
10979+
10980+ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10981+ if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
10982+ continue;
10983+
b7b72b66
AM
10984+ if (test_bit(SEFL_DELAY, &sev->se_flags) ||
10985+ test_bit(SEFL_DELAY_RECOVERY, &sev->se_flags))
c1c6733f
AM
10986+ continue;
10987+
10988+ if (sev->se_state < SEST_LEAVE_BEGIN)
10989+ process_join_sevent(sev);
10990+ else
10991+ process_leave_sevent(sev);
10992+ }
10993+}
10994diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
bb1d8b11
AM
10995--- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
10996+++ linux-patched/cluster/cman/sm_joinleave.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
10997@@ -0,0 +1,23 @@
10998+/******************************************************************************
10999+*******************************************************************************
11000+**
11001+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11002+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11003+**
11004+** This copyrighted material is made available to anyone wishing to use,
11005+** modify, copy, or redistribute it subject to the terms and conditions
11006+** of the GNU General Public License v.2.
11007+**
11008+*******************************************************************************
11009+******************************************************************************/
11010+
11011+#ifndef __SM_JOINLEAVE_DOT_H__
11012+#define __SM_JOINLEAVE_DOT_H__
11013+
11014+void init_joinleave(void);
11015+void new_joinleave(sm_sevent_t *sev);
11016+void process_joinleave(void);
11017+void backout_sevents(void);
11018+sm_sevent_t *find_sevent(unsigned int id);
11019+
11020+#endif
11021diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
bb1d8b11
AM
11022--- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
11023+++ linux-patched/cluster/cman/sm_membership.c 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
11024@@ -0,0 +1,696 @@
11025+/******************************************************************************
11026+*******************************************************************************
11027+**
11028+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11029+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11030+**
11031+** This copyrighted material is made available to anyone wishing to use,
11032+** modify, copy, or redistribute it subject to the terms and conditions
11033+** of the GNU General Public License v.2.
11034+**
11035+*******************************************************************************
11036+******************************************************************************/
11037+
11038+#include "sm.h"
11039+
11040+extern struct list_head sm_members;
11041+
11042+/*
11043+ * Routines for SG members to handle other nodes joining or leaving the SG.
11044+ * These "uevent" membership update routines are the response to an "sevent" on
11045+ * a joining/leaving node.
11046+ */
11047+
11048+static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
11049+{
11050+ sm_node_t *node;
11051+
11052+ list_for_each_entry(node, &sg->memb, list) {
11053+ if (node->id != nodeid)
11054+ continue;
11055+ list_del(&node->list);
11056+ kfree(node);
11057+ sg->memb_count--;
11058+ log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
11059+ break;
11060+ }
11061+}
11062+
11063+static void add_memb_node(sm_group_t *sg, sm_node_t *node)
11064+{
11065+ list_add_tail(&node->list, &sg->memb);
11066+ sg->memb_count++;
11067+ log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
11068+}
11069+
11070+/*
11071+ * Join 1. The receive end of send_join_stop() from a node requesting to join
11072+ * the SG. We stop the service so it can be restarted with the new node.
11073+ */
11074+
11075+static int process_join_stop(sm_group_t *sg)
11076+{
11077+ sm_uevent_t *uev = &sg->uevent;
11078+ sm_node_t *node;
11079+ sm_msg_t reply;
11080+ int error;
11081+
11082+ if (uev->ue_num_nodes != sg->memb_count + 1) {
11083+ log_error(sg, "process_join_stop: bad num nodes %u %u",
11084+ uev->ue_num_nodes, sg->memb_count);
11085+ return -1;
11086+ }
11087+
11088+ sm_set_event_id(&uev->ue_id);
11089+
11090+ node = sm_find_joiner(sg, uev->ue_nodeid);
11091+ SM_ASSERT(node,);
11092+
11093+ sg->state = SGST_UEVENT;
11094+ sg->ops->stop(sg->service_data);
11095+
11096+ reply.ms_type = SMSG_JSTOP_REP;
11097+ reply.ms_status = STATUS_POS;
11098+ reply.ms_sevent_id = uev->ue_remote_seid;
11099+ smsg_bswap_out(&reply);
11100+
11101+ error = send_nodeid_message((char *) &reply, sizeof(reply),
11102+ uev->ue_nodeid);
11103+ if (error < 0)
11104+ return error;
11105+ return 0;
11106+}
11107+
11108+/*
11109+ * Join 2. The receive end of send_join_start() from a node joining the SG.
11110+ * We are re-starting the service with the new member added.
11111+ */
11112+
11113+static int process_join_start(sm_group_t *sg)
11114+{
11115+ sm_uevent_t *uev = &sg->uevent;
11116+ sm_node_t *node;
11117+ uint32_t *memb;
11118+ int count = 0;
11119+
11120+ /* this memory is passed to the service which must free it */
11121+ SM_RETRY(memb =
11122+ kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
11123+ memb);
11124+
11125+ /* transfer joining node from joining list to member list */
11126+ node = sm_find_joiner(sg, uev->ue_nodeid);
11127+ SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
11128+ list_del(&node->list);
11129+ add_memb_node(sg, node);
11130+
11131+ /* the new member list for the service */
11132+ list_for_each_entry(node, &sg->memb, list)
11133+ memb[count++] = node->id;
11134+
11135+ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11136+
11137+ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11138+ SERVICE_NODE_JOIN);
11139+ return 0;
11140+}
11141+
11142+/*
11143+ * Join 3. When done starting their local service, every previous SG member
11144+ * calls startdone_barrier() and the new/joining member calls
11145+ * startdone_barrier_new(). The barrier returns when everyone has started
11146+ * their service and joined the barrier.
11147+ */
11148+
11149+static int startdone_barrier(sm_group_t *sg)
11150+{
11151+ sm_uevent_t *uev = &sg->uevent;
11152+ char bname[MAX_BARRIER_NAME_LEN];
11153+ int error;
11154+
11155+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
11156+ uev->ue_barrier_status = -1;
11157+
11158+ set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11159+
11160+ /* If we're the only member, skip the barrier */
11161+ if (sg->memb_count == 1) {
11162+ process_startdone_barrier(sg, 0);
11163+ return 0;
11164+ }
11165+
11166+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11167+ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11168+ sg->memb_count);
11169+
11170+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
11171+
11172+ return error;
11173+}
11174+
11175+/*
11176+ * Join 4. Check that the "all started" barrier returned a successful status.
11177+ * The newly joined member calls check_startdone_barrier_new().
11178+ */
11179+
11180+static int check_startdone_barrier(sm_group_t *sg)
11181+{
11182+ int error = sg->uevent.ue_barrier_status;
11183+ return error;
11184+}
11185+
11186+/*
11187+ * Join 5. Send the service a "finish" indicating that all members have
11188+ * successfully started. The newly joined member calls do_finish_new().
11189+ */
11190+
11191+static void do_finish(sm_group_t *sg)
11192+{
11193+ sg->state = SGST_RUN;
11194+ clear_bit(SGFL_UEVENT, &sg->flags);
11195+ sg->ops->finish(sg->service_data, sg->uevent.ue_id);
11196+}
11197+
11198+/*
11199+ * Join 6. The uevent is done. If this was a uevent for a node leaving the
11200+ * SG, then send a final message to the departed node signalling that the
11201+ * remaining nodes have restarted since it left.
11202+ */
11203+
11204+static void uevent_done(sm_group_t *sg)
11205+{
11206+ sm_uevent_t *uev = &sg->uevent;
11207+ sm_msg_t reply;
11208+
11209+ if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
11210+ reply.ms_type = SMSG_LSTART_DONE;
11211+ reply.ms_status = STATUS_POS;
11212+ reply.ms_sevent_id = uev->ue_remote_seid;
11213+ smsg_bswap_out(&reply);
11214+ send_nodeid_message((char *) &reply, sizeof(reply),
11215+ uev->ue_nodeid);
11216+ }
11217+ memset(&sg->uevent, 0, sizeof(sm_uevent_t));
11218+}
11219+
11220+/*
11221+ * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
11222+ */
11223+
11224+static int process_leave_stop(sm_group_t *sg)
11225+{
11226+ sm_uevent_t *uev = &sg->uevent;
11227+ sm_msg_t reply;
11228+ int error;
11229+
11230+ sm_set_event_id(&uev->ue_id);
11231+
11232+ sg->state = SGST_UEVENT;
11233+ sg->ops->stop(sg->service_data);
11234+
11235+ reply.ms_type = SMSG_LSTOP_REP;
11236+ reply.ms_status = STATUS_POS;
11237+ reply.ms_sevent_id = uev->ue_remote_seid;
11238+ smsg_bswap_out(&reply);
11239+
11240+ error = send_nodeid_message((char *) &reply, sizeof(reply),
11241+ uev->ue_nodeid);
11242+ if (error < 0)
11243+ return error;
11244+ return 0;
11245+}
11246+
11247+/*
11248+ * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
11249+ * We are re-starting the service (without the node that's left naturally.)
11250+ */
11251+
11252+static int process_leave_start(sm_group_t *sg)
11253+{
11254+ sm_uevent_t *uev = &sg->uevent;
11255+ sm_node_t *node;
11256+ uint32_t *memb;
11257+ int count = 0;
11258+
11259+ SM_ASSERT(sg->memb_count > 1,
11260+ printk("memb_count=%u\n", sg->memb_count););
11261+
11262+ /* this memory is passed to the service which must free it */
11263+ SM_RETRY(memb =
11264+ kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
11265+ memb);
11266+
11267+ /* remove departed member from sg member list */
11268+ del_memb_node(sg, uev->ue_nodeid);
11269+
11270+ /* build member list to pass to service */
11271+ list_for_each_entry(node, &sg->memb, list)
11272+ memb[count++] = node->id;
11273+
11274+ /* allow us to accept the start_done callback for this start */
11275+ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11276+
11277+ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11278+ SERVICE_NODE_LEAVE);
11279+ return 0;
11280+}
11281+
11282+/*
11283+ * Move through the steps of another node joining or leaving the SG.
11284+ */
11285+
11286+static void process_one_uevent(sm_group_t *sg)
11287+{
11288+ sm_uevent_t *uev = &sg->uevent;
11289+ int error = 0;
11290+
11291+ log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
11292+
11293+ switch (uev->ue_state) {
11294+
11295+ /*
11296+ * a uevent is initialized with state JSTOP in
11297+ * process_stop_request
11298+ */
11299+
11300+ case UEST_JSTOP:
11301+ uev->ue_state = UEST_JSTART_WAITCMD;
11302+ error = process_join_stop(sg);
11303+ break;
11304+
11305+ /*
11306+ * ue_state is changed from JSTART_WAITCMD to JSTART in
11307+ * process_start_request
11308+ */
11309+
11310+ case UEST_JSTART:
11311+ uev->ue_state = UEST_JSTART_SERVICEWAIT;
11312+ error = process_join_start(sg);
11313+ break;
11314+
11315+ /*
11316+ * ue_state is changed from JSTART_SERVICEWAIT to
11317+ * JSTART_SERVICEDONE in kcl_start_done
11318+ */
11319+
11320+ case UEST_JSTART_SERVICEDONE:
11321+ uev->ue_state = UEST_BARRIER_WAIT;
11322+ error = startdone_barrier(sg);
11323+ break;
11324+
11325+ /*
11326+ * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
11327+ * process_startdone_barrier
11328+ */
11329+
11330+ case UEST_BARRIER_DONE:
11331+ error = check_startdone_barrier(sg);
11332+ if (error)
11333+ break;
11334+
11335+ do_finish(sg);
11336+ uevent_done(sg);
11337+ break;
11338+
11339+ /*
11340+ * a uevent is initialized with state LSTOP in
11341+ * process_stop_request
11342+ */
11343+
11344+ case UEST_LSTOP:
11345+ uev->ue_state = UEST_LSTART_WAITCMD;
11346+ error = process_leave_stop(sg);
11347+ break;
11348+
11349+ /*
11350+ * a uevent is changed from LSTART_WAITCMD to LSTART in
11351+ * process_start_request
11352+ */
11353+
11354+ case UEST_LSTART:
11355+ uev->ue_state = UEST_LSTART_SERVICEWAIT;
11356+ error = process_leave_start(sg);
11357+ break;
11358+
11359+ /*
11360+ * a uevent is changed from LSTART_SERVICEWAIT to to
11361+ * LSTART_SERVICEDONE in kcl_start_done
11362+ */
11363+
11364+ case UEST_LSTART_SERVICEDONE:
11365+ uev->ue_state = UEST_BARRIER_WAIT;
11366+ error = startdone_barrier(sg);
11367+ break;
11368+
11369+ default:
11370+ error = -1;
11371+ }
11372+
11373+ /* If we encounter an error during these routines, we do nothing,
11374+ expecting that a node failure related to this sg will cause a
11375+ recovery event to arrive and call cancel_one_uevent(). */
11376+
11377+ if (error)
11378+ log_error(sg, "process_one_uevent error %d state %u",
11379+ error, uev->ue_state);
11380+}
11381+
11382+static sm_node_t *failed_memb(sm_group_t *sg, int *count)
11383+{
11384+ sm_node_t *node, *sm_node, *failed_uev_node = NULL;
11385+
11386+ list_for_each_entry(node, &sg->memb, list) {
11387+
11388+ sm_node = sm_find_member(node->id);
11389+ SM_ASSERT(sm_node, );
11390+
11391+ if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
11392+ (*count)++;
11393+ if (node->id == sg->uevent.ue_nodeid)
11394+ failed_uev_node = sm_node;
11395+ }
11396+ }
11397+ return failed_uev_node;
11398+}
11399+
11400+static void send_recover_msg(sm_group_t *sg)
11401+{
11402+ char *msg;
11403+ int len = 0;
11404+ msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
11405+ send_members_message(sg, msg, len);
11406+}
11407+
11408+static void cancel_barrier(sm_group_t *sg)
11409+{
11410+ sm_uevent_t *uev = &sg->uevent;
11411+ char bname[MAX_BARRIER_NAME_LEN];
11412+
11413+ clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11414+
11415+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
11416+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11417+ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11418+ sg->memb_count);
11419+ kcl_barrier_cancel(bname);
11420+}
11421+
11422+static void cancel_one_uevent(sm_group_t *sg, int *effected)
11423+{
11424+ sm_uevent_t *uev = &sg->uevent;
11425+ int failed_count;
11426+ sm_node_t *node, *failed_joiner, *failed_leaver;
11427+
11428+ log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
11429+ uev->ue_nodeid);
11430+
11431+ switch (uev->ue_state) {
11432+
11433+ case UEST_JSTOP:
11434+ case UEST_JSTART_WAITCMD:
11435+ case UEST_JSTART:
11436+
11437+ sg->ops->stop(sg->service_data);
11438+
11439+ failed_count = 0;
11440+ failed_joiner = failed_memb(sg, &failed_count);
11441+ SM_ASSERT(!failed_joiner, );
11442+
11443+ node = sm_find_member(uev->ue_nodeid);
11444+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11445+ failed_joiner = node;
11446+
11447+ if (!failed_count) {
11448+ /* only joining node failed */
11449+ SM_ASSERT(failed_joiner, );
11450+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11451+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11452+ (*effected)++;
11453+ /* some nodes may not have gotten a JSTOP message
11454+ in which case this will tell them to begin
11455+ recovery for this sg. */
11456+ send_recover_msg(sg);
11457+
11458+ } else {
11459+ /* a member node failed (and possibly joining node, it
11460+ doesn't matter) */
11461+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11462+ }
11463+
11464+ clear_bit(SGFL_UEVENT, &sg->flags);
11465+ memset(uev, 0, sizeof(sm_uevent_t));
11466+ break;
11467+
11468+
11469+ case UEST_JSTART_SERVICEWAIT:
11470+ case UEST_JSTART_SERVICEDONE:
11471+
11472+ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11473+ sg->ops->stop(sg->service_data);
11474+
11475+ failed_count = 0;
11476+ failed_joiner = failed_memb(sg, &failed_count);
11477+ SM_ASSERT(failed_count, );
11478+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11479+
11480+ if (failed_count == 1 && failed_joiner) {
11481+ /* only joining node failed */
11482+
11483+ } else if (failed_count && failed_joiner) {
11484+ /* joining node and another member failed */
11485+
11486+ } else {
11487+ /* other member failed, joining node still alive */
11488+ SM_ASSERT(!failed_joiner, );
11489+ del_memb_node(sg, uev->ue_nodeid);
11490+ }
11491+
11492+ clear_bit(SGFL_UEVENT, &sg->flags);
11493+ memset(uev, 0, sizeof(sm_uevent_t));
11494+ break;
11495+
11496+
11497+ case UEST_LSTOP:
11498+ case UEST_LSTART_WAITCMD:
11499+ case UEST_LSTART:
11500+
11501+ sg->ops->stop(sg->service_data);
11502+
11503+ failed_count = 0;
11504+ failed_leaver = failed_memb(sg, &failed_count);
11505+ SM_ASSERT(failed_count, );
11506+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11507+
11508+ if (failed_count == 1 && failed_leaver) {
11509+ /* only leaving node failed */
11510+
11511+ } else if (failed_count && failed_leaver) {
11512+ /* leaving node and another member failed */
11513+
11514+ } else {
11515+ /* other member failed, leaving node still alive */
11516+ SM_ASSERT(!failed_leaver, );
11517+ }
11518+
11519+ clear_bit(SGFL_UEVENT, &sg->flags);
11520+ memset(uev, 0, sizeof(sm_uevent_t));
11521+ break;
11522+
11523+
11524+ case UEST_LSTART_SERVICEWAIT:
11525+ case UEST_LSTART_SERVICEDONE:
11526+
11527+ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11528+ sg->ops->stop(sg->service_data);
11529+
11530+ failed_count = 0;
11531+ failed_leaver = failed_memb(sg, &failed_count);
11532+ SM_ASSERT(!failed_leaver, );
11533+
11534+ node = sm_find_member(uev->ue_nodeid);
11535+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11536+ failed_leaver = node;
11537+
11538+ if (!failed_count) {
11539+ /* only leaving node failed */
11540+ SM_ASSERT(failed_leaver, );
11541+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11542+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11543+ (*effected)++;
11544+
11545+ } else if (failed_count && failed_leaver) {
11546+ /* leaving node and another member failed */
11547+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11548+
11549+ } else {
11550+ /* other member failed, leaving node still alive */
11551+ SM_ASSERT(failed_count, );
11552+ SM_ASSERT(!failed_leaver, );
11553+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11554+ node = sm_new_node(sg->uevent.ue_nodeid);
11555+ add_memb_node(sg, node);
11556+ }
11557+
11558+ clear_bit(SGFL_UEVENT, &sg->flags);
11559+ memset(uev, 0, sizeof(sm_uevent_t));
11560+ break;
11561+
11562+
11563+ case UEST_BARRIER_WAIT:
11564+
11565+ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11566+ goto barrier_wait_leave;
11567+
11568+ sg->ops->stop(sg->service_data);
11569+ cancel_barrier(sg);
11570+
11571+ barrier_wait_join:
11572+
11573+ failed_count = 0;
11574+ failed_joiner = failed_memb(sg, &failed_count);
11575+ SM_ASSERT(failed_count, );
11576+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11577+
11578+ if (failed_count == 1 && failed_joiner) {
11579+ /* only joining node failed */
11580+
11581+ } else if (failed_count && failed_joiner) {
11582+ /* joining node and another member failed */
11583+
11584+ } else {
11585+ /* other member failed, joining node still alive */
11586+ SM_ASSERT(!failed_joiner, );
11587+ del_memb_node(sg, uev->ue_nodeid);
11588+ }
11589+
11590+ clear_bit(SGFL_UEVENT, &sg->flags);
11591+ memset(uev, 0, sizeof(sm_uevent_t));
11592+ break;
11593+
11594+ barrier_wait_leave:
11595+
11596+ failed_count = 0;
11597+ failed_leaver = failed_memb(sg, &failed_count);
11598+ SM_ASSERT(!failed_leaver, );
11599+
11600+ node = sm_find_member(uev->ue_nodeid);
11601+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11602+ failed_leaver = node;
11603+
11604+ if (!failed_count) {
11605+ /* only leaving node failed */
11606+ SM_ASSERT(failed_leaver, );
11607+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11608+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11609+ (*effected)++;
11610+
11611+ } else if (failed_count && failed_leaver) {
11612+ /* leaving node and another member failed */
11613+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11614+
11615+ } else {
11616+ /* other member failed, leaving node still alive */
11617+ SM_ASSERT(failed_count, );
11618+ SM_ASSERT(!failed_leaver, );
11619+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11620+ node = sm_new_node(sg->uevent.ue_nodeid);
11621+ add_memb_node(sg, node);
11622+ }
11623+
11624+ clear_bit(SGFL_UEVENT, &sg->flags);
11625+ memset(uev, 0, sizeof(sm_uevent_t));
11626+ break;
11627+
11628+
11629+ case UEST_BARRIER_DONE:
11630+
11631+ if (!uev->ue_barrier_status) {
11632+ do_finish(sg);
11633+ uevent_done(sg);
11634+ break;
11635+ }
11636+
11637+ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11638+ goto barrier_wait_leave;
11639+ else
11640+ goto barrier_wait_join;
11641+
11642+
11643+ default:
11644+ log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
11645+ }
11646+}
11647+
11648+void cancel_uevents(int *effected)
11649+{
11650+ sm_group_t *sg;
11651+ sm_node_t *node, *sgnode;
11652+ int i;
11653+
11654+ list_for_each_entry(node, &sm_members, list) {
11655+ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
11656+ continue;
11657+
11658+ /*
11659+ * Clear this dead node from the "interested in joining" list
11660+ * of any SG. The node is added to this list before the uevent
11661+ * begins.
11662+ */
11663+
11664+ for (i = 0; i < SG_LEVELS; i++) {
11665+ list_for_each_entry(sg, &sm_sg[i], list) {
11666+ sgnode = sm_find_joiner(sg, node->id);
11667+ if (sgnode) {
11668+ log_debug(sg, "clear joining node %u",
11669+ sgnode->id);
11670+ list_del(&sgnode->list);
11671+ kfree(sgnode);
11672+ }
11673+ }
11674+ }
11675+ }
11676+
11677+ /* Adjust any uevents in sg's effected by the failed node(s) */
11678+
11679+ for (i = 0; i < SG_LEVELS; i++) {
11680+ list_for_each_entry(sg, &sm_sg[i], list) {
11681+ if (!test_bit(SGFL_UEVENT, &sg->flags))
11682+ continue;
11683+
11684+ /* We may have some cancelling to do if this sg is
11685+ flagged as having a failed member, or if a joining
11686+ or leaving node has died. */
11687+
11688+ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
11689+ cancel_one_uevent(sg, effected);
11690+ else if (sg->uevent.ue_nodeid) {
11691+ node = sm_find_member(sg->uevent.ue_nodeid);
11692+ SM_ASSERT(node, );
11693+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11694+ cancel_one_uevent(sg, effected);
11695+ }
11696+ }
11697+ }
11698+}
11699+
11700+void process_membership(void)
11701+{
11702+ sm_group_t *sg;
11703+ int i;
11704+
11705+ down(&sm_sglock);
11706+
11707+ for (i = 0; i < SG_LEVELS; i++) {
11708+ list_for_each_entry(sg, &sm_sg[i], list) {
11709+ if (!test_bit(SGFL_UEVENT, &sg->flags))
11710+ continue;
11711+
11712+ if (!test_and_clear_bit(UEFL_CHECK,
11713+ &sg->uevent.ue_flags))
11714+ continue;
11715+
11716+ process_one_uevent(sg);
11717+ }
11718+ }
11719+ up(&sm_sglock);
11720+}
11721diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
bb1d8b11
AM
11722--- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
11723+++ linux-patched/cluster/cman/sm_membership.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
11724@@ -0,0 +1,20 @@
11725+/******************************************************************************
11726+*******************************************************************************
11727+**
11728+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11729+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11730+**
11731+** This copyrighted material is made available to anyone wishing to use,
11732+** modify, copy, or redistribute it subject to the terms and conditions
11733+** of the GNU General Public License v.2.
11734+**
11735+*******************************************************************************
11736+******************************************************************************/
11737+
11738+#ifndef __SM_MEMBERSHIP_DOT_H__
11739+#define __SM_MEMBERSHIP_DOT_H__
11740+
11741+void process_membership(void);
11742+void cancel_uevents(int *effected);
11743+
11744+#endif
11745diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
bb1d8b11
AM
11746--- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
11747+++ linux-patched/cluster/cman/sm_message.c 2004-11-03 11:37:37.000000000 +0800
b7b72b66 11748@@ -0,0 +1,856 @@
c1c6733f
AM
11749+/******************************************************************************
11750+*******************************************************************************
11751+**
11752+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11753+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
b7b72b66 11754+**
c1c6733f
AM
11755+** This copyrighted material is made available to anyone wishing to use,
11756+** modify, copy, or redistribute it subject to the terms and conditions
11757+** of the GNU General Public License v.2.
11758+**
11759+*******************************************************************************
11760+******************************************************************************/
11761+
11762+#include "sm.h"
11763+
11764+#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
11765+
11766+extern struct socket * sm_socket;
11767+extern uint32_t sm_our_nodeid;
11768+static uint32_t global_last_id;
11769+static struct list_head messages;
11770+static spinlock_t message_lock;
11771+static char smsg_buf[SMSG_BUF_SIZE];
11772+
11773+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
11774+
11775+struct rq_entry {
11776+ struct list_head list;
11777+ char *msg;
11778+ int len;
11779+ uint32_t nodeid;
11780+};
11781+typedef struct rq_entry rq_entry_t;
11782+
11783+void init_messages(void)
11784+{
11785+ global_last_id = 1;
11786+ INIT_LIST_HEAD(&messages);
11787+ spin_lock_init(&message_lock);
11788+}
11789+
11790+uint32_t sm_new_global_id(int level)
11791+{
11792+ uint32_t id = global_last_id++;
11793+ uint8_t l = (uint8_t) level;
11794+
11795+ if (level > 255)
11796+ return 0;
11797+
11798+ if (id > 0x00FFFFFF)
11799+ return 0;
11800+
11801+ id |= (l << 24);
11802+ return id;
11803+}
11804+
11805+static void smsg_copy_in(char *msg, sm_msg_t *smsg)
11806+{
11807+ sm_msg_t *in = (sm_msg_t *) msg;
11808+
11809+ smsg->ms_type = in->ms_type;
11810+ smsg->ms_status = in->ms_status;
11811+ smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
11812+ smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
11813+ smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
11814+ smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
11815+ smsg->ms_length = le16_to_cpu(in->ms_length);
11816+}
11817+
11818+/* swapping bytes in place is an easy source of errors - be careful not to
11819+ * access the fields after calling this */
11820+
11821+void smsg_bswap_out(sm_msg_t *smsg)
11822+{
11823+ smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
11824+ smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
11825+ smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
11826+ smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
11827+ smsg->ms_length = cpu_to_le16(smsg->ms_length);
11828+}
11829+
11830+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
11831+ sm_sevent_t *sev)
11832+{
11833+ char *msg;
11834+ sm_msg_t *smsg;
11835+ int fulllen = sizeof(sm_msg_t) + datalen;
11836+
11837+ msg = smsg_buf;
11838+ memset(smsg_buf, 0, SMSG_BUF_SIZE);
11839+ SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
11840+
11841+ smsg = (sm_msg_t *) msg;
11842+ smsg->ms_type = type;
11843+ smsg->ms_global_sgid = sg->global_id;
11844+ smsg->ms_sglevel = sg->level;
11845+ smsg->ms_length = datalen;
11846+ smsg->ms_sevent_id = sev ? sev->se_id : 0;
11847+
11848+ smsg_bswap_out(smsg);
11849+ *msglen = fulllen;
11850+ return msg;
11851+}
11852+
11853+static unsigned int msgtype_to_flag(int type)
11854+{
11855+ unsigned int flag;
11856+
11857+ switch (type) {
11858+ case SMSG_JOIN_REP:
11859+ case SMSG_JOIN_REQ:
11860+ flag = SEFL_ALLOW_JOIN;
11861+ break;
11862+
11863+ case SMSG_JSTOP_REP:
11864+ case SMSG_JSTOP_REQ:
11865+ flag = SEFL_ALLOW_JSTOP;
11866+ break;
11867+
11868+ case SMSG_LEAVE_REP:
11869+ case SMSG_LEAVE_REQ:
11870+ flag = SEFL_ALLOW_LEAVE;
11871+ break;
11872+
11873+ case SMSG_LSTOP_REP:
11874+ case SMSG_LSTOP_REQ:
11875+ flag = SEFL_ALLOW_LSTOP;
11876+ break;
11877+
11878+ default:
11879+ SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
11880+ }
11881+ return flag;
11882+}
11883+
b7b72b66 11884+static int test_allowed_msgtype(sm_sevent_t *sev, int type)
c1c6733f
AM
11885+{
11886+ unsigned int flag = msgtype_to_flag(type);
11887+
11888+ return test_bit(flag, &sev->se_flags);
11889+}
11890+
b7b72b66 11891+static void clear_allowed_msgtype(sm_sevent_t *sev, int type)
c1c6733f
AM
11892+{
11893+ unsigned int flag = msgtype_to_flag(type);
11894+
11895+ clear_bit(flag, &sev->se_flags);
11896+}
11897+
b7b72b66 11898+static void set_allowed_msgtype(sm_sevent_t *sev, int type)
c1c6733f
AM
11899+{
11900+ unsigned int flag = msgtype_to_flag(type);
11901+
11902+ set_bit(flag, &sev->se_flags);
11903+}
11904+
b7b72b66 11905+static int save_global_id(sm_sevent_t *sev, sm_msg_t *smsg)
c1c6733f
AM
11906+{
11907+ sm_group_t *sg = sev->se_sg;
11908+
11909+ if (!smsg->ms_global_sgid) {
11910+ log_error(sg, "save_global_id: zero sg id");
11911+ return -1;
11912+ }
11913+
11914+ if (!sg->global_id)
11915+ sg->global_id = smsg->ms_global_sgid;
11916+
11917+ if (sg->global_id != smsg->ms_global_sgid) {
11918+ log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
11919+ return -1;
11920+ }
11921+ return 0;
11922+}
11923+
b7b72b66 11924+static void save_lastid(sm_msg_t *smsg)
c1c6733f
AM
11925+{
11926+ uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
11927+
11928+ /*
11929+ * Keep track of the highst SG id which has been used
11930+ * in the cluster in case we need to choose a new SG id.
11931+ */
11932+
11933+ if (gid > global_last_id)
11934+ global_last_id = gid;
11935+}
11936+
11937+static int next_sev_state(int msg_type, int cur_state)
11938+{
11939+ int next = 0;
11940+
11941+ switch (msg_type) {
11942+ case SMSG_JOIN_REP:
11943+ SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
11944+ next = SEST_JOIN_ACKED;
11945+ break;
11946+
11947+ case SMSG_JSTOP_REP:
11948+ SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
11949+ next = SEST_JSTOP_ACKED;
11950+ break;
11951+
11952+ case SMSG_LEAVE_REP:
11953+ SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
11954+ next = SEST_LEAVE_ACKED;
11955+ break;
11956+
11957+ case SMSG_LSTOP_REP:
11958+ SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
11959+ next = SEST_LSTOP_ACKED;
11960+ break;
11961+ }
11962+ return next;
11963+}
11964+
11965+/*
11966+ * Functions in sevent.c send messages to other nodes and then expect replies.
11967+ * This function collects the replies for the sevent messages and moves the
11968+ * sevent to the next stage when all the expected replies have been received.
11969+ */
11970+
b7b72b66 11971+static void process_reply(sm_msg_t *smsg, uint32_t nodeid)
c1c6733f
AM
11972+{
11973+ sm_sevent_t *sev;
11974+ int i, expected, type = smsg->ms_type;
11975+
11976+ /*
11977+ * Find the relevant sevent.
11978+ */
11979+
11980+ sev = find_sevent(smsg->ms_sevent_id);
11981+ if (!sev) {
11982+ log_print("process_reply invalid id=%u nodeid=%u",
11983+ smsg->ms_sevent_id, nodeid);
11984+ goto out;
11985+ }
11986+
11987+ /*
11988+ * Check if this message type is what this sevent is waiting for.
11989+ */
11990+
11991+ if (!test_allowed_msgtype(sev, type)) {
11992+ log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u " "id=%u", type, nodeid, sev->se_id);
11993+ goto out;
11994+ }
11995+
11996+ expected =
11997+ (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
11998+
11999+ SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
12000+ printk("type=%d expected=%d len_ids=%d node_count=%d "
12001+ "memb_count=%d\n", type, expected, sev->se_len_ids,
12002+ sev->se_node_count, sev->se_memb_count););
12003+
12004+ SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
12005+ printk("type=%d expected=%d len_status=%d node_count=%d "
12006+ "memb_count=%d\n", type, expected, sev->se_len_status,
12007+ sev->se_node_count, sev->se_memb_count););
12008+
12009+ for (i = 0; i < expected; i++) {
12010+ if (sev->se_node_ids[i] == nodeid) {
12011+ /*
12012+ * Save the status from the replying node
12013+ */
12014+
12015+ if (!sev->se_node_status[i])
12016+ sev->se_node_status[i] = smsg->ms_status;
12017+ else {
12018+ log_error(sev->se_sg, "process_reply duplicate"
12019+ "id=%u nodeid=%u %u/%u",
12020+ sev->se_id, nodeid,
12021+ sev->se_node_status[i],
12022+ smsg->ms_status);
12023+ goto out;
12024+ }
12025+
12026+ if (type == SMSG_JOIN_REP) {
12027+ save_lastid(smsg);
12028+
12029+ if (smsg->ms_status == STATUS_POS)
12030+ save_global_id(sev, smsg);
12031+ }
12032+
12033+ /*
12034+ * Signal sm if we have all replies
12035+ */
12036+
12037+ if (++sev->se_reply_count == expected) {
12038+ clear_allowed_msgtype(sev, type);
b7b72b66 12039+ sev->se_state = next_sev_state(type,
c1c6733f
AM
12040+ sev->se_state);
12041+ set_bit(SEFL_CHECK, &sev->se_flags);
12042+ wake_serviced(DO_JOINLEAVE);
12043+ }
12044+
12045+ break;
12046+ }
12047+ }
12048+
12049+ out:
12050+ return;
12051+}
12052+
12053+/*
12054+ * A node wants to join an SG and has run send_join_notice. If we know nothing
12055+ * about the SG , then we have no objection - send back STATUS_POS. If we're a
12056+ * member of the SG, then send back STATUS_POS (go ahead and join) if there's
12057+ * no sevent or uevent of higher priority in progress (only a single join or
12058+ * leave is permitted for the SG at once). If there happens to be a higher
12059+ * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
12060+ * requested join for a bit.
12061+ */
12062+
12063+static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
12064+{
12065+ sm_group_t *sg = NULL;
12066+ sm_sevent_t *sev = NULL;
12067+ sm_node_t *node;
12068+ int found = FALSE;
12069+ int level = smsg->ms_sglevel;
12070+ sm_msg_t reply;
12071+
12072+ memset(&reply, 0, sizeof(reply));
12073+
12074+ down(&sm_sglock);
12075+
12076+ if (nodeid == sm_our_nodeid)
12077+ goto next;
12078+
12079+ /*
12080+ * search SG list for an SG with given name/len
12081+ */
12082+
12083+ list_for_each_entry(sg, &sm_sg[level], list) {
12084+ if ((sg->namelen != smsg->ms_length) ||
12085+ memcmp(sg->name, name, sg->namelen))
12086+ continue;
12087+ found = TRUE;
12088+ break;
12089+ }
12090+
12091+ /*
12092+ * build reply message
12093+ */
12094+
12095+ next:
12096+
12097+ if (!found) {
12098+ reply.ms_type = SMSG_JOIN_REP;
12099+ reply.ms_status = STATUS_NEG;
12100+ reply.ms_global_lastid = global_last_id;
12101+ reply.ms_sevent_id = smsg->ms_sevent_id;
12102+ } else {
12103+ reply.ms_type = SMSG_JOIN_REP;
12104+ reply.ms_status = STATUS_POS;
12105+ reply.ms_sevent_id = smsg->ms_sevent_id;
12106+ reply.ms_global_sgid = sg->global_id;
12107+ reply.ms_global_lastid = global_last_id;
12108+
12109+ /*
12110+ * The node trying to join should wait and try again until
12111+ * we're done with recovery.
12112+ */
12113+
12114+ if (sg->state == SGST_RECOVER) {
12115+ reply.ms_status = STATUS_WAIT;
12116+ goto send;
12117+ }
12118+
12119+ /*
12120+ * An sevent node trying to join may have gotten as far as
12121+ * creating a uevent with us and then backed out. That node
12122+ * will retry joining from the beginning so we should not turn
12123+ * them away. If we're handling a uevent for another node,
12124+ * tell the joining node to wait.
12125+ */
12126+
12127+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
12128+ if (sg->uevent.ue_nodeid != nodeid)
12129+ reply.ms_status = STATUS_WAIT;
12130+ goto send;
12131+ }
12132+
12133+ /*
12134+ * We're trying to join or leave the SG at the moment.
12135+ */
12136+
12137+ if (test_bit(SGFL_SEVENT, &sg->flags)) {
12138+ sev = sg->sevent;
12139+
12140+ /*
12141+ * We're trying to leave. Make the join wait until
12142+ * we've left if we're beyond LEAVE_ACKWAIT.
12143+ */
12144+
12145+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12146+ if (sev->se_state > SEST_LEAVE_ACKED)
12147+ reply.ms_status = STATUS_WAIT;
12148+ else {
12149+ reply.ms_status = STATUS_POS;
12150+ clear_bit(SEFL_ALLOW_LEAVE,
12151+ &sev->se_flags);
12152+ set_bit(SEFL_CANCEL, &sev->se_flags);
12153+ }
12154+ }
12155+
12156+ /*
12157+ * We're trying to join. Making the other join wait
12158+ * until we're joined if we're beyond JOIN_ACKWAIT or
12159+ * if we have a lower id. (Send NEG to allow the other
12160+ * node to go ahead because we're not in the SG.)
12161+ */
12162+
12163+ else {
12164+ if (sev->se_state > SEST_JOIN_ACKED)
12165+ reply.ms_status = STATUS_WAIT;
12166+ else if (sm_our_nodeid < nodeid)
12167+ reply.ms_status = STATUS_WAIT;
12168+ else {
12169+ reply.ms_status = STATUS_NEG;
12170+ clear_bit(SEFL_ALLOW_JOIN,
12171+ &sev->se_flags);
12172+ set_bit(SEFL_CANCEL, &sev->se_flags);
12173+ }
12174+ }
12175+
12176+ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12177+ set_bit(SEFL_CHECK, &sev->se_flags);
12178+ wake_serviced(DO_JOINLEAVE);
12179+ }
12180+ goto send;
12181+ }
12182+
12183+ /* no r,u,s event, stick with STATUS_POS */
12184+ }
12185+
12186+ send:
12187+
12188+ if (reply.ms_status == STATUS_POS) {
12189+ node = sm_find_joiner(sg, nodeid);
12190+ if (!node) {
12191+ node = sm_new_node(nodeid);
12192+ list_add_tail(&node->list, &sg->joining);
12193+ }
12194+ }
12195+
12196+ up(&sm_sglock);
12197+ smsg_bswap_out(&reply);
12198+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12199+}
12200+
12201+/*
12202+ * Another node wants us to stop a service so it can join or leave the SG. We
12203+ * do this by saving the request info in a uevent and having the sm thread do
12204+ * the processing and then replying.
12205+ */
12206+
b7b72b66
AM
12207+static void process_stop_request(sm_msg_t *smsg, uint32_t nodeid,
12208+ uint32_t *msgbuf)
c1c6733f
AM
12209+{
12210+ sm_group_t *sg;
12211+ sm_uevent_t *uev;
12212+ sm_msg_t reply;
12213+ int type = smsg->ms_type;
12214+
12215+ if (nodeid == sm_our_nodeid)
12216+ goto agree;
12217+
12218+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12219+ if (!sg) {
12220+ log_print("process_stop_request: unknown sg id %x",
12221+ smsg->ms_global_sgid);
12222+ return;
12223+ }
12224+
12225+ /*
12226+ * We shouldn't get here with uevent already set.
12227+ */
12228+
12229+ if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
12230+ log_error(sg, "process_stop_request: uevent already set");
12231+ return;
12232+ }
12233+
12234+ uev = &sg->uevent;
12235+ uev->ue_nodeid = nodeid;
12236+ uev->ue_remote_seid = smsg->ms_sevent_id;
12237+ uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
12238+
12239+ if (type == SMSG_JSTOP_REQ)
12240+ uev->ue_num_nodes = be32_to_cpu(*msgbuf);
12241+ else
12242+ set_bit(UEFL_LEAVE, &uev->ue_flags);
12243+
12244+ /*
12245+ * Do process_join_stop() or process_leave_stop().
12246+ */
12247+
12248+ set_bit(UEFL_CHECK, &uev->ue_flags);
12249+ wake_serviced(DO_MEMBERSHIP);
12250+ return;
12251+
12252+ agree:
12253+ reply.ms_status = STATUS_POS;
12254+ reply.ms_type =
12255+ (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
12256+ reply.ms_sevent_id = smsg->ms_sevent_id;
12257+ smsg_bswap_out(&reply);
12258+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12259+}
12260+
b7b72b66 12261+static void process_start_request(sm_msg_t *smsg, uint32_t nodeid)
c1c6733f
AM
12262+{
12263+ sm_group_t *sg;
12264+ sm_uevent_t *uev;
12265+ int type = smsg->ms_type;
12266+
12267+ if (nodeid == sm_our_nodeid)
12268+ return;
12269+
12270+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12271+ if (!sg) {
12272+ log_print("process_start_request: unknown sg id %x",
12273+ smsg->ms_global_sgid);
12274+ return;
12275+ }
12276+
12277+ if (!test_bit(SGFL_UEVENT, &sg->flags)) {
12278+ log_error(sg, "process_start_request: no uevent");
12279+ return;
12280+ }
12281+
12282+ uev = &sg->uevent;
12283+
12284+ if (type == SMSG_JSTART_CMD)
12285+ uev->ue_state = UEST_JSTART;
12286+ else
12287+ uev->ue_state = UEST_LSTART;
12288+
12289+ set_bit(UEFL_CHECK, &uev->ue_flags);
12290+ wake_serviced(DO_MEMBERSHIP);
12291+}
12292+
b7b72b66 12293+static void process_leave_request(sm_msg_t *smsg, uint32_t nodeid)
c1c6733f
AM
12294+{
12295+ sm_group_t *sg;
12296+ sm_node_t *node;
12297+ sm_msg_t reply;
12298+ sm_sevent_t *sev;
12299+ int found = FALSE;
12300+
12301+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12302+ if (sg) {
12303+ if (nodeid == sm_our_nodeid)
12304+ found = TRUE;
12305+ else {
12306+ list_for_each_entry(node, &sg->memb, list) {
12307+ if (node->id != nodeid)
12308+ continue;
12309+ set_bit(SNFL_LEAVING, &node->flags);
12310+ found = TRUE;
12311+ break;
12312+ }
12313+ }
12314+ }
12315+
12316+ if (!found) {
12317+ reply.ms_type = SMSG_LEAVE_REP;
12318+ reply.ms_status = STATUS_NEG;
12319+ reply.ms_sevent_id = smsg->ms_sevent_id;
12320+ } else {
12321+ reply.ms_type = SMSG_LEAVE_REP;
12322+ reply.ms_status = STATUS_POS;
12323+ reply.ms_sevent_id = smsg->ms_sevent_id;
12324+
12325+ if (sg->state == SGST_RECOVER)
12326+ reply.ms_status = STATUS_WAIT;
12327+
12328+ else if (test_bit(SGFL_SEVENT, &sg->flags) &&
12329+ nodeid != sm_our_nodeid) {
12330+ sev = sg->sevent;
12331+
12332+ /*
12333+ * We're trying to join or leave at the moment. If
12334+ * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
12335+ * wait. Otherwise, if joining we'll cancel to let the
12336+ * leave happen first, or if we're leaving allow the
12337+ * lower nodeid to leave first.
12338+ */
12339+
12340+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12341+ if (sev->se_state > SEST_LEAVE_ACKWAIT)
12342+ reply.ms_status = STATUS_WAIT;
12343+ else if (sm_our_nodeid < nodeid)
12344+ reply.ms_status = STATUS_WAIT;
12345+ else {
12346+ reply.ms_status = STATUS_POS;
12347+ clear_bit(SEFL_ALLOW_LEAVE,
12348+ &sev->se_flags);
12349+ set_bit(SEFL_CANCEL, &sev->se_flags);
12350+ }
12351+ } else {
12352+ if (sev->se_state > SEST_JOIN_ACKWAIT)
12353+ reply.ms_status = STATUS_WAIT;
12354+ else {
12355+ reply.ms_status = STATUS_NEG;
12356+ clear_bit(SEFL_ALLOW_JOIN,
12357+ &sev->se_flags);
12358+ set_bit(SEFL_CANCEL, &sev->se_flags);
12359+ }
12360+ }
12361+
12362+ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12363+ set_bit(SEFL_CHECK, &sev->se_flags);
12364+ wake_serviced(DO_JOINLEAVE);
12365+ }
12366+ }
12367+
12368+ else if (test_bit(SGFL_UEVENT, &sg->flags)) {
12369+ if (sg->uevent.ue_nodeid != nodeid)
12370+ reply.ms_status = STATUS_WAIT;
12371+ }
12372+
12373+ }
12374+
12375+ smsg_bswap_out(&reply);
12376+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12377+}
12378+
12379+/*
12380+ * Each remaining node will send us a done message. We quit when we get the
12381+ * first. The subsequent done messages for the finished sevent get here and
12382+ * are ignored.
12383+ */
12384+
12385+static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
12386+{
12387+ sm_sevent_t *sev;
12388+
12389+ sev = find_sevent(smsg->ms_sevent_id);
12390+ if (!sev)
12391+ return;
12392+
12393+ if (sev->se_state != SEST_LSTART_WAITREMOTE)
12394+ return;
12395+
12396+ sev->se_state = SEST_LSTART_REMOTEDONE;
12397+ set_bit(SEFL_CHECK, &sev->se_flags);
12398+ wake_serviced(DO_JOINLEAVE);
12399+}
12400+
12401+/*
12402+ * This function and everything it calls always runs in sm context.
12403+ */
12404+
12405+static void process_message(char *msg, uint32_t nodeid)
12406+{
12407+ sm_msg_t smsg;
12408+
12409+ smsg_copy_in(msg, &smsg);
12410+
12411+ switch (smsg.ms_type) {
12412+ case SMSG_JOIN_REQ:
12413+ process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
12414+ break;
12415+
12416+ case SMSG_JSTOP_REQ:
12417+ process_stop_request(&smsg, nodeid,
12418+ (uint32_t *) (msg + sizeof(sm_msg_t)));
12419+ break;
12420+
12421+ case SMSG_LEAVE_REQ:
12422+ process_leave_request(&smsg, nodeid);
12423+ break;
12424+
12425+ case SMSG_LSTOP_REQ:
12426+ process_stop_request(&smsg, nodeid, NULL);
12427+ break;
12428+
12429+ case SMSG_JSTART_CMD:
12430+ case SMSG_LSTART_CMD:
12431+ process_start_request(&smsg, nodeid);
12432+ break;
12433+
12434+ case SMSG_LSTART_DONE:
12435+ process_lstart_done(&smsg, nodeid);
12436+ break;
12437+
12438+ case SMSG_JOIN_REP:
12439+ case SMSG_JSTOP_REP:
12440+ case SMSG_LEAVE_REP:
12441+ case SMSG_LSTOP_REP:
12442+ process_reply(&smsg, nodeid);
12443+ break;
12444+
12445+ case SMSG_RECOVER:
12446+ process_recover_msg(&smsg, nodeid);
12447+ break;
12448+
12449+ default:
12450+ log_print("process_message: unknown type %u nodeid %u",
12451+ smsg.ms_type, nodeid);
12452+ }
12453+}
12454+
12455+/*
12456+ * Always called from sm context.
12457+ */
12458+
12459+void process_messages(void)
12460+{
12461+ rq_entry_t *re;
12462+
12463+ while (1) {
12464+ re = NULL;
12465+
12466+ spin_lock(&message_lock);
12467+ if (!list_empty(&messages)) {
12468+ re = list_entry(messages.next, rq_entry_t, list);
12469+ list_del(&re->list);
12470+ }
12471+ spin_unlock(&message_lock);
12472+
12473+ if (!re)
12474+ break;
12475+ process_message(re->msg, re->nodeid);
12476+ kfree(re->msg);
12477+ kfree(re);
12478+ schedule();
12479+ }
12480+}
12481+
12482+/*
12483+ * Context: cnxman and sm
12484+ */
12485+
12486+static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
12487+{
12488+ rq_entry_t *re;
12489+
12490+ SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
12491+ re);
12492+ SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
12493+
12494+ memcpy(re->msg, msg, len);
12495+ re->len = len;
12496+ re->nodeid = nodeid;
12497+
12498+ spin_lock(&message_lock);
12499+ list_add_tail(&re->list, &messages);
12500+ spin_unlock(&message_lock);
12501+
12502+ wake_serviced(DO_MESSAGES);
12503+ return 0;
12504+}
12505+
12506+/*
12507+ * Context: cnxman
12508+ * Called by cnxman when a service manager message arrives.
12509+ */
12510+
12511+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12512+ unsigned int node_id)
12513+{
b7b72b66
AM
12514+ if (!node_id)
12515+ return -EINVAL;
12516+ return add_to_recvqueue(msg, len, node_id);
c1c6733f
AM
12517+}
12518+
12519+/*
12520+ * These send routines are used by sm and are always called from sm context.
12521+ */
12522+
12523+int send_nodeid_message(char *msg, int len, uint32_t nodeid)
12524+{
12525+ int error = 0;
12526+ struct sockaddr_cl saddr;
12527+
12528+ if (nodeid == sm_our_nodeid) {
12529+ add_to_recvqueue(msg, len, nodeid);
12530+ goto out;
12531+ }
12532+
12533+ saddr.scl_family = AF_CLUSTER;
12534+ saddr.scl_port = CLUSTER_PORT_SERVICES;
12535+ saddr.scl_nodeid = nodeid;
b7b72b66 12536+ error = kcl_sendmsg(sm_socket, msg, len, &saddr, sizeof(saddr), 0);
c1c6733f
AM
12537+ if (error > 0)
12538+ error = 0;
12539+
12540+ if (error)
12541+ log_print("send_nodeid_message error %d to %u", error, nodeid);
12542+ out:
12543+ return error;
12544+}
12545+
12546+int send_broadcast_message(char *msg, int len)
12547+{
12548+ int error;
12549+
12550+ error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
12551+ if (error > 0)
12552+ error = 0;
12553+
12554+ add_to_recvqueue(msg, len, sm_our_nodeid);
12555+
12556+ if (error)
12557+ log_print("send_broadcast_message error %d", error);
12558+
12559+ return error;
12560+}
12561+
12562+int send_members_message(sm_group_t *sg, char *msg, int len)
12563+{
12564+ sm_node_t *node;
12565+ int error = 0;
12566+
12567+ list_for_each_entry(node, &sg->memb, list) {
12568+ error = send_nodeid_message(msg, len, node->id);
12569+ if (error < 0)
12570+ break;
12571+ }
12572+ return error;
12573+}
12574+
12575+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12576+ sm_sevent_t * sev)
12577+{
12578+ int error;
12579+ sm_msg_t *smsg = (sm_msg_t *) msg;
12580+
12581+ set_allowed_msgtype(sev, smsg->ms_type);
12582+ sev->se_reply_count = 0;
12583+
12584+ error = send_members_message(sg, msg, len);
12585+ if (error < 0)
12586+ clear_allowed_msgtype(sev, smsg->ms_type);
12587+
12588+ return error;
12589+}
12590+
12591+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
12592+{
12593+ int error;
12594+ sm_msg_t *smsg = (sm_msg_t *) msg;
12595+
12596+ set_allowed_msgtype(sev, smsg->ms_type);
12597+ sev->se_reply_count = 0;
12598+
12599+ error = send_broadcast_message(msg, len);
12600+ if (error < 0)
12601+ clear_allowed_msgtype(sev, smsg->ms_type);
12602+
12603+ return error;
12604+}
12605diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
bb1d8b11
AM
12606--- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
12607+++ linux-patched/cluster/cman/sm_message.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
12608@@ -0,0 +1,34 @@
12609+/******************************************************************************
12610+*******************************************************************************
12611+**
12612+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12613+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12614+**
12615+** This copyrighted material is made available to anyone wishing to use,
12616+** modify, copy, or redistribute it subject to the terms and conditions
12617+** of the GNU General Public License v.2.
12618+**
12619+*******************************************************************************
12620+******************************************************************************/
12621+
12622+#ifndef __SM_MESSAGE_DOT_H__
12623+#define __SM_MESSAGE_DOT_H__
12624+
12625+void init_messages(void);
12626+uint32_t sm_new_global_id(int level);
12627+void smsg_bswap_out(sm_msg_t * smsg);
12628+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
12629+ sm_sevent_t *sev);
12630+void process_messages(void);
12631+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12632+ unsigned int node_id);
12633+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
12634+int send_broadcast_message(char *msg, int len);
12635+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
12636+int send_members_message(sm_group_t *sg, char *msg, int len);
12637+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12638+ sm_sevent_t * sev);
12639+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12640+ unsigned int node_id);
12641+
12642+#endif
12643diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
bb1d8b11
AM
12644--- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
12645+++ linux-patched/cluster/cman/sm_misc.c 2004-11-03 11:37:37.000000000 +0800
b7b72b66 12646@@ -0,0 +1,442 @@
c1c6733f
AM
12647+/******************************************************************************
12648+*******************************************************************************
12649+**
12650+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12651+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
b7b72b66 12652+**
c1c6733f
AM
12653+** This copyrighted material is made available to anyone wishing to use,
12654+** modify, copy, or redistribute it subject to the terms and conditions
12655+** of the GNU General Public License v.2.
12656+**
12657+*******************************************************************************
12658+******************************************************************************/
12659+
12660+#include "sm.h"
12661+#include "config.h"
b7b72b66 12662+#include <linux/seq_file.h>
c1c6733f
AM
12663+
12664+#define MAX_DEBUG_MSG_LEN (40)
12665+
12666+extern struct list_head sm_members;
12667+static uint32_t local_ids;
12668+static uint32_t event_id;
12669+static spinlock_t event_id_lock;
12670+static char * debug_buf;
12671+static unsigned int debug_size;
12672+static unsigned int debug_point;
12673+static int debug_wrap;
12674+static spinlock_t debug_lock;
12675+
12676+
12677+void init_sm_misc(void)
12678+{
12679+ local_ids = 1;
12680+ event_id = 1;
12681+ spin_lock_init(&event_id_lock);
12682+ debug_buf = NULL;
12683+ debug_size = 0;
12684+ debug_point = 0;
12685+ debug_wrap = 0;
12686+ spin_lock_init(&debug_lock);
12687+
12688+ sm_debug_setup(cman_config.sm_debug_size);
12689+}
12690+
12691+sm_node_t *sm_new_node(uint32_t nodeid)
12692+{
12693+ struct kcl_cluster_node kclnode;
12694+ sm_node_t *node;
12695+ int error;
12696+
12697+ error = kcl_get_node_by_nodeid(nodeid, &kclnode);
12698+ SM_ASSERT(!error,);
12699+
12700+ SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
12701+ node);
12702+
12703+ memset(node, 0, sizeof(sm_node_t));
12704+ node->id = nodeid;
12705+ node->incarnation = kclnode.incarnation;
12706+ return node;
12707+}
12708+
12709+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
12710+{
12711+ sm_node_t *node;
12712+
12713+ list_for_each_entry(node, &sg->joining, list) {
12714+ if (node->id == nodeid)
12715+ return node;
12716+ }
12717+ return NULL;
12718+}
12719+
12720+sm_node_t *sm_find_member(uint32_t nodeid)
12721+{
12722+ sm_node_t *node;
12723+
12724+ list_for_each_entry(node, &sm_members, list) {
12725+ if (node->id == nodeid)
12726+ return node;
12727+ }
12728+ return NULL;
12729+}
12730+
12731+uint32_t sm_new_local_id(int level)
12732+{
12733+ uint32_t id = local_ids++;
12734+ uint8_t l = (uint8_t) level;
12735+
12736+ if (level > 0xFF)
12737+ return 0;
12738+
12739+ if (id > 0x00FFFFFF)
12740+ return 0;
12741+
12742+ id |= (l << 24);
12743+ return id;
12744+}
12745+
12746+int sm_id_to_level(uint32_t id)
12747+{
12748+ uint8_t l = (id & 0xFF000000) >> 24;
12749+
12750+ return (int) l;
12751+}
12752+
12753+void sm_set_event_id(int *id)
12754+{
12755+ spin_lock(&event_id_lock);
12756+ *id = event_id++;
12757+ spin_unlock(&event_id_lock);
12758+}
12759+
12760+sm_group_t *sm_local_id_to_sg(int id)
12761+{
12762+ sm_group_t *sg;
12763+ int level = sm_id_to_level(id);
12764+ int found = FALSE;
12765+
12766+ down(&sm_sglock);
12767+
12768+ list_for_each_entry(sg, &sm_sg[level], list) {
12769+ if (sg->local_id == id) {
12770+ found = TRUE;
12771+ break;
12772+ }
12773+ }
12774+ up(&sm_sglock);
12775+ if (!found)
12776+ sg = NULL;
12777+ return sg;
12778+}
12779+
12780+sm_group_t *sm_global_id_to_sg(int id)
12781+{
12782+ sm_group_t *sg;
12783+ int level = sm_id_to_level(id);
12784+ int found = FALSE;
12785+
12786+ down(&sm_sglock);
12787+
12788+ list_for_each_entry(sg, &sm_sg[level], list) {
12789+ if (sg->global_id == id) {
12790+ found = TRUE;
12791+ break;
12792+ }
12793+ }
12794+ up(&sm_sglock);
12795+ if (!found)
12796+ sg = NULL;
12797+ return sg;
12798+}
12799+
12800+void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
12801+{
12802+ va_list va;
12803+ int i, n, size, len;
12804+ char buf[MAX_DEBUG_MSG_LEN+1];
12805+
12806+ spin_lock(&debug_lock);
12807+
12808+ if (!debug_buf)
12809+ goto out;
12810+
12811+ size = MAX_DEBUG_MSG_LEN;
12812+ memset(buf, 0, size+1);
12813+
12814+ n = snprintf(buf, size, "%08x ", sg->global_id);
12815+ size -= n;
12816+
12817+ va_start(va, fmt);
12818+ vsnprintf(buf+n, size, fmt, va);
12819+ va_end(va);
12820+
12821+ len = strlen(buf);
12822+ if (len > MAX_DEBUG_MSG_LEN-1)
12823+ len = MAX_DEBUG_MSG_LEN-1;
12824+ buf[len] = '\n';
12825+ buf[len+1] = '\0';
12826+
12827+ for (i = 0; i < strlen(buf); i++) {
12828+ debug_buf[debug_point++] = buf[i];
12829+
12830+ if (debug_point == debug_size) {
12831+ debug_point = 0;
12832+ debug_wrap = 1;
12833+ }
12834+ }
12835+ out:
12836+ spin_unlock(&debug_lock);
12837+}
12838+
12839+void sm_debug_setup(int size)
12840+{
12841+ char *b = kmalloc(size, GFP_KERNEL);
12842+
12843+ spin_lock(&debug_lock);
12844+ if (debug_buf)
12845+ kfree(debug_buf);
12846+
12847+ if (size > PAGE_SIZE)
12848+ size = PAGE_SIZE;
12849+ debug_size = size;
12850+ debug_point = 0;
12851+ debug_wrap = 0;
12852+ debug_buf = b;
12853+ memset(debug_buf, 0, debug_size);
12854+ spin_unlock(&debug_lock);
12855+}
12856+
12857+#ifdef CONFIG_PROC_FS
b7b72b66
AM
12858+static struct seq_operations sm_info_op;
12859+
12860+struct sm_seq_info
12861+{
12862+ int pos;
12863+ int level;
12864+ sm_group_t *sg;
12865+};
c1c6733f
AM
12866+
12867+int sm_debug_info(char *b, char **start, off_t offset, int length)
12868+{
12869+ int i, n = 0;
12870+
12871+ spin_lock(&debug_lock);
12872+
12873+ if (debug_wrap) {
12874+ for (i = debug_point; i < debug_size; i++)
12875+ n += sprintf(b + n, "%c", debug_buf[i]);
12876+ }
12877+ for (i = 0; i < debug_point; i++)
12878+ n += sprintf(b + n, "%c", debug_buf[i]);
12879+
12880+ spin_unlock(&debug_lock);
12881+
12882+ return n;
12883+}
12884+
c1c6733f 12885+
c1c6733f 12886+
b7b72b66
AM
12887+static sm_group_t *sm_walk(loff_t offset, int *rlevel)
12888+{
12889+ sm_group_t *sg;
12890+ int level;
12891+ loff_t n = 0;
c1c6733f
AM
12892+
12893+ down(&sm_sglock);
12894+
12895+ for (level = 0; level < SG_LEVELS; level++) {
12896+ list_for_each_entry(sg, &sm_sg[level], list) {
b7b72b66
AM
12897+ if (++n == offset)
12898+ goto walk_finish;
12899+ }
12900+ }
12901+ sg = NULL;
c1c6733f 12902+
b7b72b66
AM
12903+ walk_finish:
12904+ up(&sm_sglock);
12905+ *rlevel = level;
c1c6733f 12906+
b7b72b66
AM
12907+ return sg;
12908+}
c1c6733f 12909+
c1c6733f 12910+
b7b72b66
AM
12911+static void *sm_seq_start(struct seq_file *m, loff_t * pos)
12912+{
12913+ struct sm_seq_info *ssi =
12914+ kmalloc(sizeof (struct sm_seq_info), GFP_KERNEL);
c1c6733f 12915+
b7b72b66
AM
12916+ if (!ssi)
12917+ return NULL;
c1c6733f 12918+
b7b72b66
AM
12919+ ssi->pos = *pos;
12920+ ssi->level = 0;
12921+ ssi->sg = NULL;
c1c6733f 12922+
b7b72b66
AM
12923+ /* Print the header */
12924+ if (*pos == 0) {
12925+ seq_printf(m,
12926+ "Service Name GID LID State Code\n");
12927+ }
12928+ return ssi;
12929+}
c1c6733f 12930+
b7b72b66
AM
12931+static void *sm_seq_next(struct seq_file *m, void *p, loff_t * pos)
12932+{
12933+ struct sm_seq_info *ssi = p;
c1c6733f 12934+
b7b72b66 12935+ *pos = ++ssi->pos;
c1c6733f 12936+
b7b72b66
AM
12937+ if ( !(ssi->sg = sm_walk(ssi->pos, &ssi->level)) )
12938+ return NULL;
c1c6733f 12939+
b7b72b66
AM
12940+ return ssi;
12941+}
12942+
12943+/* Called from /proc when /proc/cluster/services is opened */
12944+int sm_proc_open(struct inode *inode, struct file *file)
12945+{
12946+ return seq_open(file, &sm_info_op);
12947+}
c1c6733f 12948+
b7b72b66
AM
12949+static int sm_seq_show(struct seq_file *s, void *p)
12950+{
12951+ struct sm_seq_info *ssi = p;
12952+ sm_node_t *node;
12953+ int i;
c1c6733f 12954+
b7b72b66
AM
12955+ if (!ssi || !ssi->sg)
12956+ return 0;
c1c6733f 12957+
b7b72b66
AM
12958+ /*
12959+ * Cluster Service
12960+ */
c1c6733f 12961+
b7b72b66
AM
12962+ switch (ssi->level) {
12963+ case SERVICE_LEVEL_FENCE:
12964+ seq_printf(s, "Fence Domain: ");
12965+ break;
12966+ case SERVICE_LEVEL_GDLM:
12967+ seq_printf(s, "DLM Lock Space: ");
12968+ break;
12969+ case SERVICE_LEVEL_GFS:
12970+ seq_printf(s, "GFS Mount Group: ");
12971+ break;
12972+ case SERVICE_LEVEL_USER:
12973+ seq_printf(s, "User: ");
12974+ break;
12975+ }
c1c6733f 12976+
b7b72b66
AM
12977+ /*
12978+ * Name
12979+ */
12980+
12981+ seq_printf(s, "\"");
12982+ for (i = 0; i < ssi->sg->namelen; i++)
12983+ seq_printf(s, "%c", ssi->sg->name[i]);
12984+ seq_printf(s, "\"");
12985+
12986+ for (; i < MAX_SERVICE_NAME_LEN-1; i++)
12987+ seq_printf(s, " ");
12988+
12989+ /*
12990+ * GID LID (sans level from top byte)
12991+ */
12992+
12993+ seq_printf(s, "%3u %3u ",
12994+ (ssi->sg->global_id & 0x00FFFFFF),
12995+ (ssi->sg->local_id & 0x00FFFFFF));
12996+
12997+ /*
12998+ * State
12999+ */
13000+
13001+ switch (ssi->sg->state) {
13002+ case SGST_NONE:
13003+ seq_printf(s, "none ");
13004+ break;
13005+ case SGST_JOIN:
13006+ seq_printf(s, "join ");
13007+ break;
13008+ case SGST_RUN:
13009+ seq_printf(s, "run ");
13010+ break;
13011+ case SGST_RECOVER:
13012+ seq_printf(s, "recover %u ",
13013+ ssi->sg->recover_state);
13014+ break;
13015+ case SGST_UEVENT:
13016+ seq_printf(s, "update ");
13017+ break;
13018+ }
c1c6733f 13019+
b7b72b66
AM
13020+ /*
13021+ * Code
13022+ */
13023+
13024+ if (test_bit(SGFL_SEVENT, &ssi->sg->flags))
13025+ seq_printf(s, "S");
13026+ if (test_bit(SGFL_UEVENT, &ssi->sg->flags))
13027+ seq_printf(s, "U");
13028+ if (test_bit(SGFL_NEED_RECOVERY, &ssi->sg->flags))
13029+ seq_printf(s, "N");
13030+
13031+ seq_printf(s, "-");
13032+
13033+ if (test_bit(SGFL_SEVENT, &ssi->sg->flags)
13034+ && ssi->sg->sevent) {
13035+ seq_printf(s, "%u,%lx,%u",
13036+ ssi->sg->sevent->se_state,
13037+ ssi->sg->sevent->se_flags,
13038+ ssi->sg->sevent->se_reply_count);
13039+ }
c1c6733f 13040+
b7b72b66
AM
13041+ if (test_bit(SGFL_UEVENT, &ssi->sg->flags)) {
13042+ seq_printf(s, "%u,%lx,%u",
13043+ ssi->sg->uevent.ue_state,
13044+ ssi->sg->uevent.ue_flags,
13045+ ssi->sg->uevent.ue_nodeid);
13046+ }
c1c6733f 13047+
b7b72b66 13048+ seq_printf(s, "\n");
c1c6733f 13049+
b7b72b66
AM
13050+ /*
13051+ * node list
13052+ */
c1c6733f 13053+
b7b72b66 13054+ i = 0;
c1c6733f 13055+
b7b72b66 13056+ seq_printf(s, "[");
c1c6733f 13057+
b7b72b66
AM
13058+ list_for_each_entry(node, &ssi->sg->memb, list) {
13059+ if (i && !(i % 24))
13060+ seq_printf(s, "\n");
c1c6733f 13061+
b7b72b66
AM
13062+ if (i)
13063+ seq_printf(s, " ");
13064+
13065+ seq_printf(s, "%u", node->id);
13066+ i++;
13067+ }
13068+
13069+ seq_printf(s, "]\n\n");
13070+
13071+ return 0;
13072+}
13073+
13074+static void sm_seq_stop(struct seq_file *m, void *p)
13075+{
13076+ kfree(p);
c1c6733f 13077+}
b7b72b66
AM
13078+
13079+
13080+static struct seq_operations sm_info_op = {
13081+ .start = sm_seq_start,
13082+ .next = sm_seq_next,
13083+ .stop = sm_seq_stop,
13084+ .show = sm_seq_show
13085+};
13086+
13087+
c1c6733f
AM
13088+#endif
13089diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
bb1d8b11
AM
13090--- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
13091+++ linux-patched/cluster/cman/sm_misc.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
13092@@ -0,0 +1,29 @@
13093+/******************************************************************************
13094+*******************************************************************************
13095+**
13096+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13097+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13098+**
13099+** This copyrighted material is made available to anyone wishing to use,
13100+** modify, copy, or redistribute it subject to the terms and conditions
13101+** of the GNU General Public License v.2.
13102+**
13103+*******************************************************************************
13104+******************************************************************************/
13105+
13106+#ifndef __SM_MISC_DOT_H__
13107+#define __SM_MISC_DOT_H__
13108+
13109+void init_sm_misc(void);
13110+sm_node_t *sm_new_node(uint32_t nodeid);
13111+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
13112+sm_node_t *sm_find_member(uint32_t nodeid);
13113+uint32_t sm_new_local_id(int level);
13114+int sm_id_to_level(uint32_t id);
13115+void sm_set_event_id(int *id);
13116+sm_group_t *sm_local_id_to_sg(int id);
13117+sm_group_t *sm_global_id_to_sg(int id);
13118+void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
13119+void sm_debug_setup(int size);
13120+
13121+#endif
13122diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
bb1d8b11
AM
13123--- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
13124+++ linux-patched/cluster/cman/sm_recover.c 2004-11-03 11:37:37.000000000 +0800
b7b72b66 13125@@ -0,0 +1,524 @@
c1c6733f
AM
13126+/******************************************************************************
13127+*******************************************************************************
13128+**
13129+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13130+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13131+**
13132+** This copyrighted material is made available to anyone wishing to use,
13133+** modify, copy, or redistribute it subject to the terms and conditions
13134+** of the GNU General Public License v.2.
13135+**
13136+*******************************************************************************
13137+******************************************************************************/
13138+
13139+#include "sm.h"
13140+#include "config.h"
13141+
13142+/*
13143+ * A collection of sg's which need to be recovered due to a failed member.
13144+ * These sg's are recovered in order of level. An sg subject to cascading
13145+ * failures is moved from one of these structs to a newer one.
13146+ */
13147+
13148+struct recover {
13149+ struct list_head list; /* list of current re's */
13150+ struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
13151+ int event_id; /* event id */
13152+ int cur_level;
13153+};
13154+typedef struct recover recover_t;
13155+
13156+
13157+extern uint32_t * sm_new_nodeids;
13158+extern int sm_quorum, sm_quorum_next;
13159+extern uint32_t sm_our_nodeid;
13160+extern struct list_head sm_members;
13161+extern int sm_member_count;
13162+static struct list_head recoveries;
13163+
13164+
13165+void init_recovery(void)
13166+{
13167+ INIT_LIST_HEAD(&recoveries);
13168+}
13169+
13170+/*
13171+ * This is the first thing called when a change is announced in cluster
13172+ * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds new
13173+ * nodes to its sm_members list which it's not seen before. Nodes which were
13174+ * alive but are now gone are marked as "need recovery".
13175+ *
13176+ * The "need recovery" status of nodes is propagated to the node's SG's in
13177+ * mark_effected_sgs. The effected SG's are themselves marked as needing
13178+ * recovery and in new_recovery the dead nodes are removed from the SG's
13179+ * individual member lists. The "need recovery" status of nodes is cleared in
13180+ * adjust_members_done().
13181+ */
13182+
13183+static int adjust_members(void)
13184+{
13185+ sm_node_t *node;
13186+ struct kcl_cluster_node knode;
13187+ int i, error, num_nodes, sub = 0, add = 0, found;
13188+
13189+ /*
13190+ * Get list of current members from cnxman
13191+ */
13192+
13193+ memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
13194+ num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
13195+
13196+ /*
13197+ * Determine who's gone
13198+ */
13199+
13200+ list_for_each_entry(node, &sm_members, list) {
13201+ found = FALSE;
13202+ for (i = 0; i < num_nodes; i++) {
13203+ if (node->id == sm_new_nodeids[i]) {
13204+ found = TRUE;
13205+ sm_new_nodeids[i] = 0;
13206+ break;
13207+ }
13208+ }
13209+
13210+ if (found) {
13211+ error = kcl_get_node_by_nodeid(node->id, &knode);
13212+ SM_ASSERT(!error, printk("error=%d\n", error););
13213+
13214+ if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
13215+ /* former member is back */
13216+ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
13217+ node->incarnation = knode.incarnation;
13218+ add++;
13219+ } else {
13220+ /* current member is still alive - if the
13221+ * incarnation number is different it died and
13222+ * returned between checks */
13223+ if (node->incarnation != knode.incarnation) {
13224+ set_bit(SNFL_NEED_RECOVERY,
13225+ &node->flags);
13226+ node->incarnation = knode.incarnation;
13227+ sub++;
13228+ }
13229+ }
13230+ } else {
13231+ /* current member has died */
13232+ if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
13233+ &node->flags)) {
13234+ set_bit(SNFL_NEED_RECOVERY, &node->flags);
13235+ sub++;
13236+ }
13237+ }
13238+ }
13239+
13240+ /*
13241+ * Look for new nodes
13242+ */
13243+
13244+ for (i = 0; i < num_nodes; i++) {
13245+ if (sm_new_nodeids[i]) {
13246+ node = sm_new_node(sm_new_nodeids[i]);
13247+ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
13248+ add++;
13249+ list_add_tail(&node->list, &sm_members);
13250+ sm_member_count++;
13251+ }
13252+ }
13253+
13254+ /*
13255+ * Get our own nodeid
13256+ */
13257+
13258+ if (!sm_our_nodeid) {
13259+ list_for_each_entry(node, &sm_members, list) {
13260+ error = kcl_get_node_by_nodeid(node->id, &knode);
13261+ SM_ASSERT(!error, printk("error=%d\n", error););
13262+
13263+ if (knode.us) {
13264+ sm_our_nodeid = knode.node_id;
13265+ break;
13266+ }
13267+ }
13268+ }
13269+
13270+ return sub;
13271+}
13272+
13273+/*
13274+ * Given some number of dead nodes, flag SG's the dead nodes were part of.
13275+ * This requires a number of loops because each node structure does not keep a
13276+ * list of SG's it's in.
13277+ */
13278+
13279+static int mark_effected_sgs(void)
13280+{
13281+ sm_group_t *sg;
13282+ sm_node_t *node, *sgnode;
13283+ uint32_t dead_id;
13284+ int i, effected = 0;
13285+
13286+ down(&sm_sglock);
13287+
13288+ list_for_each_entry(node, &sm_members, list) {
13289+ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
13290+ continue;
13291+
13292+ dead_id = node->id;
13293+
13294+ for (i = 0; i < SG_LEVELS; i++) {
13295+ list_for_each_entry(sg, &sm_sg[i], list) {
13296+ /* check if dead node is among sg's members */
13297+ list_for_each_entry(sgnode, &sg->memb, list) {
13298+ if (sgnode->id == dead_id) {
13299+ set_bit(SGFL_NEED_RECOVERY,
13300+ &sg->flags);
13301+ effected++;
13302+ break;
13303+ }
13304+ }
13305+ }
13306+ }
13307+ }
13308+ up(&sm_sglock);
13309+
13310+ return effected;
13311+}
13312+
13313+static recover_t *alloc_recover(void)
13314+{
13315+ recover_t *rev;
13316+ int i;
13317+
13318+ SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
13319+
13320+ memset(rev, 0, sizeof(recover_t));
13321+
13322+ sm_set_event_id(&rev->event_id);
13323+
13324+ for (i = 0; i < SG_LEVELS; i++) {
13325+ INIT_LIST_HEAD(&rev->sgs[i]);
13326+ }
13327+
13328+ return rev;
13329+}
13330+
13331+/*
13332+ * An in-progress revent re-start for an SG is interrupted by another node
13333+ * failure in the SG. Cancel an outstanding barrier if there is one. The SG
13334+ * will be moved to the new revent and re-started as part of that.
13335+ */
13336+
13337+static void cancel_prev_recovery(sm_group_t *sg)
13338+{
13339+ int error;
13340+
13341+ if (sg->recover_state == RECOVER_BARRIERWAIT) {
13342+ error = kcl_barrier_cancel(sg->recover_barrier);
13343+ if (error)
13344+ log_error(sg, "cancel_prev_recovery: error %d", error);
13345+ }
13346+}
13347+
13348+static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
13349+{
13350+ if (sg->state == SGST_RECOVER) {
13351+ cancel_prev_recovery(sg);
13352+ list_del(&sg->recover_list);
13353+ }
13354+
13355+ sg->ops->stop(sg->service_data);
13356+ sg->state = SGST_RECOVER;
13357+ sg->recover_state = RECOVER_NONE;
13358+ sg->recover_data = rev;
13359+ list_add(&sg->recover_list, &rev->sgs[sg->level]);
13360+}
13361+
13362+/*
13363+ * When adjust_members finds that some nodes are dead and mark_effected_sgs
13364+ * finds that some SG's are effected by departed nodes, this is called to
13365+ * collect together the SG's which need to be recovered. An revent (recovery
13366+ * event) is the group of effected SG's.
13367+ */
13368+
13369+static int new_recovery(void)
13370+{
13371+ sm_group_t *sg;
13372+ recover_t *rev;
13373+ sm_node_t *node, *sgnode, *safe;
13374+ int i;
13375+
13376+ rev = alloc_recover();
13377+ list_add_tail(&rev->list, &recoveries);
13378+
13379+ down(&sm_sglock);
13380+
13381+ /*
13382+ * Stop effected SG's and add them to the rev
13383+ */
13384+
13385+ for (i = 0; i < SG_LEVELS; i++) {
13386+ list_for_each_entry(sg, &sm_sg[i], list) {
13387+ if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
13388+ if (sg->state == SGST_JOIN)
13389+ continue;
13390+ pre_recover_sg(sg, rev);
13391+ }
13392+ }
13393+ }
13394+
13395+ /*
13396+ * For an SG needing recovery, remove dead nodes from sg->memb list
13397+ */
13398+
13399+ for (i = 0; i < SG_LEVELS; i++) {
13400+ list_for_each_entry(sg, &rev->sgs[i], recover_list) {
13401+
13402+ /* Remove dead members from SG's member list */
13403+ list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
13404+
13405+ node = sm_find_member(sgnode->id);
13406+ SM_ASSERT(node, printk("id %u\n", sgnode->id););
13407+
13408+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
13409+ list_del(&sgnode->list);
13410+ kfree(sgnode);
13411+ sg->memb_count--;
13412+ log_debug(sg, "remove node %u count %d",
13413+ sgnode->id, sg->memb_count);
13414+ }
13415+ }
13416+ }
13417+ }
13418+
13419+ up(&sm_sglock);
13420+ rev->cur_level = 0;
13421+ return 0;
13422+}
13423+
13424+/*
13425+ * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
13426+ * mark_effected_sgs() and add_revent(). After that, we're done using the bit
13427+ * and we clear it here.
13428+ */
13429+
13430+static void adjust_members_done(void)
13431+{
13432+ sm_node_t *node;
13433+
13434+ list_for_each_entry(node, &sm_members, list)
13435+ clear_bit(SNFL_NEED_RECOVERY, &node->flags);
13436+}
13437+
13438+/*
13439+ * Start the service of the given SG. The service must be given an array of
13440+ * nodeids specifying the new sg membership. The service is responsible to
13441+ * free this chunk of memory when done with it.
13442+ */
13443+
13444+static void start_sg(sm_group_t *sg, uint32_t event_id)
13445+{
13446+ sm_node_t *node;
13447+ uint32_t *memb;
13448+ int count = 0;
13449+
13450+ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
13451+ memb);
13452+
13453+ list_for_each_entry(node, &sg->memb, list)
13454+ memb[count++] = node->id;
13455+
13456+ sg->ops->start(sg->service_data, memb, count, event_id,
13457+ SERVICE_NODE_FAILED);
13458+}
13459+
13460+static void recovery_barrier(sm_group_t *sg)
13461+{
13462+ char bname[MAX_BARRIER_NAME_LEN];
13463+ int error, len;
13464+
13465+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
13466+
13467+ /* bypass the barrier if we're the only member */
13468+ if (sg->memb_count == 1) {
13469+ process_recovery_barrier(sg, 0);
13470+ return;
13471+ }
13472+
13473+ len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
13474+ sg->global_id, sg->recover_stop, sg->memb_count);
13475+
13476+ /* We save this barrier name so we can cancel it if needed. */
13477+ memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
13478+ memcpy(sg->recover_barrier, bname, len);
13479+
13480+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
13481+ if (error)
13482+ log_error(sg, "recovery_barrier error %d: %s", error, bname);
13483+}
13484+
13485+static void recover_sg(sm_group_t *sg, int event_id)
13486+{
13487+ log_debug(sg, "recover state %d", sg->recover_state);
13488+
13489+ switch (sg->recover_state) {
13490+
13491+ case RECOVER_NONE:
13492+ /* must wait for recovery to stop sg on all nodes */
13493+ sg->recover_state = RECOVER_BARRIERWAIT;
13494+ sg->recover_stop = 0;
13495+ recovery_barrier(sg);
13496+ break;
13497+
13498+ case RECOVER_BARRIERWAIT:
13499+ break;
13500+
13501+ case RECOVER_STOP:
13502+ /* barrier callback sets state STOP */
13503+ sg->recover_stop = 1;
13504+ sg->recover_state = RECOVER_START;
13505+ start_sg(sg, event_id);
13506+ break;
13507+
13508+ case RECOVER_START:
13509+ break;
13510+
13511+ case RECOVER_STARTDONE:
13512+ /* service callback sets state STARTDONE */
13513+ sg->recover_state = RECOVER_BARRIERWAIT;
13514+ recovery_barrier(sg);
13515+ break;
13516+
13517+ case RECOVER_BARRIERDONE:
13518+ /* barrier callback sets state BARRIERDONE */
13519+ sg->ops->finish(sg->service_data, event_id);
13520+ list_del(&sg->recover_list);
13521+ sg->recover_state = RECOVER_NONE;
13522+ sg->state = SGST_RUN;
13523+
13524+ /* Continue a previous, interrupted attempt to leave the sg */
13525+ if (sg->sevent) {
b7b72b66
AM
13526+ sm_sevent_t *sev = sg->sevent;
13527+ log_debug(sg, "restart leave %lx", sev->se_flags);
13528+ clear_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
13529+ set_bit(SEFL_CHECK, &sev->se_flags);
c1c6733f
AM
13530+ wake_serviced(DO_JOINLEAVE);
13531+ }
13532+ break;
13533+
13534+ default:
13535+ log_error(sg, "invalid recover_state %u", sg->recover_state);
13536+ }
13537+}
13538+
13539+static void recover_level(recover_t *rev, int level)
13540+{
13541+ sm_group_t *sg, *safe;
13542+
13543+ list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
13544+ recover_sg(sg, rev->event_id);
13545+}
13546+
13547+static void recover_levels(recover_t *rev)
13548+{
13549+ for (;;) {
13550+ recover_level(rev, rev->cur_level);
13551+
13552+ if (list_empty(&rev->sgs[rev->cur_level])) {
13553+ if (rev->cur_level == SG_LEVELS - 1) {
13554+ list_del(&rev->list);
13555+ kfree(rev);
13556+ return;
13557+ }
13558+ rev->cur_level++;
13559+ continue;
13560+ }
13561+ break;
13562+ }
13563+}
13564+
13565+/*
13566+ * Called by SM thread when the cluster is quorate. It restarts
13567+ * SG's that were stopped in new_recovery() due to a member death.
13568+ * It waits for all SG's at level N to complete restart before
13569+ * restarting SG's at level N+1.
13570+ */
13571+
13572+void process_recoveries(void)
13573+{
13574+ recover_t *rev, *safe;
13575+
13576+ down(&sm_sglock);
13577+ list_for_each_entry_safe(rev, safe, &recoveries, list)
13578+ recover_levels(rev);
13579+ up(&sm_sglock);
13580+}
13581+
13582+/*
13583+ * The cnxman membership has changed. Check if there's still quorum and
13584+ * whether any nodes have died. If nodes have died, initiate recovery on any
13585+ * SG's they were in. This begins immediately if the cluster remains quorate;
13586+ * if not this waits until the cluster regains quorum.
13587+ */
13588+
13589+void process_nodechange(void)
13590+{
13591+ int gone, effected;
13592+
13593+ if ((sm_quorum = sm_quorum_next))
13594+ wake_serviced(DO_RUN);
13595+
13596+ gone = adjust_members();
13597+ if (gone > 0) {
13598+ effected = mark_effected_sgs();
13599+
13600+ backout_sevents();
13601+ cancel_uevents(&effected);
13602+
13603+ if (effected > 0) {
13604+ new_recovery();
13605+ wake_serviced(DO_RECOVERIES);
13606+ }
13607+ }
13608+ adjust_members_done();
13609+}
13610+
13611+int check_recovery(sm_group_t *sg, int event_id)
13612+{
13613+ if (sg->state == SGST_RECOVER) {
13614+ recover_t *rev = (recover_t *) sg->recover_data;
13615+ if (rev && rev->event_id == event_id)
13616+ return 1;
13617+ }
13618+ return 0;
13619+}
13620+
13621+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
13622+{
13623+ sm_group_t *sg;
13624+ recover_t *rev;
13625+
13626+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
13627+ if (!sg) {
13628+ log_print("process_recover_msg: unknown sg id %x",
13629+ smsg->ms_global_sgid);
13630+ return;
13631+ }
13632+
13633+ /* we already know about the recovery and can ignore the msg */
13634+ if (sg->state == SGST_RECOVER)
13635+ return;
13636+
13637+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
13638+ /* we will initiate recovery on our own if we know about the
13639+ uevent so we can ignore this */
13640+ log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
13641+ return;
13642+ }
13643+
13644+ log_debug(sg, "recovery initiated by msg from %u", nodeid);
13645+ rev = alloc_recover();
13646+ list_add_tail(&rev->list, &recoveries);
13647+ pre_recover_sg(sg, rev);
13648+ wake_serviced(DO_RECOVERIES);
13649+}
13650diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
bb1d8b11
AM
13651--- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
13652+++ linux-patched/cluster/cman/sm_recover.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
13653@@ -0,0 +1,23 @@
13654+/******************************************************************************
13655+*******************************************************************************
13656+**
13657+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13658+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13659+**
13660+** This copyrighted material is made available to anyone wishing to use,
13661+** modify, copy, or redistribute it subject to the terms and conditions
13662+** of the GNU General Public License v.2.
13663+**
13664+*******************************************************************************
13665+******************************************************************************/
13666+
13667+#ifndef __SM_RECOVER_DOT_H__
13668+#define __SM_RECOVER_DOT_H__
13669+
13670+void init_recovery(void);
13671+void process_recoveries(void);
13672+void process_nodechange(void);
13673+int check_recovery(sm_group_t *sg, int event_id);
13674+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
13675+
13676+#endif
13677diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
bb1d8b11
AM
13678--- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
13679+++ linux-patched/cluster/cman/sm_services.c 2004-11-03 11:37:37.000000000 +0800
b7b72b66 13680@@ -0,0 +1,426 @@
c1c6733f
AM
13681+/******************************************************************************
13682+*******************************************************************************
13683+**
13684+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13685+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13686+**
13687+** This copyrighted material is made available to anyone wishing to use,
13688+** modify, copy, or redistribute it subject to the terms and conditions
13689+** of the GNU General Public License v.2.
13690+**
13691+*******************************************************************************
13692+******************************************************************************/
13693+
13694+#include "sm.h"
13695+
13696+static struct list_head callbacks;
13697+static spinlock_t callback_lock;
13698+static struct list_head sg_registered[SG_LEVELS];
13699+
13700+/*
13701+ * These are the functions to register, join, leave, unregister, callback
13702+ * with/to the sm.
13703+ */
13704+
13705+struct sc_entry {
13706+ struct list_head list;
13707+ uint32_t local_id;
13708+ int event_id;
13709+};
13710+typedef struct sc_entry sc_entry_t;
13711+
13712+void init_services(void)
13713+{
13714+ int i;
13715+
13716+ INIT_LIST_HEAD(&callbacks);
13717+ spin_lock_init(&callback_lock);
13718+
13719+ for (i = 0; i < SG_LEVELS; i++) {
13720+ INIT_LIST_HEAD(&sm_sg[i]);
13721+ INIT_LIST_HEAD(&sg_registered[i]);
13722+ }
13723+ init_MUTEX(&sm_sglock);
13724+}
13725+
13726+/* Context: service */
13727+
13728+int kcl_register_service(char *name, int namelen, int level,
13729+ struct kcl_service_ops *ops, int unique,
13730+ void *servicedata, uint32_t *service_id)
13731+{
13732+ sm_group_t *sg;
13733+ int found = FALSE;
13734+ int error = -EINVAL;
13735+
13736+ if (level > SG_LEVELS - 1)
13737+ goto fail;
13738+
13739+ if (namelen > MAX_SERVICE_NAME_LEN)
13740+ goto fail;
13741+
13742+ error = kcl_addref_cluster();
13743+ if (error)
13744+ goto fail;
13745+
13746+ down(&sm_sglock);
13747+
13748+ list_for_each_entry(sg, &sm_sg[level], list) {
13749+ if ((sg->namelen == namelen) &&
13750+ (!strncmp(sg->name, name, namelen))) {
13751+ found = TRUE;
13752+ goto next;
13753+ }
13754+ }
13755+
13756+ list_for_each_entry(sg, &sg_registered[level], list) {
13757+ if ((sg->namelen == namelen) &&
13758+ (!strncmp(sg->name, name, namelen))) {
13759+ found = TRUE;
13760+ goto next;
13761+ }
13762+ }
13763+
13764+ next:
13765+
13766+ if (found && unique) {
13767+ error = -EEXIST;
13768+ goto fail_unlock;
13769+ }
13770+
13771+ if (found) {
13772+ sg->refcount++;
13773+ goto out;
13774+ }
13775+
13776+ sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
13777+ if (!sg) {
13778+ error = -ENOMEM;
13779+ goto fail_unlock;
13780+ }
13781+ memset(sg, 0, sizeof(sm_group_t) + namelen);
13782+
13783+ sg->refcount = 1;
13784+ sg->service_data = servicedata;
13785+ sg->ops = ops;
13786+ sg->level = level;
13787+ sg->namelen = namelen;
13788+ memcpy(sg->name, name, namelen);
13789+ sg->local_id = sm_new_local_id(level);
13790+ sg->state = SGST_NONE;
13791+ INIT_LIST_HEAD(&sg->memb);
13792+ INIT_LIST_HEAD(&sg->joining);
13793+ init_completion(&sg->event_comp);
13794+
13795+ list_add_tail(&sg->list, &sg_registered[level]);
13796+
13797+ out:
13798+ *service_id = sg->local_id;
13799+ up(&sm_sglock);
13800+ return 0;
13801+
13802+ fail_unlock:
13803+ up(&sm_sglock);
13804+ kcl_releaseref_cluster();
13805+ fail:
13806+ return error;
13807+}
13808+
13809+/* Context: service */
13810+
13811+void kcl_unregister_service(uint32_t local_id)
13812+{
13813+ sm_group_t *sg;
13814+ int level = sm_id_to_level(local_id);
13815+
13816+ down(&sm_sglock);
13817+
13818+ list_for_each_entry(sg, &sg_registered[level], list) {
13819+ if (sg->local_id == local_id) {
13820+ SM_ASSERT(sg->refcount,);
13821+ sg->refcount--;
13822+
13823+ if (!sg->refcount) {
13824+ list_del(&sg->list);
13825+ kfree(sg);
13826+ }
13827+ kcl_releaseref_cluster();
13828+ break;
13829+ }
13830+ }
13831+ up(&sm_sglock);
13832+}
13833+
13834+/* Context: service */
13835+
13836+int kcl_join_service(uint32_t local_id)
13837+{
13838+ sm_group_t *sg;
13839+ sm_sevent_t *sev;
13840+ int level = sm_id_to_level(local_id);
13841+ int error, found = FALSE;
13842+
13843+ down(&sm_sglock);
13844+
13845+ list_for_each_entry(sg, &sg_registered[level], list) {
13846+ if (sg->local_id == local_id) {
13847+ found = TRUE;
13848+ break;
13849+ }
13850+ }
13851+
13852+ if (!found) {
13853+ up(&sm_sglock);
13854+ error = -ENOENT;
13855+ goto out;
13856+ }
13857+
13858+ if (sg->state != SGST_NONE) {
13859+ up(&sm_sglock);
13860+ error = -EINVAL;
13861+ goto out;
13862+ }
13863+
c1c6733f
AM
13864+ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13865+ if (!sev) {
b7b72b66 13866+ up(&sm_sglock);
c1c6733f
AM
13867+ error = -ENOMEM;
13868+ goto out;
13869+ }
13870+
13871+ memset(sev, 0, sizeof (sm_sevent_t));
13872+ sev->se_state = SEST_JOIN_BEGIN;
b7b72b66 13873+ sm_set_event_id(&sev->se_id);
c1c6733f
AM
13874+ sev->se_sg = sg;
13875+ sg->sevent = sev;
b7b72b66
AM
13876+ sg->state = SGST_JOIN;
13877+ set_bit(SGFL_SEVENT, &sg->flags);
13878+ list_del(&sg->list);
13879+ list_add_tail(&sg->list, &sm_sg[sg->level]);
13880+
13881+ up(&sm_sglock);
13882+
13883+ /*
13884+ * The join is a service event which will be processed asynchronously.
13885+ */
c1c6733f
AM
13886+
13887+ new_joinleave(sev);
13888+ wait_for_completion(&sg->event_comp);
13889+ error = 0;
13890+
13891+ out:
13892+ return error;
13893+}
13894+
13895+/* Context: service */
13896+
13897+int kcl_leave_service(uint32_t local_id)
13898+{
13899+ sm_group_t *sg = NULL;
13900+ sm_sevent_t *sev;
13901+ int error;
13902+
13903+ error = -ENOENT;
13904+ sg = sm_local_id_to_sg(local_id);
13905+ if (!sg)
13906+ goto out;
13907+
13908+ /* sg was never joined */
13909+ error = -EINVAL;
13910+ if (sg->state == SGST_NONE)
13911+ goto out;
13912+
b7b72b66
AM
13913+ down(&sm_sglock);
13914+
c1c6733f 13915+ /* may still be joining */
b7b72b66
AM
13916+ if (test_and_set_bit(SGFL_SEVENT, &sg->flags)) {
13917+ up(&sm_sglock);
13918+ error = -EBUSY;
c1c6733f 13919+ goto out;
b7b72b66 13920+ }
c1c6733f 13921+
c1c6733f 13922+ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
b7b72b66
AM
13923+ if (!sev) {
13924+ up(&sm_sglock);
13925+ error = -ENOMEM;
c1c6733f 13926+ goto out;
b7b72b66 13927+ }
c1c6733f
AM
13928+
13929+ memset(sev, 0, sizeof (sm_sevent_t));
13930+ sev->se_state = SEST_LEAVE_BEGIN;
b7b72b66 13931+ sm_set_event_id(&sev->se_id);
c1c6733f
AM
13932+ set_bit(SEFL_LEAVE, &sev->se_flags);
13933+ sev->se_sg = sg;
13934+ sg->sevent = sev;
b7b72b66
AM
13935+
13936+ up(&sm_sglock);
c1c6733f
AM
13937+
13938+ new_joinleave(sev);
13939+ wait_for_completion(&sg->event_comp);
13940+ error = 0;
13941+
13942+ down(&sm_sglock);
13943+ list_del(&sg->list);
13944+ list_add_tail(&sg->list, &sg_registered[sg->level]);
13945+ up(&sm_sglock);
13946+
13947+ out:
13948+ return error;
13949+}
13950+
13951+static void process_callback(uint32_t local_id, int event_id)
13952+{
13953+ sm_group_t *sg;
13954+ sm_sevent_t *sev;
13955+ sm_uevent_t *uev;
13956+
13957+ sg = sm_local_id_to_sg(local_id);
13958+ if (!sg)
13959+ return;
13960+
13961+ if (sg->state == SGST_RECOVER) {
13962+ if (!check_recovery(sg, event_id)) {
13963+ log_error(sg, "process_callback invalid recover "
13964+ "event id %d", event_id);
13965+ return;
13966+ }
13967+
13968+ if (sg->recover_state == RECOVER_START)
13969+ sg->recover_state = RECOVER_STARTDONE;
13970+ else
13971+ log_error(sg, "process_callback recover state %u",
13972+ sg->recover_state);
13973+ wake_serviced(DO_RECOVERIES);
13974+ }
13975+
13976+ else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
13977+ (sg->sevent->se_id == event_id)) {
13978+ sev = sg->sevent;
13979+
13980+ if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
13981+ (sev->se_state == SEST_JSTART_SERVICEWAIT))
13982+ sev->se_state = SEST_JSTART_SERVICEDONE;
13983+
13984+ set_bit(SEFL_CHECK, &sev->se_flags);
13985+ wake_serviced(DO_JOINLEAVE);
13986+ }
13987+
13988+ else if (test_bit(SGFL_UEVENT, &sg->flags) &&
13989+ (sg->uevent.ue_id == event_id)) {
13990+ uev = &sg->uevent;
13991+
13992+ if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
13993+ if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
13994+ uev->ue_state = UEST_JSTART_SERVICEDONE;
13995+ else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
13996+ uev->ue_state = UEST_LSTART_SERVICEDONE;
13997+ }
13998+ set_bit(UEFL_CHECK, &uev->ue_flags);
13999+ wake_serviced(DO_MEMBERSHIP);
14000+ }
14001+
14002+ else
14003+ log_error(sg, "ignoring service callback id=%x event=%u",
14004+ local_id, event_id);
14005+}
14006+
14007+void process_callbacks(void)
14008+{
14009+ sc_entry_t *se;
14010+
14011+ while (1) {
14012+ se = NULL;
14013+
14014+ spin_lock(&callback_lock);
14015+ if (!list_empty(&callbacks)) {
14016+ se = list_entry(callbacks.next, sc_entry_t, list);
14017+ list_del(&se->list);
14018+ }
14019+ spin_unlock(&callback_lock);
14020+
14021+ if (!se)
14022+ break;
14023+ process_callback(se->local_id, se->event_id);
14024+ kfree(se);
14025+ schedule();
14026+ }
14027+}
14028+
14029+/* Context: service */
14030+
14031+void kcl_start_done(uint32_t local_id, int event_id)
14032+{
14033+ sc_entry_t *se;
14034+
14035+ SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
14036+
14037+ se->local_id = local_id;
14038+ se->event_id = event_id;
14039+
14040+ spin_lock(&callback_lock);
14041+ list_add_tail(&se->list, &callbacks);
14042+ spin_unlock(&callback_lock);
14043+
14044+ wake_serviced(DO_CALLBACKS);
14045+}
14046+
14047+/* Context: service */
14048+
14049+void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
14050+{
14051+ sm_group_t *sg = sm_local_id_to_sg(local_id);
14052+
14053+ if (!sg)
14054+ log_print("kcl_global_service_id: can't find %x", local_id);
14055+ else
14056+ *global_id = sg->global_id;
14057+}
14058+
14059+static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
14060+{
14061+ s->level = sg->level;
14062+ s->local_id = sg->local_id;
14063+ s->global_id = sg->global_id;
14064+ s->node_count = sg->memb_count;
14065+ strcpy(s->name, sg->name);
14066+}
14067+
14068+int kcl_get_services(struct list_head *head, int level)
14069+{
14070+ sm_group_t *sg;
14071+ struct kcl_service *s;
14072+ int error = -ENOMEM, count = 0;
14073+
14074+ down(&sm_sglock);
14075+
14076+ list_for_each_entry(sg, &sg_registered[level], list) {
14077+ if (head) {
14078+ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
14079+ if (!s)
14080+ goto out;
14081+ copy_to_service(sg, s);
14082+ list_add(&s->list, head);
14083+ }
14084+ count++;
14085+ }
14086+
14087+ list_for_each_entry(sg, &sm_sg[level], list) {
14088+ if (head) {
14089+ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
14090+ if (!s)
14091+ goto out;
14092+ copy_to_service(sg, s);
14093+ list_add(&s->list, head);
14094+ }
14095+ count++;
14096+ }
14097+
14098+ error = count;
14099+ out:
14100+ up(&sm_sglock);
14101+ return error;
14102+}
14103+
14104+/* These three global variables listed in extern form in sm.h. */
14105+struct list_head sm_sg[SG_LEVELS];
14106+struct semaphore sm_sglock;
14107diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
bb1d8b11
AM
14108--- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
14109+++ linux-patched/cluster/cman/sm_services.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
14110@@ -0,0 +1,20 @@
14111+/******************************************************************************
14112+*******************************************************************************
14113+**
14114+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14115+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14116+**
14117+** This copyrighted material is made available to anyone wishing to use,
14118+** modify, copy, or redistribute it subject to the terms and conditions
14119+** of the GNU General Public License v.2.
14120+**
14121+*******************************************************************************
14122+******************************************************************************/
14123+
14124+#ifndef __SM_SERVICES_DOT_H__
14125+#define __SM_SERVICES_DOT_H__
14126+
14127+void init_services(void);
14128+void process_callbacks(void);
14129+
14130+#endif
14131diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
bb1d8b11
AM
14132--- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
14133+++ linux-patched/cluster/cman/sm_user.c 2004-11-03 11:37:37.000000000 +0800
5a2052f6 14134@@ -0,0 +1,569 @@
c1c6733f
AM
14135+/******************************************************************************
14136+*******************************************************************************
14137+**
14138+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14139+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14140+**
14141+** This copyrighted material is made available to anyone wishing to use,
14142+** modify, copy, or redistribute it subject to the terms and conditions
14143+** of the GNU General Public License v.2.
14144+**
14145+*******************************************************************************
14146+******************************************************************************/
14147+
14148+#include "sm.h"
14149+#include "cnxman-private.h"
14150+
14151+void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
14152+
14153+#define UST_REGISTER 1
14154+#define UST_UNREGISTER 2
14155+#define UST_JOIN 3
14156+#define UST_LEAVE 4
14157+#define UST_JOINED 5
14158+
14159+struct event {
14160+ struct list_head list;
14161+ service_event_t type;
14162+ service_start_t start_type;
14163+ unsigned int event_id;
14164+ unsigned int last_stop;
14165+ unsigned int last_start;
14166+ unsigned int last_finish;
14167+ unsigned int node_count;
14168+ uint32_t * nodeids;
14169+};
14170+typedef struct event event_t;
14171+
14172+struct user_service {
14173+ uint32_t local_id;
14174+ pid_t pid;
14175+ int signal;
14176+ struct socket * sock;
14177+ uint8_t state;
14178+ uint8_t async;
14179+ struct semaphore lock;
14180+ struct list_head events;
14181+ spinlock_t event_lock;
14182+ unsigned int last_stop;
14183+ unsigned int last_start;
14184+ unsigned int last_finish;
14185+ unsigned int need_startdone;
14186+ unsigned int node_count;
14187+ uint32_t * nodeids;
14188+ int name_len;
14189+ char name[MAX_SERVICE_NAME_LEN];
14190+};
14191+typedef struct user_service user_service_t;
14192+
14193+
14194+static void add_event(user_service_t *us, event_t *ev)
14195+{
14196+ spin_lock(&us->event_lock);
14197+ list_add_tail(&ev->list, &us->events);
14198+
14199+ switch(ev->type) {
14200+ case SERVICE_EVENT_STOP:
14201+ us->last_stop = us->last_start;
14202+ break;
14203+ case SERVICE_EVENT_START:
14204+ us->last_start = ev->event_id;
14205+ break;
14206+ case SERVICE_EVENT_FINISH:
14207+ us->last_finish = ev->event_id;
14208+ break;
14209+ case SERVICE_EVENT_LEAVEDONE:
14210+ break;
14211+ }
14212+ spin_unlock(&us->event_lock);
14213+}
14214+
14215+static event_t *get_event(user_service_t *us)
14216+{
14217+ event_t *ev = NULL;
14218+
14219+ spin_lock(&us->event_lock);
14220+ if (!list_empty(&us->events)) {
14221+ ev = list_entry(us->events.next, event_t, list);
14222+ ev->last_stop = us->last_stop;
14223+ ev->last_start = us->last_start;
14224+ ev->last_finish = us->last_finish;
14225+ }
14226+ spin_unlock(&us->event_lock);
14227+ return ev;
14228+}
14229+
14230+static void del_event(user_service_t *us, event_t *ev)
14231+{
14232+ spin_lock(&us->event_lock);
14233+ list_del(&ev->list);
14234+ spin_unlock(&us->event_lock);
14235+}
14236+
14237+static event_t *alloc_event(void)
14238+{
14239+ event_t *ev;
14240+ SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
14241+ memset(ev, 0, sizeof(event_t));
14242+ return ev;
14243+}
14244+
14245+/* us->lock must be held before calling */
14246+static void user_notify(user_service_t *us)
14247+{
14248+ if (us->sock)
14249+ queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
14250+ if (us->pid && us->signal)
14251+ kill_proc(us->pid, us->signal, 0);
14252+}
14253+
14254+static service_start_t start_type(int type)
14255+{
14256+ switch (type) {
14257+ case SERVICE_NODE_FAILED:
14258+ return SERVICE_START_FAILED;
14259+ case SERVICE_NODE_JOIN:
14260+ return SERVICE_START_JOIN;
14261+ case SERVICE_NODE_LEAVE:
14262+ return SERVICE_START_LEAVE;
14263+ }
14264+ return 0;
14265+}
14266+
14267+static int user_stop(void *servicedata)
14268+{
14269+ user_service_t *us = (user_service_t *) servicedata;
14270+ event_t *ev;
14271+
14272+ down(&us->lock);
14273+ if (!us->sock)
14274+ goto out;
14275+
14276+ ev = alloc_event();
14277+ ev->type = SERVICE_EVENT_STOP;
14278+
14279+ add_event(us, ev);
14280+ user_notify(us);
14281+ out:
14282+ up(&us->lock);
14283+ return 0;
14284+}
14285+
14286+static int user_start(void *servicedata, uint32_t *nodeids, int count,
14287+ int event_id, int type)
14288+{
14289+ user_service_t *us = (user_service_t *) servicedata;
14290+ event_t *ev;
14291+
14292+ down(&us->lock);
14293+ if (!us->sock) {
14294+ kcl_start_done(us->local_id, event_id);
14295+ goto out;
14296+ }
14297+
14298+ us->need_startdone = event_id;
14299+
14300+ ev = alloc_event();
14301+ ev->type = SERVICE_EVENT_START;
14302+ ev->node_count = count;
14303+ ev->start_type = start_type(type);
14304+ ev->event_id = event_id;
14305+ ev->nodeids = nodeids;
14306+
14307+ add_event(us, ev);
14308+ user_notify(us);
14309+ out:
14310+ up(&us->lock);
14311+ return 0;
14312+}
14313+
14314+static void user_finish(void *servicedata, int event_id)
14315+{
14316+ user_service_t *us = (user_service_t *) servicedata;
14317+ event_t *ev;
14318+
14319+ down(&us->lock);
14320+ if (!us->sock)
14321+ goto out;
14322+
14323+ ev = alloc_event();
14324+ ev->type = SERVICE_EVENT_FINISH;
14325+ ev->event_id = event_id;
14326+
14327+ add_event(us, ev);
14328+ user_notify(us);
14329+ out:
14330+ up(&us->lock);
14331+}
14332+
14333+struct kcl_service_ops user_service_ops = {
14334+ .stop = user_stop,
14335+ .start = user_start,
14336+ .finish = user_finish
14337+};
14338+
5a2052f6 14339+static int user_register(char *u_name, user_service_t **us_data)
c1c6733f
AM
14340+{
14341+ user_service_t *us;
5a2052f6
AM
14342+ char name[MAX_SERVICE_NAME_LEN+1];
14343+ int len, error;
14344+
14345+ memset(name, 0, MAX_SERVICE_NAME_LEN+1);
14346+
14347+ if (copy_from_user(&name, u_name, MAX_SERVICE_NAME_LEN))
14348+ return -EFAULT;
c1c6733f 14349+
5a2052f6
AM
14350+ len = strlen(name);
14351+ if (len > MAX_SERVICE_NAME_LEN)
c1c6733f
AM
14352+ return -ENAMETOOLONG;
14353+ if (!len)
14354+ return -EINVAL;
14355+
14356+ us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
14357+ if (!us)
14358+ return -ENOMEM;
14359+ memset(us, 0, sizeof(user_service_t));
14360+ us->nodeids = NULL;
14361+ INIT_LIST_HEAD(&us->events);
14362+ spin_lock_init(&us->event_lock);
14363+ init_MUTEX(&us->lock);
14364+ us->name_len = len;
14365+ memcpy(us->name, name, len);
14366+
14367+ error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
14368+ &user_service_ops, TRUE, (void *) us,
14369+ &us->local_id);
14370+ if (error) {
14371+ kfree(us);
14372+ us = NULL;
14373+ }
14374+ *us_data = us;
14375+ return error;
14376+}
14377+
14378+static void user_unregister(user_service_t *us)
14379+{
14380+ event_t *ev;
14381+
14382+ kcl_unregister_service(us->local_id);
14383+
14384+ if (us->nodeids)
14385+ kfree(us->nodeids);
14386+
14387+ while ((ev = get_event(us))) {
14388+ del_event(us, ev);
14389+ if (ev->nodeids)
14390+ kfree(ev->nodeids);
14391+ kfree(ev);
14392+ }
14393+}
14394+
14395+static int user_join_async(void *arg)
14396+{
14397+ user_service_t *us = arg;
14398+ int user_gone = 0;
14399+
14400+ daemonize("cman_userjoin");
14401+
14402+ kcl_join_service(us->local_id);
14403+
14404+ down(&us->lock);
14405+ us->state = UST_JOINED;
14406+ us->async = 0;
14407+ if (!us->sock) {
14408+ if (us->need_startdone)
14409+ kcl_start_done(us->local_id, us->need_startdone);
14410+ user_gone = 1;
14411+ }
14412+ up(&us->lock);
14413+
14414+ if (user_gone) {
14415+ kcl_leave_service(us->local_id);
14416+ user_unregister(us);
14417+ kfree(us);
14418+ }
14419+ return 0;
14420+}
14421+
14422+static int user_leave_async(void *arg)
14423+{
14424+ user_service_t *us = arg;
14425+
14426+ daemonize("cman_userleave");
14427+
14428+ kcl_leave_service(us->local_id);
14429+
14430+ down(&us->lock);
14431+ us->async = 0;
14432+ if (!us->sock) {
14433+ user_unregister(us);
14434+ kfree(us);
14435+ } else {
14436+ event_t *ev = alloc_event();
14437+ ev->type = SERVICE_EVENT_LEAVEDONE;
14438+ add_event(us, ev);
14439+ user_notify(us);
14440+ up(&us->lock);
14441+ }
14442+
14443+ return 0;
14444+}
14445+
14446+static int user_join(user_service_t *us, int wait)
14447+{
14448+ int error = 0;
14449+
14450+ if (wait) {
14451+ error = kcl_join_service(us->local_id);
14452+ us->state = UST_JOINED;
14453+ }
14454+ else {
14455+ us->async = 1;
14456+ kernel_thread(user_join_async, us, 0);
14457+ }
14458+
14459+ return error;
14460+}
14461+
14462+static void user_leave(user_service_t *us, int wait)
14463+{
14464+ if (wait)
14465+ kcl_leave_service(us->local_id);
14466+ else {
14467+ us->async = 1;
14468+ kernel_thread(user_leave_async, us, 0);
14469+ }
14470+}
14471+
14472+static int user_start_done(user_service_t *us, unsigned int event_id)
14473+{
14474+ if (!us->need_startdone)
14475+ return -EINVAL;
14476+ if (us->need_startdone == event_id)
14477+ us->need_startdone = 0;
14478+ kcl_start_done(us->local_id, event_id);
14479+ return 0;
14480+}
14481+
14482+static void user_set_signal(user_service_t *us, int signal)
14483+{
14484+ us->pid = current->pid;
14485+ us->signal = signal;
14486+}
14487+
14488+static int user_get_event(user_service_t *us,
14489+ struct cl_service_event *user_event)
14490+{
14491+ event_t *ev;
14492+ struct cl_service_event event;
14493+
14494+ ev = get_event(us);
14495+ if (!ev)
14496+ return 0;
14497+
14498+ event.type = ev->type;
14499+ event.start_type = ev->start_type;
14500+ event.event_id = ev->event_id;
14501+ event.last_stop = ev->last_stop;
14502+ event.last_start = ev->last_start;
14503+ event.last_finish = ev->last_finish;
14504+ event.node_count = ev->node_count;
14505+
14506+ if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
14507+ return -EFAULT;
14508+
14509+ del_event(us, ev);
14510+
14511+ if (ev->type == SERVICE_EVENT_START) {
14512+ if (us->nodeids)
14513+ kfree(us->nodeids);
14514+ us->nodeids = ev->nodeids;
14515+ us->node_count = ev->node_count;
14516+ }
14517+
14518+ kfree(ev);
14519+ return 1;
14520+}
14521+
14522+static int user_get_members(user_service_t *us,
14523+ struct cl_cluster_nodelist *u_nodelist)
14524+{
14525+ struct cl_cluster_nodelist user_nodelist;
14526+ struct cl_cluster_node user_node, *u_node;
14527+ struct cluster_node *node;
14528+ unsigned int i;
14529+ int num_nodes = 0;
14530+
14531+ if (!u_nodelist)
14532+ return us->node_count;
14533+
14534+ if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
14535+ sizeof(struct cl_cluster_nodelist)))
14536+ return -EFAULT;
14537+
14538+ if (user_nodelist.max_members < us->node_count)
14539+ return -E2BIG;
14540+
14541+ u_node = user_nodelist.nodes;
14542+
14543+ for (i = 0; i < us->node_count; i++) {
14544+ node = find_node_by_nodeid(us->nodeids[i]);
14545+ if (!node)
14546+ continue;
14547+
14548+ copy_to_usernode(node, &user_node);
14549+ if (copy_to_user(u_node, &user_node,
14550+ sizeof(struct cl_cluster_node)))
14551+ return -EFAULT;
14552+
14553+ u_node++;
14554+ num_nodes++;
14555+ }
14556+ return num_nodes;
14557+}
14558+
14559+static int user_global_id(user_service_t *us, uint32_t *id)
14560+{
14561+ uint32_t gid = 0;
14562+
14563+ if (us->state != UST_JOINED)
14564+ return -EINVAL;
14565+
14566+ kcl_global_service_id(us->local_id, &gid);
14567+
14568+ if (copy_to_user(id, &gid, sizeof(uint32_t)))
14569+ return -EFAULT;
14570+ return 0;
14571+}
14572+
14573+static int user_set_level(user_service_t *us, int level)
14574+{
14575+ int prev_id = us->local_id;
14576+ int error;
14577+
14578+ if (us->state != UST_REGISTER)
14579+ return -EINVAL;
14580+
14581+ error = kcl_register_service(us->name, us->name_len, level,
14582+ &user_service_ops, TRUE, (void *) us,
14583+ &us->local_id);
14584+ if (error)
14585+ return error;
14586+
14587+ kcl_unregister_service(prev_id);
14588+ return 0;
14589+}
14590+
14591+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
14592+{
14593+ struct cluster_sock *c = cluster_sk(sock->sk);
14594+ user_service_t *us = c->service_data;
14595+ int error = 0;
14596+
14597+ if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
14598+ return -EINVAL;
14599+
14600+ switch (cmd) {
14601+ case SIOCCLUSTER_SERVICE_REGISTER:
14602+ error = user_register((char *) arg, &us);
14603+ if (!error) {
14604+ us->state = UST_REGISTER;
14605+ us->sock = sock;
14606+ c->service_data = us;
14607+ }
14608+ break;
14609+
14610+ case SIOCCLUSTER_SERVICE_UNREGISTER:
14611+ down(&us->lock);
14612+ us->state = UST_UNREGISTER;
14613+ user_unregister(us);
14614+ up(&us->lock);
14615+ break;
14616+
14617+ case SIOCCLUSTER_SERVICE_JOIN:
14618+ us->state = UST_JOIN;
14619+ user_join(us, 0);
14620+ break;
14621+
14622+ case SIOCCLUSTER_SERVICE_LEAVE:
14623+ down(&us->lock);
14624+ if (us->state != UST_JOINED) {
14625+ error = -EBUSY;
14626+ up(&us->lock);
14627+ } else {
14628+ us->state = UST_LEAVE;
14629+ up(&us->lock);
14630+ user_leave(us, 0);
14631+ }
14632+ break;
14633+
14634+ case SIOCCLUSTER_SERVICE_SETSIGNAL:
14635+ user_set_signal(us, (int) arg);
14636+ break;
14637+
14638+ case SIOCCLUSTER_SERVICE_STARTDONE:
14639+ error = user_start_done(us, (unsigned int) arg);
14640+ break;
14641+
14642+ case SIOCCLUSTER_SERVICE_GETEVENT:
14643+ error = user_get_event(us, (struct cl_service_event *) arg);
14644+ break;
14645+
14646+ case SIOCCLUSTER_SERVICE_GETMEMBERS:
14647+ error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
14648+ break;
14649+
14650+ case SIOCCLUSTER_SERVICE_GLOBALID:
14651+ error = user_global_id(us, (uint32_t *) arg);
14652+ break;
14653+
14654+ case SIOCCLUSTER_SERVICE_SETLEVEL:
14655+ error = user_set_level(us, (int) arg);
14656+ break;
14657+
14658+ default:
14659+ error = -EINVAL;
14660+ }
14661+
14662+ return error;
14663+}
14664+
14665+void sm_sock_release(struct socket *sock)
14666+{
14667+ struct cluster_sock *c = cluster_sk(sock->sk);
14668+ user_service_t *us = c->service_data;
14669+ int state;
14670+
14671+ if (!us)
14672+ return;
14673+
14674+ down(&us->lock);
14675+ us->sock = NULL;
14676+ c->service_data = NULL;
14677+
14678+ if (us->need_startdone)
14679+ kcl_start_done(us->local_id, us->need_startdone);
14680+
14681+ if (us->async) {
14682+ /* async thread will clean up before exiting */
14683+ up(&us->lock);
14684+ return;
14685+ }
14686+ state = us->state;
14687+ up(&us->lock);
14688+
14689+ switch (state) {
14690+ case UST_JOIN:
14691+ break;
14692+ case UST_JOINED:
14693+ user_leave(us, 1);
14694+ /* fall through */
14695+ case UST_LEAVE:
14696+ case UST_REGISTER:
14697+ user_unregister(us);
14698+ /* fall through */
14699+ case UST_UNREGISTER:
14700+ kfree(us);
14701+ break;
14702+ }
14703+}
14704diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
bb1d8b11
AM
14705--- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
14706+++ linux-patched/cluster/cman/sm_user.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
14707@@ -0,0 +1,21 @@
14708+/******************************************************************************
14709+*******************************************************************************
14710+**
14711+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14712+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14713+**
14714+** This copyrighted material is made available to anyone wishing to use,
14715+** modify, copy, or redistribute it subject to the terms and conditions
14716+** of the GNU General Public License v.2.
14717+**
14718+*******************************************************************************
14719+******************************************************************************/
14720+
c783755a
AM
14721+#ifndef __SM_USER_DOT_H__
14722+#define __SM_USER_DOT_H__
14723+
14724+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
14725+void sm_sock_release(struct socket *sock);
14726+void sm_sock_bind(struct socket *sock);
14727+
14728+#endif
c1c6733f 14729diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
bb1d8b11
AM
14730--- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
14731+++ linux-patched/include/cluster/cnxman-socket.h 2004-11-03 11:37:37.000000000 +0800
14732@@ -0,0 +1,233 @@
c1c6733f
AM
14733+/******************************************************************************
14734+*******************************************************************************
14735+**
14736+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14737+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14738+**
14739+** This copyrighted material is made available to anyone wishing to use,
14740+** modify, copy, or redistribute it subject to the terms and conditions
14741+** of the GNU General Public License v.2.
14742+**
14743+*******************************************************************************
14744+******************************************************************************/
14745+
14746+/* CMAN socket interface header,
14747+ may be include by user or kernel code */
14748+
14749+#ifndef __CNXMAN_SOCKET_H
14750+#define __CNXMAN_SOCKET_H
14751+
b7b72b66
AM
14752+/* A currently unused number. TIPC also uses this number and you're unlikely
14753+ to be using both.
14754+ */
14755+#define AF_CLUSTER 30
c1c6733f
AM
14756+#define PF_CLUSTER AF_CLUSTER
14757+
14758+/* Protocol(socket) types */
14759+#define CLPROTO_MASTER 2
14760+#define CLPROTO_CLIENT 3
14761+
c1c6733f
AM
14762+/* ioctls -- should register these properly */
14763+#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
14764+#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
14765+#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
14766+#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
14767+#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
14768+#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
14769+#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
14770+#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
14771+#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
14772+#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
14773+#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
14774+#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
14775+#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
14776+#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
14777+#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
14778+#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
14779+#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
14780+#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
14781+#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
14782+#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
14783+#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
14784+#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
14785+#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
14786+#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
bb1d8b11 14787+#define SIOCCLUSTER_GETCLUSTER _IOWR('x', 0x91, struct cl_cluster_info)
c1c6733f
AM
14788+#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
14789+
b7b72b66
AM
14790+/* These were setsockopts */
14791+#define SIOCCLUSTER_PASS_SOCKET _IOW('x', 0x0b0, struct cl_passed_sock)
14792+#define SIOCCLUSTER_SET_NODENAME _IOW('x', 0x0b1, char *)
14793+#define SIOCCLUSTER_SET_NODEID _IOW('x', 0x0b2, int)
14794+#define SIOCCLUSTER_JOIN_CLUSTER _IOW('x', 0x0b3, struct cl_join_cluster_info)
14795+#define SIOCCLUSTER_LEAVE_CLUSTER _IOW('x', 0x0b4, int)
14796+
14797+
c1c6733f
AM
14798+/* Maximum size of a cluster message */
14799+#define MAX_CLUSTER_MESSAGE 1500
14800+#define MAX_CLUSTER_MEMBER_NAME_LEN 255
14801+#define MAX_BARRIER_NAME_LEN 33
14802+#define MAX_SA_ADDR_LEN 12
14803+#define MAX_CLUSTER_NAME_LEN 16
14804+
14805+/* Well-known cluster port numbers */
14806+#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
14807+ * transitions! */
14808+#define CLUSTER_PORT_SERVICES 2
14809+#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
14810+#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
14811+#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
14812+
14813+/* Port numbers above this will be blocked when the cluster is inquorate or in
14814+ * transition */
14815+#define HIGH_PROTECTED_PORT 9
14816+
14817+/* Reasons for leaving the cluster */
14818+#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
14819+#define CLUSTER_LEAVEFLAG_KILLED 1
14820+#define CLUSTER_LEAVEFLAG_PANIC 2
14821+#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
14822+#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
14823+ * first place */
14824+#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
14825+ * in a minority */
14826+#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
14827+#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
14828+
14829+/* OOB messages sent to a local socket */
14830+#define CLUSTER_OOB_MSG_PORTCLOSED 1
14831+#define CLUSTER_OOB_MSG_STATECHANGE 2
14832+#define CLUSTER_OOB_MSG_SERVICEEVENT 3
14833+
14834+/* Sendmsg flags, these are above the normal sendmsg flags so they don't
14835+ * interfere */
14836+#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
14837+#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
14838+#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
14839+ */
14840+#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
b7b72b66 14841+#define MSG_REPLYEXP 0x200000 /* Reply is expected */
c783755a 14842+#define MSG_BCASTSELF 0x400000 /* Broadcast message also gets send to us */
c1c6733f 14843+
c783755a 14844+typedef enum { NODESTATE_JOINING=1, NODESTATE_MEMBER,
b7b72b66 14845+ NODESTATE_DEAD } nodestate_t;
c1c6733f
AM
14846+
14847+
14848+struct sockaddr_cl {
14849+ unsigned short scl_family;
14850+ unsigned char scl_flags;
14851+ unsigned char scl_port;
14852+ int scl_nodeid;
14853+};
14854+
b7b72b66
AM
14855+/*
14856+ * This is how we pass the multicast & receive sockets into kernel space.
14857+ */
14858+struct cl_passed_sock {
c1c6733f
AM
14859+ int fd; /* FD of master socket to do multicast on */
14860+ int number; /* Socket number, to match up recvonly & bcast
14861+ * sockets */
b7b72b66 14862+ int multicast; /* Is it multicast or receive ? */
c1c6733f
AM
14863+};
14864+
14865+/* Cluster configuration info passed when we join the cluster */
14866+struct cl_join_cluster_info {
14867+ unsigned char votes;
14868+ unsigned int expected_votes;
14869+ unsigned int two_node;
14870+ unsigned int config_version;
14871+
14872+ char cluster_name[17];
14873+};
14874+
14875+
14876+/* This is the structure, per node, returned from the membership ioctl */
14877+struct cl_cluster_node {
14878+ unsigned int size;
14879+ unsigned int node_id;
14880+ unsigned int us;
14881+ unsigned int leave_reason;
14882+ unsigned int incarnation;
14883+ nodestate_t state;
14884+ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14885+ unsigned char votes;
14886+};
14887+
14888+/* The struct passed to the membership ioctls */
14889+struct cl_cluster_nodelist {
14890+ uint32_t max_members;
14891+ struct cl_cluster_node *nodes;
14892+};
14893+
14894+/* Structure passed to SIOCCLUSTER_ISLISTENING */
14895+struct cl_listen_request {
14896+ unsigned char port;
14897+ int nodeid;
14898+};
14899+
14900+/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
14901+struct cl_portclosed_oob {
14902+ unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
14903+ unsigned char port;
14904+};
14905+
14906+/* Get all version numbers or set the config version */
14907+struct cl_version {
14908+ unsigned int major;
14909+ unsigned int minor;
14910+ unsigned int patch;
14911+ unsigned int config;
14912+};
14913+
14914+/* structure passed to barrier ioctls */
14915+struct cl_barrier_info {
14916+ char cmd;
14917+ char name[MAX_BARRIER_NAME_LEN];
14918+ unsigned int flags;
14919+ unsigned long arg;
14920+};
14921+
bb1d8b11
AM
14922+struct cl_cluster_info {
14923+ char name[MAX_CLUSTER_NAME_LEN+1];
14924+ uint16_t number;
14925+};
14926+
c1c6733f
AM
14927+typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
14928+ SERVICE_EVENT_LEAVEDONE } service_event_t;
14929+
14930+typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
14931+ service_start_t;
14932+
14933+struct cl_service_event {
14934+ service_event_t type;
14935+ service_start_t start_type;
14936+ unsigned int event_id;
14937+ unsigned int last_stop;
14938+ unsigned int last_start;
14939+ unsigned int last_finish;
14940+ unsigned int node_count;
14941+};
14942+
14943+
14944+/* Commands to the barrier ioctl */
14945+#define BARRIER_IOCTL_REGISTER 1
14946+#define BARRIER_IOCTL_CHANGE 2
14947+#define BARRIER_IOCTL_DELETE 3
14948+#define BARRIER_IOCTL_WAIT 4
14949+
bb1d8b11
AM
14950+/* Attributes of a barrier - bitmask */
14951+#define BARRIER_ATTR_AUTODELETE 1
14952+#define BARRIER_ATTR_MULTISTEP 2
14953+#define BARRIER_ATTR_MANUAL 4
14954+#define BARRIER_ATTR_ENABLED 8
14955+#define BARRIER_ATTR_CALLBACK 16
14956+
14957+/* Attribute setting commands */
14958+#define BARRIER_SETATTR_AUTODELETE 1
14959+#define BARRIER_SETATTR_MULTISTEP 2
14960+#define BARRIER_SETATTR_ENABLED 3
14961+#define BARRIER_SETATTR_NODES 4
14962+#define BARRIER_SETATTR_CALLBACK 5
14963+#define BARRIER_SETATTR_TIMEOUT 6
14964+
14965+#endif
14966diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
14967--- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
14968+++ linux-patched/include/cluster/cnxman.h 2004-11-03 11:37:37.000000000 +0800
14969@@ -0,0 +1,87 @@
14970+/******************************************************************************
14971+*******************************************************************************
14972+**
14973+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14974+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14975+**
14976+** This copyrighted material is made available to anyone wishing to use,
14977+** modify, copy, or redistribute it subject to the terms and conditions
14978+** of the GNU General Public License v.2.
14979+**
14980+*******************************************************************************
14981+******************************************************************************/
14982+
14983+#ifndef __CNXMAN_H
14984+#define __CNXMAN_H
14985+
14986+#include "linux/in6.h"
14987+#include "cluster/cnxman-socket.h"
14988+
14989+/* In-kernel API */
14990+
14991+/* This is the structure, per node, returned from the membership request */
14992+struct kcl_cluster_node {
14993+ unsigned int size;
14994+ unsigned int node_id;
14995+ unsigned int us;
14996+ unsigned int leave_reason;
14997+ unsigned int incarnation;
14998+ nodestate_t state;
14999+ struct list_head list;
15000+ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
15001+ unsigned char votes;
15002+};
15003+
15004+struct cluster_node_addr {
15005+ struct list_head list;
15006+ unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
15007+ int addr_len;
15008+};
15009+
15010+
15011+/* Reasons for a kernel membership callback */
15012+typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
15013+
15014+/* Kernel version of above, the void *sock is a struct socket */
15015+struct kcl_multicast_sock {
15016+ void *sock;
15017+ int number; /* Socket number, to match up recvonly & bcast
15018+ * sockets */
15019+};
15020+
15021+extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
15022+ struct sockaddr_cl *caddr, int addr_len,
15023+ unsigned int flags);
15024+extern int kcl_register_read_callback(struct socket *sock,
15025+ int (*routine) (char *, int, char *, int,
15026+ unsigned int));
15027+extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
15028+extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
15029+extern int kcl_get_members(struct list_head *list);
15030+extern int kcl_get_member_ids(uint32_t * idbuf, int size);
15031+extern int kcl_get_all_members(struct list_head *list);
15032+extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
15033+ struct kcl_cluster_node *n);
15034+extern int kcl_get_node_by_name(unsigned char *name,
15035+ struct kcl_cluster_node *n);
15036+extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
15037+extern int kcl_is_quorate(void);
15038+extern int kcl_addref_cluster(void);
15039+extern int kcl_releaseref_cluster(void);
15040+extern int kcl_cluster_name(char **cname);
15041+extern int kcl_get_current_interface(void);
15042+extern struct list_head *kcl_get_node_addresses(int nodeid);
15043+
15044+extern int kcl_barrier_register(char *name, unsigned int flags,
15045+ unsigned int nodes);
15046+extern int kcl_barrier_setattr(char *name, unsigned int attr,
15047+ unsigned long arg);
15048+extern int kcl_barrier_delete(char *name);
15049+extern int kcl_barrier_wait(char *name);
15050+extern int kcl_barrier_cancel(char *name);
c1c6733f 15051+
bb1d8b11
AM
15052+extern int kcl_register_quorum_device(char *name, int votes);
15053+extern int kcl_unregister_quorum_device(void);
15054+extern int kcl_quorum_device_available(int yesno);
c1c6733f
AM
15055+
15056+#endif
c1c6733f 15057diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
bb1d8b11
AM
15058--- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
15059+++ linux-patched/include/cluster/service.h 2004-11-03 11:37:37.000000000 +0800
c1c6733f
AM
15060@@ -0,0 +1,102 @@
15061+/******************************************************************************
15062+*******************************************************************************
15063+**
15064+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
15065+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
15066+**
15067+** This copyrighted material is made available to anyone wishing to use,
15068+** modify, copy, or redistribute it subject to the terms and conditions
15069+** of the GNU General Public License v.2.
15070+**
15071+*******************************************************************************
15072+******************************************************************************/
15073+
15074+#ifndef __SERVICE_DOT_H__
15075+#define __SERVICE_DOT_H__
15076+
15077+/*
15078+ * Interface between service manager and services
15079+ */
15080+
15081+/*
15082+ * Service levels are started in order from lowest, so level 0 is started on
15083+ * all nodes before level 1 is started.
15084+ */
15085+
15086+#define SERVICE_LEVEL_FENCE (0)
15087+#define SERVICE_LEVEL_GDLM (1)
15088+#define SERVICE_LEVEL_GFS (2)
15089+#define SERVICE_LEVEL_USER (3)
15090+
15091+#define MAX_SERVICE_NAME_LEN (33)
15092+
15093+/*
15094+ * The type of start a service receives. The start (and preceding stop) may be
15095+ * due to a node joining or leaving the SG or due to a node having failed.
15096+ */
15097+
15098+#define SERVICE_NODE_FAILED (1)
15099+#define SERVICE_NODE_JOIN (2)
15100+#define SERVICE_NODE_LEAVE (3)
15101+
15102+
15103+struct kcl_service {
15104+ struct list_head list;
15105+ uint16_t level;
15106+ uint32_t local_id;
15107+ uint32_t global_id;
15108+ int node_count;
15109+ char name[MAX_SERVICE_NAME_LEN];
15110+};
15111+
15112+int kcl_get_services(struct list_head *list, int level);
15113+
15114+
15115+/*
15116+ * These routines which run in CMAN context must return quickly and cannot
15117+ * block.
15118+ */
15119+
15120+struct kcl_service_ops {
15121+ int (*stop) (void *servicedata);
15122+ int (*start) (void *servicedata, uint32_t *nodeids, int count,
15123+ int event_id, int type);
15124+ void (*finish) (void *servicedata, int event_id);
15125+};
15126+
15127+/*
15128+ * Register will cause CMAN to create a Service Group (SG) for the named
15129+ * instance of the service. A local ID is returned which is used to join,
15130+ * leave and unregister the service.
15131+ */
15132+
15133+int kcl_register_service(char *name, int namelen, int level,
15134+ struct kcl_service_ops *ops, int unique,
15135+ void *servicedata, uint32_t *local_id);
15136+
15137+void kcl_unregister_service(uint32_t local_id);
15138+
15139+/*
15140+ * Once a service is joined it will be managed by CMAN and receive start, stop,
15141+ * and finish calls. After leave is called the service is no longer managed by
15142+ * CMAN. The first start for a service may arrive before kcl_join_service()
15143+ * returns.
15144+ */
15145+
15146+int kcl_join_service(uint32_t local_id);
15147+int kcl_leave_service(uint32_t local_id);
15148+
15149+/*
15150+ * After a service is started, it can ask for its cluster-wide unique ID.
15151+ */
15152+
15153+void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
15154+
15155+/*
15156+ * Called by a service when it's done with a start(). Cannot be called from
15157+ * the start function.
15158+ */
15159+
15160+void kcl_start_done(uint32_t local_id, int event_id);
15161+
15162+#endif
This page took 1.966664 seconds and 4 git commands to generate.